diff --git a/README.md b/README.md
index 70d5085..ed27e80 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 # llama.cpp
 
-Enable AI inferencing on z/os
+Enable AI inferencing on z/OS
 
 # Installation and Usage
@@ -27,8 +27,9 @@ See the [zopen porting guide](https://zopen.community/#/Guides/Porting) for more
 
 # Documentation
 
-
 # Troubleshooting
+If an error is encountered in the `ggml-cpu.cpp` file while building (possibly pthread-related), run `zopen upgrade zoslib -y` and try building again.
+
 # Contributing
 
-Contributions are welcome! Please follow the [zopen contribution guidelines](https://github.com/zopencommunity/meta/blob/main/CONTRIBUTING.md).
\ No newline at end of file
+Contributions are welcome! Please follow the [zopen contribution guidelines](https://github.com/zopencommunity/meta/blob/main/CONTRIBUTING.md).
diff --git a/buildenv b/buildenv
index 68766ee..a6f6c1d 100644
--- a/buildenv
+++ b/buildenv
@@ -3,21 +3,23 @@ export ZOPEN_STABLE_DEPS="zoslib make cmake curl"
 export ZOPEN_DEV_URL="https://github.com/ggerganov/llama.cpp.git"
 export ZOPEN_DEV_DEPS="zoslib make cmake curl openssl libssh2 zlib libpsl"
 export ZOPEN_CATEGORIES="ai"
-export ZOPEN_DEV_TAG="master"
+export ZOPEN_DEV_TAG="b6027"
 export ZOPEN_NAME="llamacpp-master"
 export ZOPEN_RUNTIME_DEPS="ncurses"
-# rm -f "llama"
-# ln -s "llama.cpp" "llama"
-# ln -s "llama.cpp" $ZOPEN_NAME
+# export ZOPEN_SKIP_ZOSLIB_ENV_HOOK=1
+
+rm -f "llama"
+ln -s "llama.cpp" "llama"
+ln -s "llama.cpp" $ZOPEN_NAME
 
 export ZOPEN_COMP="CLANG"
 
 # set env variables
 # export CURL_HOME="/data/zopen/usr/local/zopen/curl/curl"
-# export BLAS_HOME="/usr/lpp/cbclib"
+export BLAS_HOME="/usr/lpp/cbclib"
 
 export ZOPEN_CONFIGURE="cmake"
-export ZOPEN_CONFIGURE_OPTS="-B ../build --install-prefix \"\$ZOPEN_INSTALL_DIR/\" -DCURL_LIBRARY=\$CURL_HOME/lib/libcurl.a -DCURL_INCLUDE_DIR=\$CURL_HOME/include -DBUILD_SHARED_LIBS_DEFAULT=OFF -DBUILD_SHARED_LIBS=OFF -DGGML_STATIC=ON -DGGML_BACKEND_DL=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS=\$BLAS_HOME/include/openblas -DBLAS_LIBRARIES=\$BLAS_HOME/lib/libopenblas.so -DLLAMA_BUILD_TESTS=ON ."
+export ZOPEN_CONFIGURE_OPTS="-B ../build --install-prefix \"\$ZOPEN_INSTALL_DIR/\" -DCMAKE_C_FLAGS=\"-fzvector -m64 -march=z15 -lmass.arch13\" -DCMAKE_C_STANDARD=11 -DCMAKE_C_STANDARD_REQUIRED=ON -DCMAKE_C_EXTENSIONS=OFF -DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_SSL=ON -DOPENSSL_ROOT_DIR=\$OPENSSL_HOME -DCURL_LIBRARY=\$CURL_HOME/lib/libcurl.a -DCURL_INCLUDE_DIR=\$CURL_HOME/include -DBUILD_SHARED_LIBS_DEFAULT=OFF -DBUILD_SHARED_LIBS=OFF -DGGML_STATIC=ON -DGGML_BACKEND_DL=OFF -DGGML_OPENBLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS=/usr/lpp/cbclib/include/openblas -DBLAS_LIBRARIES=/usr/lpp/cbclib/lib/libopenblas.so -DLLAMA_BUILD_TESTS=ON -DLLAMA_CURL=ON ."
export ZOPEN_MAKE="cmake" export ZOPEN_MAKE_OPTS="--build ../build --parallel \$ZOPEN_NUM_JOBS --config Release" @@ -39,10 +41,10 @@ zopen_check_results() dir="$1" pfx="$2" chk="$1/$2_check.log" - + if [[ -f "$chk" ]]; then - total=$(grep -cE "Test #[0-9]+" "$chk") - failed=$(grep -cE "Failed|Subprocess aborted" "$chk") + total=$(grep -cE "Test +#" "$chk") + failed=$(grep -cE "\*\*\*Failed|Subprocess aborted\*\*\*" "$chk") skipped=$(grep -c "Skipped" "$chk") passed=$((total - failed - skipped)) else @@ -57,15 +59,32 @@ zopen_check_results() echo "actualPassed:$passed" echo "actualSkipped:$skipped" echo "totalTests:$total" - echo "expectedFailures:0" + echo "expectedFailures:3" echo "expectedTotalTests:$total" } -zopen_append_to_env() +zopen_pre_check() { - # echo envars outside of PATH, MANPATH, LIBPATH +# unset SSL_CERT_FILE +unset SSL_CERT_PATH +export SSL_CERT_PATH=$(curl-config --ca) +export CFLAGS="$CFLAGS -march=z15 -mzvector" +export CPPFLAGS="$CPPFLAGS -march=z15" } +# zopen_append_to_env() +# { +# export SSL_CERT_PATH=$(curl-config --ca) +# # echo envars outside of PATH, MANPATH, LIBPATH +# } + +# zopen_append_to_zoslib_env() +# { +# cat< { onClick={handleChatSubmit} disabled={isTyping} > - ➤ + ➤ diff --git a/patches/CMakeLists.txt.patch b/patches/CMakeLists.txt.patch new file mode 100644 index 0000000..f6b3e81 --- /dev/null +++ b/patches/CMakeLists.txt.patch @@ -0,0 +1,53 @@ +diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt +index 66a5ad8d..f71c7dec 100644 +--- a/ggml/src/ggml-cpu/CMakeLists.txt ++++ b/ggml/src/ggml-cpu/CMakeLists.txt +@@ -51,7 +51,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) + + target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17) + target_include_directories(${GGML_CPU_NAME} PRIVATE . ggml-cpu) +- ++ list(APPEND ARCH_FLAGS -fzvector -m64 -march=z15) + if (APPLE AND GGML_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate) + if (ACCELERATE_FRAMEWORK) +@@ -94,7 +94,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_HBM) + + target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind) +- endif() ++ endif() + + if (GGML_SYSTEM_ARCH STREQUAL "ARM") + message(STATUS "ARM detected") +@@ -463,11 +463,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name) + message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.") + list(APPEND ARCH_FLAGS -march=native -mtune=native) + endif() +- + if (GGML_VXE) + message(STATUS "VX/VXE/VXE2 enabled") + list(APPEND ARCH_FLAGS -mvx -mzvector) +- list(APPEND ARCH_DEFINITIONS GGML_VXE) ++ list(APPEND ARCH_DEFINITIONS GGML_VXE) + endif() + + if (GGML_NNPA) +@@ -480,6 +479,17 @@ function(ggml_add_cpu_backend_variant_impl tag_name) + else() + message(WARNING "Unknown CPU architecture. 
Falling back to generic implementations.") + list(APPEND ARCH_FLAGS -DGGML_CPU_GENERIC) ++ list(APPEND ARCH_DEFINITIONS GGML_VXE) ++ message(STATUS "Added GGML_VXE Flag") ++ if(NOT DEFINED TARGET_ARCH) ++ set(TARGET_ARCH 13) ++ endif() ++ if(TARGET_ARCH GREATER 13) ++ target_compile_options(${GGML_CPU_NAME} PRIVATE "-qarch=${TARGET_ARCH}") ++ endif() ++ target_include_directories(${GGML_CPU_NAME} PRIVATE /usr/include) ++ target_link_libraries(${GGML_CPU_NAME} PRIVATE "/usr/lpp/cbclib/lib/libmass.arch${TARGET_ARCH}.a") ++ message(STATUS "Found MASS: /usr/lpp/cbclib/lib/libmass.arch${TARGET_ARCH}.a") + endif() + + if (GGML_CPU_REPACK) diff --git a/patches/arg.cpp.patch b/patches/arg.cpp.patch index 8878008..e586009 100644 --- a/patches/arg.cpp.patch +++ b/patches/arg.cpp.patch @@ -1,5 +1,5 @@ diff --git a/common/arg.cpp b/common/arg.cpp -index 40af7e57..46fec792 100644 +index 06005359..28220aac 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -36,6 +36,9 @@ @@ -10,7 +10,7 @@ index 40af7e57..46fec792 100644 +# include +# endif #endif - + using json = nlohmann::ordered_json; @@ -195,6 +198,8 @@ bool common_has_curl() { # endif @@ -30,7 +30,17 @@ index 40af7e57..46fec792 100644 http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp"); // Check if hf-token or bearer-token was specified if (!bearer_token.empty()) { -@@ -569,6 +574,7 @@ std::pair> common_remote_get_content(const std::string & +@@ -506,7 +511,8 @@ static bool common_download_model( + struct gguf_init_params gguf_params = { + /*.no_alloc = */ true, + /*.ctx = */ NULL, +- }; ++ /* .allow_byteswapping = */ true, ++ }; + auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params); + if (!ctx_gguf) { + LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.path.c_str()); +@@ -569,6 +575,7 @@ std::pair> common_remote_get_content(const std::string & curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L); diff --git a/patches/clip.cpp.patch b/patches/clip.cpp.patch new file mode 100644 index 0000000..fa3ccf9 --- /dev/null +++ b/patches/clip.cpp.patch @@ -0,0 +1,12 @@ +diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp +index a4b62f9a..2d89d4b1 100644 +--- a/tools/mtmd/clip.cpp ++++ b/tools/mtmd/clip.cpp +@@ -2028,6 +2028,7 @@ struct clip_model_loader { + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &meta, ++ /*.allow_byteswapping = */ true, + }; + + ctx_gguf = gguf_context_ptr(gguf_init_from_file(fname, params)); diff --git a/patches/common.cpp.patch b/patches/common.cpp.patch index 872bd9f..ee69674 100644 --- a/patches/common.cpp.patch +++ b/patches/common.cpp.patch @@ -1,8 +1,8 @@ diff --git a/common/common.cpp b/common/common.cpp -index 4cc40ed..234ad95 100644 +index d8c4d988..00aa7d43 100644 --- a/common/common.cpp +++ b/common/common.cpp -@@ -851,7 +851,7 @@ std::string fs_get_cache_directory() { +@@ -874,7 +874,7 @@ std::string fs_get_cache_directory() { if (getenv("LLAMA_CACHE")) { cache_directory = std::getenv("LLAMA_CACHE"); } else { @@ -11,3 +11,11 @@ index 4cc40ed..234ad95 100644 if (std::getenv("XDG_CACHE_HOME")) { cache_directory = std::getenv("XDG_CACHE_HOME"); } else { +@@ -1436,6 +1436,7 @@ static common_control_vector_data common_control_vector_load_one(const common_co + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ false, + /* .ctx = */ &ctx, ++ /* .allow_byteswapping = */ true, + }; + struct gguf_context * 
ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params); + if (!ctx_gguf) { diff --git a/patches/convert-llama2c-to-ggml.cpp.patch b/patches/convert-llama2c-to-ggml.cpp.patch new file mode 100644 index 0000000..a80a89c --- /dev/null +++ b/patches/convert-llama2c-to-ggml.cpp.patch @@ -0,0 +1,12 @@ +diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +index bdf0eed2..b36170bc 100644 +--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ++++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +@@ -535,6 +535,7 @@ static void load_vocab(const char * filename, const Config * config, struct my_l + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ &ctx_data, ++ /*.allow_byteswapping = */ true, + }; + + struct gguf_context * ctx = gguf_init_from_file(filename, params); diff --git a/patches/examples_gguf.cpp.patch b/patches/examples_gguf.cpp.patch new file mode 100644 index 0000000..49e6b43 --- /dev/null +++ b/patches/examples_gguf.cpp.patch @@ -0,0 +1,12 @@ +diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp +index f31989c8..94ac3e5a 100644 +--- a/examples/gguf/gguf.cpp ++++ b/examples/gguf/gguf.cpp +@@ -87,6 +87,7 @@ static bool gguf_ex_read_0(const std::string & fname) { + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ NULL, ++ /*.allow_byteswapping = */ true, + }; + + struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); diff --git a/patches/export-lora.cpp.patch b/patches/export-lora.cpp.patch new file mode 100644 index 0000000..44efe88 --- /dev/null +++ b/patches/export-lora.cpp.patch @@ -0,0 +1,12 @@ +diff --git a/tools/export-lora/export-lora.cpp b/tools/export-lora/export-lora.cpp +index f038019b..bf750ab9 100644 +--- a/tools/export-lora/export-lora.cpp ++++ b/tools/export-lora/export-lora.cpp +@@ -50,6 +50,7 @@ static struct gguf_context * load_gguf(std::string & fname, struct ggml_context + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ ctx_ggml, ++ /*.allow_byteswapping = */ true, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params); + if (!ctx_gguf) { diff --git a/patches/ggml-backend-reg.cpp.patch b/patches/ggml-backend-reg.cpp.patch index 964827c..466e10e 100644 --- a/patches/ggml-backend-reg.cpp.patch +++ b/patches/ggml-backend-reg.cpp.patch @@ -1,15 +1,15 @@ diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp -index 405d8e3..b3682a9 100644 +index f0cdac31..29247c6a 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp -@@ -556,7 +556,9 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, +@@ -561,7 +561,9 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, } - + void ggml_backend_load_all() { - ggml_backend_load_all_from_path(nullptr); +#ifdef GGML_BACKEND_DL + ggml_backend_load_all_from_path(nullptr); +#endif } - + void ggml_backend_load_all_from_path(const char * dir_path) { diff --git a/patches/ggml-cpu-impl.h.patch b/patches/ggml-cpu-impl.h.patch new file mode 100644 index 0000000..eb0c5f2 --- /dev/null +++ b/patches/ggml-cpu-impl.h.patch @@ -0,0 +1,31 @@ +diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h +index d839cf5c..3f8532f7 100644 +--- a/ggml/src/ggml-cpu/ggml-cpu-impl.h ++++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h +@@ -68,6 +68,15 @@ struct ggml_compute_params { + #endif 
// __VXE2__ + #endif // __s390x__ && __VEC__ + ++#if defined(__MVS__) && defined(__VEC__) ++#ifndef __VXE__ ++#define __VXE__ ++#endif // __VXE__ ++#ifndef __VXE2__ ++#define __VXE2__ ++#endif // __VXE2__ ++#endif // __MVS__ && __VEC__ ++ + #if defined(__s390x__) && defined(GGML_NNPA) + #ifndef __NNPA__ + #define __NNPA__ +@@ -352,8 +361,9 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) + #endif + + #if defined(__VXE__) || defined(__VXE2__) ++#ifndef __VEC__ + #include +- ++#endif + #define vec_neg(a) (-(a)) // Vector Negate + #define vec_add(a, b) ((a) + (b)) // Vector Add + #define vec_sub(a, b) ((a) - (b)) // Vector Subtract diff --git a/patches/ggml-cpu.c.patch b/patches/ggml-cpu.c.patch index 0592551..b545a79 100644 --- a/patches/ggml-cpu.c.patch +++ b/patches/ggml-cpu.c.patch @@ -1,12 +1,13 @@ diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c -index c7426df..e074799 100644 +index c5271b77..6a547e4b 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -17,7 +17,7 @@ - + #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW -#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) +#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) && !defined(__MVS__) #include #endif + diff --git a/patches/ggml-cpu.cpp.patch b/patches/ggml-cpu.cpp.patch index 81540ae..e7aa9e1 100644 --- a/patches/ggml-cpu.cpp.patch +++ b/patches/ggml-cpu.cpp.patch @@ -1,14 +1,14 @@ diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp -index e013e8b..6290b22 100644 +index c9daa4c3..63839d07 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -33,6 +33,38 @@ # include #endif - -+#define CVT_PTR 0x10 -+#define CVTRCEP_OFFSET 0x490 -+#define RCEAFC_OFFSET 0x088 + ++#define CVT_PTR 0x10 ++#define CVTRCEP_OFFSET 0x490 ++#define RCEAFC_OFFSET 0x088 + +typedef unsigned data_area_ptr_assign_type; + @@ -17,7 +17,7 @@ index e013e8b..6290b22 100644 +#if defined(_LP64) + data_area_ptr_assign_type lower; +#endif -+ data_area_ptr_assign_type assign; ++ data_area_ptr_assign_type assign; + }; + char* deref; +} data_area_ptr; @@ -39,15 +39,15 @@ index e013e8b..6290b22 100644 +} + // ggml-backend interface - + std::vector& ggml_backend_cpu_get_extra_buffers_type() { @@ -43,7 +75,7 @@ std::vector& ggml_backend_cpu_get_extra_buffers_type if (ggml_backend_amx_buffer_type()) { bufts.push_back(ggml_backend_amx_buffer_type()); } -#endif -+#endif - ++#endif + #ifdef GGML_USE_CPU_KLEIDIAI if (ggml_backend_cpu_kleidiai_buffer_type()) { @@ -340,6 +372,9 @@ static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * @@ -59,4 +59,4 @@ index e013e8b..6290b22 100644 + *free = get_free_memory(); #else long pages = sysconf(_SC_PHYS_PAGES); - long page_size = sysconf(_SC_PAGE_SIZE); \ No newline at end of file + long page_size = sysconf(_SC_PAGE_SIZE); diff --git a/patches/ggml-impl.h.patch b/patches/ggml-impl.h.patch new file mode 100644 index 0000000..4de578e --- /dev/null +++ b/patches/ggml-impl.h.patch @@ -0,0 +1,74 @@ +diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h +index a2e30994..d7500273 100644 +--- a/ggml/src/ggml-impl.h ++++ b/ggml/src/ggml-impl.h +@@ -28,6 +28,18 @@ + #include + #endif + ++#if defined(__gnu_linux__) ++#include ++#elif defined(__MVS__) ++#define bswap_16(x) __builtin_bswap16(x) ++#define bswap_32(x) __builtin_bswap32(x) ++#define bswap_64(x) __builtin_bswap64(x) ++#else ++#define 
bswap_16(x) (x) ++#define bswap_32(x) (x) ++#define bswap_64(x) (x) ++#endif // defined(__gnu_linux__) ++ + #ifdef __cplusplus + extern "C" { + #endif +@@ -484,6 +496,18 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { + #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x) + #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x) + ++static inline void ggml_bswap16(void * value) { ++ *((uint16_t*)value) = bswap_16(*((uint16_t*)value)); ++} ++ ++static inline void ggml_bswap32(void * value) { ++ *((uint32_t*)value) = bswap_32(*((uint32_t*)value)); ++} ++ ++static inline void ggml_bswap64(void * value) { ++ *((uint64_t*)value) = bswap_64(*((uint64_t*)value)); ++} ++ + // return true if the node's results are only used by N other nodes + // and can be fused into their calculations. + static inline bool ggml_node_has_n_uses(const struct ggml_cgraph * cgraph, int node_idx, int32_t n_uses) { +@@ -548,6 +572,31 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx + #ifdef __cplusplus + #include + #include ++#include ++ ++template = 0> ++static inline void ggml_bswap(T * value) ++{ ++ GGML_UNUSED(value); ++} ++ ++template = 0> ++static inline void ggml_bswap(T * value) ++{ ++ ggml_bswap16(value); ++} ++ ++template = 0> ++static inline void ggml_bswap(T * value) ++{ ++ ggml_bswap32(value); ++} ++ ++template = 0> ++static inline void ggml_bswap(T * value) ++{ ++ ggml_bswap64(value); ++} + + // nicer C++ syntax for ggml_can_fuse + inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list ops) { diff --git a/patches/ggml.c.patch b/patches/ggml.c.patch index cc37a78..40fa28a 100644 --- a/patches/ggml.c.patch +++ b/patches/ggml.c.patch @@ -1,11 +1,11 @@ diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c -index 97da26b3..212faf9c 100644 +index 124cf3e8..14487858 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -14,9 +14,13 @@ #include #endif - + +#if defined(__MVS__) +#include +#endif @@ -16,19 +16,19 @@ index 97da26b3..212faf9c 100644 +#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) && !defined(__MVS__) #include #endif - + @@ -299,6 +303,15 @@ void * ggml_aligned_malloc(size_t size) { - + #if defined(_MSC_VER) || defined(__MINGW32__) return _aligned_malloc(size, alignment); +#elif defined(__MVS__) +// void * aligned_memory = NULL; +// if (size ==0) -+// size = 1; ++// size = 1; +// aligned_memory = malloc(size); +// int result = 0; +// if (aligned_memory == NULL) -+// result = errno; ++// result = errno; + return __aligned_malloc(size, alignment); #else if (size == 0) { @@ -41,4 +41,505 @@ index 97da26b3..212faf9c 100644 + __aligned_free(ptr); #elif GGML_USE_CPU_HBM if (ptr != NULL) { - hbw_free(ptr); \ No newline at end of file + hbw_free(ptr); +@@ -586,12 +601,43 @@ static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const fl + static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc); + static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc); + ++static void ggml_byteswap_i16 (void * restrict buffer, size_t elements); ++static void ggml_byteswap_i32 (void * restrict buffer, size_t elements); ++static void ggml_byteswap_i64 (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q4_0 (void * restrict buffer, size_t elements); 
++static void ggml_byteswap_q4_1 (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q5_0 (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q5_1 (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q8_0 (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q8_1 (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q2_k (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q3_k (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q4_k (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q5_k (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q6_k (void * restrict buffer, size_t elements); ++static void ggml_byteswap_iq2_xxs (void * restrict buffer, size_t elements); ++static void ggml_byteswap_iq2_xs (void * restrict buffer, size_t elements); ++static void ggml_byteswap_iq3_xxs (void * restrict buffer, size_t elements); ++static void ggml_byteswap_iq3_s (void * restrict buffer, size_t elements); ++static void ggml_byteswap_iq2_s (void * restrict buffer, size_t elements); ++static void ggml_byteswap_iq1_s (void * restrict buffer, size_t elements); ++static void ggml_byteswap_iq4_nl (void * restrict buffer, size_t elements); ++static void ggml_byteswap_iq4_xs (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q8_k (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q4_0_4x4(void * restrict buffer, size_t elements); ++static void ggml_byteswap_q4_0_4x8(void * restrict buffer, size_t elements); ++static void ggml_byteswap_q4_0_8x8(void * restrict buffer, size_t elements); ++static void ggml_byteswap_tq1_0 (void * restrict buffer, size_t elements); ++static void ggml_byteswap_tq2_0 (void * restrict buffer, size_t elements); ++ ++//byteswap functions enabled for all except i16 and iq1_m + static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + [GGML_TYPE_I8] = { + .type_name = "i8", + .blck_size = 1, + .type_size = sizeof(int8_t), + .is_quantized = false, ++ .byteswap = ggml_byteswap_i16, + }, + [GGML_TYPE_I16] = { + .type_name = "i16", +@@ -604,24 +650,28 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .blck_size = 1, + .type_size = sizeof(int32_t), + .is_quantized = false, ++ .byteswap = ggml_byteswap_i32, + }, + [GGML_TYPE_I64] = { + .type_name = "i64", + .blck_size = 1, + .type_size = sizeof(int64_t), + .is_quantized = false, ++ .byteswap = ggml_byteswap_i64, + }, + [GGML_TYPE_F64] = { + .type_name = "f64", + .blck_size = 1, + .type_size = sizeof(double), + .is_quantized = false, ++ .byteswap = ggml_byteswap_i64, + }, + [GGML_TYPE_F32] = { + .type_name = "f32", + .blck_size = 1, + .type_size = sizeof(float), + .is_quantized = false, ++ .byteswap = ggml_byteswap_i32, + }, + [GGML_TYPE_F16] = { + .type_name = "f16", +@@ -630,6 +680,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = false, + .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row, + .from_float_ref = (ggml_from_float_t) ggml_fp32_to_fp16_row, ++ .byteswap = ggml_byteswap_i16, + }, + [GGML_TYPE_Q4_0] = { + .type_name = "q4_0", +@@ -638,6 +689,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q4_0, + .from_float_ref = (ggml_from_float_t) quantize_row_q4_0_ref, ++ .byteswap = ggml_byteswap_q4_0, + }, + [GGML_TYPE_Q4_1] = { + .type_name = "q4_1", +@@ -646,6 
+698,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q4_1, + .from_float_ref = (ggml_from_float_t) quantize_row_q4_1_ref, ++ .byteswap = ggml_byteswap_q4_1, + }, + [4] = { // GGML_TYPE_Q4_2 + .type_name = "DEPRECATED", +@@ -666,6 +719,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q5_0, + .from_float_ref = (ggml_from_float_t) quantize_row_q5_0_ref, ++ .byteswap = ggml_byteswap_q5_0, + }, + [GGML_TYPE_Q5_1] = { + .type_name = "q5_1", +@@ -674,6 +728,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q5_1, + .from_float_ref = (ggml_from_float_t) quantize_row_q5_1_ref, ++ .byteswap = ggml_byteswap_q5_1, + }, + [GGML_TYPE_Q8_0] = { + .type_name = "q8_0", +@@ -682,6 +737,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q8_0, + .from_float_ref = (ggml_from_float_t) quantize_row_q8_0_ref, ++ .byteswap = ggml_byteswap_q8_0, + }, + [GGML_TYPE_Q8_1] = { + .type_name = "q8_1", +@@ -689,6 +745,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .type_size = sizeof(block_q8_1), + .is_quantized = true, + .from_float_ref = (ggml_from_float_t) quantize_row_q8_1_ref, ++ .byteswap = ggml_byteswap_q8_1, + }, + [GGML_TYPE_Q2_K] = { + .type_name = "q2_K", +@@ -697,6 +754,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q2_K, + .from_float_ref = (ggml_from_float_t) quantize_row_q2_K_ref, ++ .byteswap = ggml_byteswap_q2_k, + }, + [GGML_TYPE_Q3_K] = { + .type_name = "q3_K", +@@ -705,6 +763,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q3_K, + .from_float_ref = (ggml_from_float_t) quantize_row_q3_K_ref, ++ .byteswap = ggml_byteswap_q3_k, + }, + [GGML_TYPE_Q4_K] = { + .type_name = "q4_K", +@@ -713,6 +772,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q4_K, + .from_float_ref = (ggml_from_float_t) quantize_row_q4_K_ref, ++ .byteswap = ggml_byteswap_q4_k, + }, + [GGML_TYPE_Q5_K] = { + .type_name = "q5_K", +@@ -721,6 +781,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q5_K, + .from_float_ref = (ggml_from_float_t) quantize_row_q5_K_ref, ++ .byteswap = ggml_byteswap_q5_k, + }, + [GGML_TYPE_Q6_K] = { + .type_name = "q6_K", +@@ -729,6 +790,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q6_K, + .from_float_ref = (ggml_from_float_t) quantize_row_q6_K_ref, ++ .byteswap = ggml_byteswap_q6_k, + }, + [GGML_TYPE_IQ2_XXS] = { + .type_name = "iq2_xxs", +@@ -737,6 +799,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq2_xxs, + .from_float_ref = NULL, ++ .byteswap = ggml_byteswap_iq2_xxs, + }, + [GGML_TYPE_IQ2_XS] = { + .type_name = "iq2_xs", +@@ -745,6 +808,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized 
= true, + .to_float = (ggml_to_float_t) dequantize_row_iq2_xs, + .from_float_ref = NULL, ++ .byteswap = ggml_byteswap_iq2_xs, + }, + [GGML_TYPE_IQ3_XXS] = { + .type_name = "iq3_xxs", +@@ -753,6 +817,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq3_xxs, + .from_float_ref = (ggml_from_float_t)quantize_row_iq3_xxs_ref, ++ .byteswap = ggml_byteswap_iq3_xxs, + }, + [GGML_TYPE_IQ3_S] = { + .type_name = "iq3_s", +@@ -761,6 +826,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq3_s, + .from_float_ref = (ggml_from_float_t)quantize_row_iq3_s_ref, ++ .byteswap = ggml_byteswap_iq3_s, + }, + [GGML_TYPE_IQ2_S] = { + .type_name = "iq2_s", +@@ -769,6 +835,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq2_s, + .from_float_ref = (ggml_from_float_t)quantize_row_iq2_s_ref, ++ .byteswap = ggml_byteswap_iq2_s, + }, + [GGML_TYPE_IQ1_S] = { + .type_name = "iq1_s", +@@ -777,6 +844,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq1_s, + .from_float_ref = NULL, ++ .byteswap = ggml_byteswap_iq1_s, + }, + [GGML_TYPE_IQ1_M] = { + .type_name = "iq1_m", +@@ -793,6 +861,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq4_nl, + .from_float_ref = (ggml_from_float_t)quantize_row_iq4_nl_ref, ++ .byteswap = ggml_byteswap_iq4_nl, + }, + [GGML_TYPE_IQ4_XS] = { + .type_name = "iq4_xs", +@@ -801,12 +870,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq4_xs, + .from_float_ref = (ggml_from_float_t)quantize_row_iq4_xs_ref, ++ .byteswap = ggml_byteswap_iq4_xs, + }, + [GGML_TYPE_Q8_K] = { + .type_name = "q8_K", + .blck_size = QK_K, + .type_size = sizeof(block_q8_K), + .is_quantized = true, ++ .byteswap = ggml_byteswap_q8_k, + }, + [GGML_TYPE_BF16] = { + .type_name = "bf16", +@@ -815,24 +886,28 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = false, + .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row, + .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref, ++ .byteswap = ggml_byteswap_i16, + }, + [31] = { // GGML_TYPE_Q4_0_4_4 + .type_name = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, ++ .byteswap = ggml_byteswap_q4_0_4x4, + }, + [32] = { // GGML_TYPE_Q4_0_4_8 + .type_name = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, ++ .byteswap = ggml_byteswap_q4_0_4x8, + }, + [33] = { // GGML_TYPE_Q4_0_8_8 + .type_name = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, ++ .byteswap = ggml_byteswap_q4_0_8x8, + }, + [GGML_TYPE_TQ1_0] = { + .type_name = "tq1_0", +@@ -841,6 +916,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_tq1_0, + .from_float_ref = (ggml_from_float_t) quantize_row_tq1_0_ref, ++ .byteswap = ggml_byteswap_tq1_0, + }, + [GGML_TYPE_TQ2_0] = { + .type_name = "tq2_0", +@@ -849,6 +925,7 @@ 
static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_tq2_0, + .from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref, ++ .byteswap = ggml_byteswap_tq2_0, + }, + [36] = { // GGML_TYPE_IQ4_NL_4_4 + .type_name = "TYPE_IQ4_NL_4_4 REMOVED, use IQ4_NL with runtime repacking", +@@ -6944,3 +7021,215 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons + if (p0->strict_cpu != p1->strict_cpu ) return false; + return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; + } ++ ++static void ggml_byteswap_i16(void * restrict buffer, size_t elements) { ++ uint16_t *data_ptr = (uint16_t*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(data_ptr + i); ++ } ++} ++ ++static void ggml_byteswap_i32(void * restrict buffer, size_t elements) { ++ uint32_t *data_ptr = (uint32_t*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap32(data_ptr + i); ++ } ++} ++ ++static void ggml_byteswap_i64(void * restrict buffer, size_t elements) { ++ uint64_t *data_ptr = (uint64_t*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap64(data_ptr + i); ++ } ++} ++ ++static void ggml_byteswap_q4_0(void * restrict buffer, size_t elements) { ++ block_q4_0 *data_ptr = (block_q4_0*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ } ++} ++ ++static void ggml_byteswap_q4_1(void * restrict buffer, size_t elements) { ++ block_q4_1 *data_ptr = (block_q4_1*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ ggml_bswap16(&(data_ptr[i].m)); ++ } ++} ++ ++static void ggml_byteswap_q5_0(void * restrict buffer, size_t elements) { ++ block_q5_0 *data_ptr = (block_q5_0*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ } ++} ++ ++static void ggml_byteswap_q5_1(void * restrict buffer, size_t elements) { ++ block_q5_1 *data_ptr = (block_q5_1*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ ggml_bswap16(&(data_ptr[i].m)); ++ } ++} ++ ++static void ggml_byteswap_q8_0(void * restrict buffer, size_t elements) { ++ block_q8_0 *data_ptr = (block_q8_0*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ } ++} ++ ++static void ggml_byteswap_q8_1(void * restrict buffer, size_t elements) { ++ block_q8_1 *data_ptr = (block_q8_1*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ ggml_bswap16(&(data_ptr[i].s)); ++ } ++} ++ ++static void ggml_byteswap_q2_k(void * restrict buffer, size_t elements) { ++ block_q2_K *data_ptr = (block_q2_K*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ ggml_bswap16(&(data_ptr[i].dmin)); ++ } ++} ++ ++static void ggml_byteswap_q3_k(void * restrict buffer, size_t elements) { ++ block_q3_K *data_ptr = (block_q3_K*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ } ++} ++ ++static void ggml_byteswap_q4_k(void * restrict buffer, size_t elements) { ++ block_q4_K *data_ptr = (block_q4_K*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ ggml_bswap16(&(data_ptr[i].dmin)); ++ } ++} ++ ++static void ggml_byteswap_q5_k(void * restrict buffer, size_t elements) { ++ block_q5_K *data_ptr = (block_q5_K*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ 
ggml_bswap16(&(data_ptr[i].dmin)); ++ } ++} ++ ++static void ggml_byteswap_q6_k(void * restrict buffer, size_t elements) { ++ block_q6_K *data_ptr = (block_q6_K*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ } ++} ++ ++static void ggml_byteswap_iq2_xxs(void * restrict buffer, size_t elements) { ++ block_iq2_xxs *data_ptr = (block_iq2_xxs*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ for (size_t j = 0; j < QK_K/8; ++j) { ++ ggml_bswap16(&(data_ptr[i].qs[j])); ++ } ++ } ++} ++ ++static void ggml_byteswap_iq2_xs(void * restrict buffer, size_t elements) { ++ block_iq2_xs *data_ptr = (block_iq2_xs*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ for (size_t j = 0; j < QK_K/8; ++j) { ++ ggml_bswap16(&(data_ptr[i].qs[j])); ++ } ++ } ++} ++ ++static void ggml_byteswap_iq3_xxs(void * restrict buffer, size_t elements) { ++ block_iq3_xxs *data_ptr = (block_iq3_xxs*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ } ++} ++ ++static void ggml_byteswap_iq3_s(void * restrict buffer, size_t elements) { ++ block_iq3_s *data_ptr = (block_iq3_s*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ } ++} ++ ++static void ggml_byteswap_iq2_s(void * restrict buffer, size_t elements) { ++ block_iq2_s *data_ptr = (block_iq2_s*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ } ++} ++ ++static void ggml_byteswap_iq1_s(void * restrict buffer, size_t elements) { ++ block_iq1_s *data_ptr = (block_iq1_s*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ for (size_t j = 0; j < QK_K/32; ++j) { ++ ggml_bswap16(&(data_ptr[i].qh[j])); ++ } ++ } ++} ++ ++static void ggml_byteswap_iq4_nl(void * restrict buffer, size_t elements) { ++ block_iq4_nl *data_ptr = (block_iq4_nl*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ } ++} ++ ++static void ggml_byteswap_iq4_xs(void * restrict buffer, size_t elements) { ++ block_iq4_xs *data_ptr = (block_iq4_xs*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ ggml_bswap16(&(data_ptr[i].scales_h)); ++ } ++} ++ ++static void ggml_byteswap_q8_k(void * restrict buffer, size_t elements) { ++ block_q8_K *data_ptr = (block_q8_K*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap32(&(data_ptr[i].d)); ++ for (size_t j = 0; j < QK_K/16; ++j) { ++ ggml_bswap16(&(data_ptr[i].bsums[j])); ++ } ++ } ++} ++ ++static void ggml_byteswap_q4_0_4x4(void * restrict buffer, size_t elements) { ++ GGML_ASSERT(false && "function ggml_byteswap_q4_0_4x4 is not implemented yet"); ++ UNUSED(buffer); ++ UNUSED(elements); ++} ++ ++static void ggml_byteswap_q4_0_4x8(void * restrict buffer, size_t elements) { ++ GGML_ASSERT(false && "function ggml_byteswap_q4_0_4x8 is not implemented yet"); ++ UNUSED(buffer); ++ UNUSED(elements); ++} ++ ++static void ggml_byteswap_q4_0_8x8(void * restrict buffer, size_t elements) { ++ GGML_ASSERT(false && "function ggml_byteswap_q4_0_8x8 is not implemented yet"); ++ UNUSED(buffer); ++ UNUSED(elements); ++} ++ ++static void ggml_byteswap_tq1_0(void * restrict buffer, size_t elements) { ++ block_tq1_0 *data_ptr = (block_tq1_0*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ } ++} ++ ++static void ggml_byteswap_tq2_0(void * restrict buffer, size_t elements) { ++ 
block_tq2_0 *data_ptr = (block_tq2_0*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ } ++} diff --git a/patches/ggml.h.patch b/patches/ggml.h.patch new file mode 100644 index 0000000..e4c985f --- /dev/null +++ b/patches/ggml.h.patch @@ -0,0 +1,20 @@ +diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h +index 8a8775be..cfada317 100644 +--- a/ggml/include/ggml.h ++++ b/ggml/include/ggml.h +@@ -2355,6 +2355,7 @@ extern "C" { + #endif + typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); + typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); ++ typedef void (*ggml_byteswap_t) (void * GGML_RESTRICT buffer, size_t elements); + + struct ggml_type_traits { + const char * type_name; +@@ -2364,6 +2365,7 @@ extern "C" { + bool is_quantized; + ggml_to_float_t to_float; + ggml_from_float_t from_float_ref; ++ ggml_byteswap_t byteswap; + }; + + GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type); diff --git a/patches/gguf-hash.cpp.patch b/patches/gguf-hash.cpp.patch new file mode 100644 index 0000000..a8e9629 --- /dev/null +++ b/patches/gguf-hash.cpp.patch @@ -0,0 +1,12 @@ +diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp +index 9523ec12..952d03f8 100644 +--- a/examples/gguf-hash/gguf-hash.cpp ++++ b/examples/gguf-hash/gguf-hash.cpp +@@ -290,6 +290,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) { + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ &ctx_data, ++ /*.allow_byteswapping = */ true, + }; + + // xxh64 init diff --git a/patches/gguf-split.cpp.patch b/patches/gguf-split.cpp.patch new file mode 100644 index 0000000..7622e14 --- /dev/null +++ b/patches/gguf-split.cpp.patch @@ -0,0 +1,80 @@ +diff --git a/tools/gguf-split/gguf-split.cpp b/tools/gguf-split/gguf-split.cpp +index 30e77156..0472e387 100644 +--- a/tools/gguf-split/gguf-split.cpp ++++ b/tools/gguf-split/gguf-split.cpp +@@ -328,14 +328,20 @@ struct split_strategy { + const char * t_name = gguf_get_tensor_name(ctx_out, i); + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); + auto n_bytes = ggml_nbytes(t); +- read_buf.resize(n_bytes); ++ auto n_elements = ggml_nelements(t) / ggml_blck_size(t->type); ++ read_buf.resize(n_bytes); + + // calculate offset + auto i_tensor_in = gguf_find_tensor(ctx_gguf, t_name); // idx of tensor in the input file + auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in); + ++ ggml_byteswap_t byteswap_func = nullptr; ++ if (gguf_needs_byteswap(ctx_gguf)) { ++ byteswap_func = ggml_get_type_traits(t->type)->byteswap; ++ } ++ + // copy tensor from input to output file +- copy_file_to_file(f_input, fout, offset, n_bytes); ++ copy_file_to_file(f_input, fout, offset, n_bytes, n_elements, byteswap_func); + zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes); + } + +@@ -346,13 +352,18 @@ struct split_strategy { + } + } + +- void copy_file_to_file(std::ifstream & f_in, std::ofstream & f_out, const size_t in_offset, const size_t len) { ++ void copy_file_to_file(std::ifstream & f_in, std::ofstream & f_out, const size_t in_offset, const size_t len, const size_t elements, ggml_byteswap_t byteswap_func) { + // TODO: detect OS and use copy_file_range() here for better performance + if (read_buf.size() < len) { + read_buf.resize(len); + } + f_in.seekg(in_offset); + f_in.read((char *)read_buf.data(), len); ++ ++ if 
(byteswap_func != nullptr) { ++ byteswap_func(read_buf.data(), elements); ++ } ++ + f_out.write((const char *)read_buf.data(), len); + } + }; +@@ -363,6 +374,7 @@ static void gguf_split(const split_params & split_params) { + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx_meta, ++ /*.allow_byteswapping = */ true, + }; + + std::ifstream f_input(split_params.input.c_str(), std::ios::binary); +@@ -426,6 +438,7 @@ static void gguf_merge(const split_params & split_params) { + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx_meta, ++ /*.allow_byteswapping = */ true, + }; + + if (i_split > 0) { +@@ -539,7 +552,15 @@ static void gguf_merge(const split_params & split_params) { + auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor); + f_input.seekg(offset); + f_input.read((char *)read_data.data(), n_bytes); +- if (!split_params.dry_run) { ++ ++ if (gguf_needs_byteswap(ctx_gguf)) { ++ auto byteswap = ggml_get_type_traits(t->type)->byteswap; ++ if (byteswap != nullptr) { ++ byteswap(read_data.data(), ggml_nelements(t) / ggml_blck_size(t->type)); ++ } ++ } ++ ++ if (!split_params.dry_run) { + // write tensor data + padding + fout.write((const char *)read_data.data(), n_bytes); + zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes); diff --git a/patches/gguf.cpp.patch b/patches/gguf.cpp.patch new file mode 100644 index 0000000..299677c --- /dev/null +++ b/patches/gguf.cpp.patch @@ -0,0 +1,86 @@ +diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp +index 53504399..e71bd790 100644 +--- a/ggml/src/gguf.cpp ++++ b/ggml/src/gguf.cpp +@@ -214,16 +214,22 @@ struct gguf_context { + size_t size = 0; // size of `data` in bytes + + void * data = nullptr; ++ bool needs_byteswap = false; //only for reading, writing in non-native endianness is not supported + }; + + struct gguf_reader { + FILE * file; ++ bool do_byteswap = false; + + gguf_reader(FILE * file) : file(file) {} + + template + bool read(T & dst) const { +- return fread(&dst, 1, sizeof(dst), file) == sizeof(dst); ++ auto res = fread(&dst, 1, sizeof(dst), file); ++ if (do_byteswap) { ++ ggml_bswap(&dst); ++ } ++ return res == sizeof(dst); + } + + template +@@ -278,7 +284,13 @@ struct gguf_reader { + return false; + } + dst.resize(size); +- return fread(dst.data(), 1, dst.length(), file) == dst.length(); ++ // return fread(dst.data(), 1, dst.length(), file) == dst.length(); ++ std::vector temp(size); ++ if (fread(temp.data(), 1, size, file) != size) { ++ return false; ++ } ++ dst.assign(temp.begin(), temp.end()); ++ return true; + } + + bool read(void * dst, const size_t size) const { +@@ -317,7 +329,7 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vectorversion)) { ++ uint32_t original_version = ctx->version; ++ ++ if ((params.allow_byteswapping) && ((ctx->version & 0x0000FFFF) == 0) && ((ctx->version & 0xFFFF0000) != 0)) { ++ // most likely different endianness, do byteswapping ++ gr.do_byteswap = true; ++ ctx->needs_byteswap = true; ++ ggml_bswap(&(ctx->version)); ++ GGML_LOG_INFO("%s: Entered byteswapping, corrected version from %" PRIu32 "to new GGUF file version %" PRIu32 "\n", __func__, original_version, ctx->version); ++ } ++ + if (ok && ctx->version == 0) { + GGML_LOG_ERROR("%s: bad GGUF version: %" PRIu32 "\n", __func__, ctx->version); + ok = false; +@@ -363,7 +385,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par + * the last 4 hexadecimal digits to check if the model is the same + * 
endianness as the host system. + */ +- if (ok && (ctx->version & 0x0000FFFF) == 0x00000000) { ++ if (ok && !ctx->needs_byteswap && (ctx->version & 0x0000FFFF) == 0x00000000) { + GGML_LOG_ERROR("%s: failed to load model: this GGUF file version %" PRIu32 " is extremely large, is there a mismatch between the host and model endianness?\n", __func__, ctx->version); + ok = false; + } +@@ -1356,3 +1378,7 @@ void gguf_get_meta_data(const struct gguf_context * ctx, void * data) { + gguf_write_to_buf(ctx, buf, /*only_meta =*/ true); + memcpy(data, buf.data(), buf.size()); + } ++ ++bool gguf_needs_byteswap(const struct gguf_context * ctx) { ++ return ctx->needs_byteswap; ++} diff --git a/patches/gguf.h.patch b/patches/gguf.h.patch new file mode 100644 index 0000000..540d4e1 --- /dev/null +++ b/patches/gguf.h.patch @@ -0,0 +1,23 @@ +diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h +index 79ee2020..236205c4 100644 +--- a/ggml/include/gguf.h ++++ b/ggml/include/gguf.h +@@ -74,6 +74,8 @@ extern "C" { + + // if not NULL, create a ggml_context and allocate the tensor data in it + struct ggml_context ** ctx; ++ ++ bool allow_byteswapping; + }; + + GGML_API struct gguf_context * gguf_init_empty(void); +@@ -197,6 +199,9 @@ extern "C" { + // writes the meta data to pointer "data" + GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data); + ++ // returns true if gguf file needs byteswapping when reading. Byteswapping for writing not implemented ++ GGML_API bool gguf_needs_byteswap(const struct gguf_context * ctx); ++ + #ifdef __cplusplus + } + #endif diff --git a/patches/gguf_writer.py.patch b/patches/gguf_writer.py.patch new file mode 100644 index 0000000..32e745d --- /dev/null +++ b/patches/gguf_writer.py.patch @@ -0,0 +1,24 @@ +diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py +index 4f23f9b0..d1fd51e0 100644 +--- a/gguf-py/gguf/gguf_writer.py ++++ b/gguf-py/gguf/gguf_writer.py +@@ -4,6 +4,7 @@ import logging + import os + import shutil + import struct ++import sys + import tempfile + from dataclasses import dataclass + from enum import Enum, auto +@@ -452,6 +453,11 @@ class GGUFWriter: + for ti in tensors.values(): + assert ti.tensor is not None # can only iterate once over the tensors + assert ti.tensor.nbytes == ti.nbytes ++ ++ if (self.endianess == GGUFEndian.BIG and sys.byteorder == 'little') or (self.endianess == GGUFEndian.LITTLE and sys.byteorder == 'big'): ++ # ti.tensor.byteswap(inplace=True) just didn't work here ++ ti.tensor = ti.tensor.byteswap() ++ + ti.tensor.tofile(fout) + if shard_bar is not None: + shard_bar.update(ti.nbytes) diff --git a/patches/httplib.h.patch b/patches/httplib.h.patch index 6f0f621..bd943d3 100644 --- a/patches/httplib.h.patch +++ b/patches/httplib.h.patch @@ -1,5 +1,5 @@ diff --git a/vendor/cpp-httplib/httplib.h b/vendor/cpp-httplib/httplib.h -index 0aa4e62..cbc6ece 100644 +index 0aa4e627..cbc6ece2 100644 --- a/vendor/cpp-httplib/httplib.h +++ b/vendor/cpp-httplib/httplib.h @@ -7,6 +7,7 @@ diff --git a/patches/imatrix.cpp.patch b/patches/imatrix.cpp.patch new file mode 100644 index 0000000..031ad06 --- /dev/null +++ b/patches/imatrix.cpp.patch @@ -0,0 +1,12 @@ +diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp +index 9aad3711..a11c2548 100644 +--- a/tools/imatrix/imatrix.cpp ++++ b/tools/imatrix/imatrix.cpp +@@ -700,6 +700,7 @@ bool IMatrixCollector::load_imatrix(const char * file_name) { + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ false, // the data is needed + /* .ctx = */ &ctx, 
++ /* .allow_byteswapping = */ true, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(file_name, meta_gguf_params); + if (!ctx_gguf) { diff --git a/patches/llama-adapter.cpp.patch b/patches/llama-adapter.cpp.patch new file mode 100644 index 0000000..c2465dd --- /dev/null +++ b/patches/llama-adapter.cpp.patch @@ -0,0 +1,12 @@ +diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp +index 8d94034a..ef21c55b 100644 +--- a/src/llama-adapter.cpp ++++ b/src/llama-adapter.cpp +@@ -152,6 +152,7 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_ + gguf_init_params meta_gguf_params = { + /* .no_alloc = */ true, + /* .ctx = */ &ctx_init, ++ /* .allow_byteswapping = */ true, + }; + + gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) }; diff --git a/patches/llama-context.cpp.patch b/patches/llama-context.cpp.patch index 187c22c..55c0d9b 100644 --- a/patches/llama-context.cpp.patch +++ b/patches/llama-context.cpp.patch @@ -1,37 +1,37 @@ diff --git a/src/llama-context.cpp b/src/llama-context.cpp -index 06e93b19..0db8530f 100644 +index 9e77fe6d..6b40f7fd 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp -@@ -1178,12 +1178,17 @@ int llama_context::decode(const llama_batch & batch_inp) { - } - } +@@ -1223,12 +1223,17 @@ int llama_context::decode(const llama_batch & batch_inp) { + // remember the swaps and apply them lazily upon logits/embeddings access + output_swaps.push_back({ i, j_min }); } - -+ ++ + #ifndef __MVS__ std::fill(output_ids.begin(), output_ids.end(), -1); - -+ #else -+ for (auto& id: output_ids) { -+ id = -1; -+ } ++ #else ++ for (auto& id: output_ids) { ++ id = -1; ++ } for (uint32_t i = 0; i < n_outputs; ++i) { output_ids[out_ids[i]] = i; } -+ #endif ++ #endif } } - -@@ -1264,8 +1269,10 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { + +@@ -1311,8 +1316,10 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { embd = has_embd ? 
output_base + logits_size : nullptr; - + // set all ids as invalid (negative) - std::fill(output_ids.begin(), output_ids.end(), -1); - + //std::fill(output_ids.begin(), output_ids.end(), -1); + for (auto& id: output_ids) { -+ id = -1; ++ id = -1; + } this->n_outputs = 0; - - return n_outputs_max; \ No newline at end of file + + return n_outputs_max; diff --git a/patches/llama-hparams.h.patch b/patches/llama-hparams.h.patch new file mode 100644 index 0000000..a49ab7e --- /dev/null +++ b/patches/llama-hparams.h.patch @@ -0,0 +1,12 @@ +diff --git a/src/llama-hparams.h b/src/llama-hparams.h +index 8b7e2a11..530dfea2 100644 +--- a/src/llama-hparams.h ++++ b/src/llama-hparams.h +@@ -3,6 +3,7 @@ + #include "llama.h" + + #include ++#include + + // bump if necessary + #define LLAMA_MAX_LAYERS 512 diff --git a/patches/llama-model-loader.cpp.patch b/patches/llama-model-loader.cpp.patch new file mode 100644 index 0000000..b42e3dd --- /dev/null +++ b/patches/llama-model-loader.cpp.patch @@ -0,0 +1,94 @@ +diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp +index bd9e6da8..cacb935b 100644 +--- a/src/llama-model-loader.cpp ++++ b/src/llama-model-loader.cpp +@@ -490,6 +490,7 @@ llama_model_loader::llama_model_loader( + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx, ++ /*.allow_byteswapping = */ true, + }; + + meta.reset(gguf_init_from_file(fname.c_str(), params)); +@@ -550,6 +551,7 @@ llama_model_loader::llama_model_loader( + struct gguf_init_params split_params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx, ++ /*.allow_byteswapping = */ true, + }; + gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) }; + if (!ctx_gguf) { +@@ -709,6 +711,9 @@ llama_model_loader::llama_model_loader( + if (!llama_mmap::SUPPORTED) { + LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__); + use_mmap = false; ++ } else if (gguf_needs_byteswap(meta.get())) { ++ LLAMA_LOG_WARN("%s: gguf file needs byteswapping, mmap is disabled. This may affect performance.\n", __func__); ++ use_mmap = false; + } + + this->use_mmap = use_mmap; +@@ -908,6 +913,13 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const { + const auto & file = files.at(w.idx); + file->seek(w.offs, SEEK_SET); + file->read_raw(cur->data, ggml_nbytes(cur)); ++ ++ if (gguf_needs_byteswap(meta.get())) { ++ auto byteswap = ggml_get_type_traits(cur->type)->byteswap; ++ if (byteswap != nullptr) { ++ byteswap(cur->data, ggml_nelements(cur) / ggml_blck_size(cur->type)); ++ } ++ } + } + + if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) { +@@ -1063,6 +1075,14 @@ bool llama_model_loader::load_all_data( + if (ggml_backend_buffer_is_host(cur->buffer)) { + file->seek(weight->offs, SEEK_SET); + file->read_raw(cur->data, n_size); ++ ++ if (gguf_needs_byteswap(meta.get())) { ++ auto byteswap = ggml_get_type_traits(cur->type)->byteswap; ++ if (byteswap != nullptr) { ++ byteswap(cur->data, ggml_nelements(cur) / ggml_blck_size(cur->type)); ++ } ++ } ++ + if (check_tensors) { + validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { + return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); +@@ -1075,11 +1095,20 @@ bool llama_model_loader::load_all_data( + + size_t bytes_read = 0; + ++ // for byteswapping purposes ensure that there is whole number of elements in buffer ++ const size_t buf_size_aligned = gguf_needs_byteswap(meta.get()) ? 
buffer_size - (buffer_size % ggml_blck_size(cur->type)) : buffer_size; ++ + while (bytes_read < n_size) { +- size_t read_iteration = std::min(buffer_size, n_size - bytes_read); ++ size_t read_iteration = std::min(buf_size_aligned, n_size - bytes_read); + + ggml_backend_event_synchronize(events[buffer_idx]); + file->read_raw(host_ptrs[buffer_idx], read_iteration); ++ if (gguf_needs_byteswap(meta.get())) { ++ auto byteswap = ggml_get_type_traits(cur->type)->byteswap; ++ if (byteswap != nullptr) { ++ byteswap(cur->data, ggml_nelements(cur) / ggml_blck_size(cur->type)); ++ } ++ } + ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); + ggml_backend_event_record(events[buffer_idx], upload_backend); + +@@ -1091,6 +1120,12 @@ bool llama_model_loader::load_all_data( + read_buf.resize(n_size); + file->seek(weight->offs, SEEK_SET); + file->read_raw(read_buf.data(), n_size); ++ if (gguf_needs_byteswap(meta.get())) { ++ auto byteswap = ggml_get_type_traits(cur->type)->byteswap; ++ if (byteswap != nullptr) { ++ byteswap(cur->data, ggml_nelements(cur) / ggml_blck_size(cur->type)); ++ } ++ } + ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); + if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { + throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); diff --git a/patches/miniaudio.h.patch b/patches/miniaudio.h.patch index 0b3a155..ddb0ce8 100644 --- a/patches/miniaudio.h.patch +++ b/patches/miniaudio.h.patch @@ -1,10 +1,10 @@ diff --git a/vendor/miniaudio/miniaudio.h b/vendor/miniaudio/miniaudio.h -index c74bebe..72cbafd 100644 +index c74bebeb..72cbafd5 100644 --- a/vendor/miniaudio/miniaudio.h +++ b/vendor/miniaudio/miniaudio.h @@ -3873,8 +3873,8 @@ typedef ma_uint16 wchar_t; #define MA_POSIX - + /* - Use the MA_NO_PTHREAD_IN_HEADER option at your own risk. This is intentionally undocumented. - You can use this to avoid including pthread.h in the header section. The downside is that it @@ -16,9 +16,9 @@ index c74bebe..72cbafd 100644 @@ -16176,7 +16176,7 @@ static ma_result ma_thread_create__posix(ma_thread* pThread, ma_thread_priority int result; pthread_attr_t* pAttr = NULL; - + -#if !defined(__EMSCRIPTEN__) && !defined(__3DS__) +#if !defined(__EMSCRIPTEN__) && !defined(__3DS__) && !defined(__MVS__) /* Try setting the thread priority. It's not critical if anything fails here. 
*/ pthread_attr_t attr; - if (pthread_attr_init(&attr) == 0) { \ No newline at end of file + if (pthread_attr_init(&attr) == 0) { diff --git a/patches/ops.h.patch b/patches/ops.h.patch new file mode 100644 index 0000000..e26d1d1 --- /dev/null +++ b/patches/ops.h.patch @@ -0,0 +1,12 @@ +diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h +index 3a32ec20..3677919b 100644 +--- a/ggml/src/ggml-cpu/ops.h ++++ b/ggml/src/ggml-cpu/ops.h +@@ -1,5 +1,7 @@ + #pragma once + ++#include ++ + #include "ggml.h" + + // diff --git a/patches/quantize.cpp.patch b/patches/quantize.cpp.patch new file mode 100644 index 0000000..635c4a5 --- /dev/null +++ b/patches/quantize.cpp.patch @@ -0,0 +1,12 @@ +diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp +index 45c59ecb..b99ae61d 100644 +--- a/tools/quantize/quantize.cpp ++++ b/tools/quantize/quantize.cpp +@@ -219,6 +219,7 @@ static int load_imatrix(const std::string & imatrix_file, std::vector iq4_nl_4x4_q8_0; + ++#ifndef __MVS__ + if (cur->type == GGML_TYPE_Q4_0) { + if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) { + if (cur->ne[1] % 8 == 0) { +@@ -1453,6 +1454,9 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons + } + } + } ++#endif ++ ++ GGML_LOG_DEBUG("REPACK DEBUG: %s: Repacking disabled for tensor '%s' (type %s)\n", __func__, ggml_get_name(cur) ? ggml_get_name(cur) : "unnamed", ggml_type_name(cur->type)); + + return nullptr; + } diff --git a/patches/run.cpp.patch b/patches/run.cpp.patch index d9d7c03..3cb4626 100644 --- a/patches/run.cpp.patch +++ b/patches/run.cpp.patch @@ -3,14 +3,14 @@ index 6fe728c6..9ee09d80 100644 --- a/tools/run/run.cpp +++ b/tools/run/run.cpp @@ -22,6 +22,9 @@ - + #if defined(LLAMA_USE_CURL) # include +# if defined(__MVS__) +# include +# endif #endif - + #include @@ -507,6 +510,7 @@ class HttpClient { curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); @@ -19,4 +19,4 @@ index 6fe728c6..9ee09d80 100644 + curl_easy_setopt(curl, CURLOPT_CAINFO, std::getenv("SSL_CERT_PATH")); return curl_easy_perform(curl); } - + diff --git a/patches/sgemm.h.patch b/patches/sgemm.h.patch new file mode 100644 index 0000000..181c308 --- /dev/null +++ b/patches/sgemm.h.patch @@ -0,0 +1,15 @@ +diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.h b/ggml/src/ggml-cpu/llamafile/sgemm.h +index 729e8853..c85a67dd 100644 +--- a/ggml/src/ggml-cpu/llamafile/sgemm.h ++++ b/ggml/src/ggml-cpu/llamafile/sgemm.h +@@ -3,8 +3,10 @@ + #include + + #if defined(__VXE__) || defined(__VXE2__) ++#ifndef __VEC__ + #include + #endif ++#endif + + #ifdef __cplusplus + extern "C" { diff --git a/patches/simd-mappings.h.patch b/patches/simd-mappings.h.patch new file mode 100644 index 0000000..d25ff3e --- /dev/null +++ b/patches/simd-mappings.h.patch @@ -0,0 +1,13 @@ +diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h +index b4ad68c9..22657c86 100644 +--- a/ggml/src/ggml-cpu/simd-mappings.h ++++ b/ggml/src/ggml-cpu/simd-mappings.h +@@ -1070,7 +1070,7 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { + #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL + #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE + +-#elif defined(__VXE__) || defined(__VXE2__) ++#elif defined(__VXE__) || defined(__VXE2__) || defined(__MVS__) + + #define GGML_SIMD + diff --git a/patches/simd.inprogresspatch b/patches/simd.inprogresspatch deleted file mode 100644 index fb1c651..0000000 --- a/patches/simd.inprogresspatch +++ /dev/null @@ -1,251 +0,0 @@ 
-diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
-index 66a5ad8d..a67bce53 100644
---- a/ggml/src/ggml-cpu/CMakeLists.txt
-+++ b/ggml/src/ggml-cpu/CMakeLists.txt
-@@ -480,6 +480,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
-     else()
-         message(WARNING "Unknown CPU architecture. Falling back to generic implementations.")
-         list(APPEND ARCH_FLAGS -DGGML_CPU_GENERIC)
-+        #ifdef __MVS__
-+        list(APPEND ARCH_FLAGS -mzvector)
-+        message(STATUS "-mzvector enabled")
-+        #endif
-     endif()
-
-     if (GGML_CPU_REPACK)
-diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h
-index d839cf5c..9cc46cbf 100644
---- a/ggml/src/ggml-cpu/ggml-cpu-impl.h
-+++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h
-@@ -352,7 +352,11 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
- #endif
-
- #if defined(__VXE__) || defined(__VXE2__)
-+#ifdef __MVS__
-+#include
-+#elif
- #include
-+#endif
-
- #define vec_neg(a) (-(a)) // Vector Negate
- #define vec_add(a, b) ((a) + (b)) // Vector Add
-diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
-index 11ff228f..4a1e7f63 100644
---- a/ggml/src/ggml-cpu/ggml-cpu.c
-+++ b/ggml/src/ggml-cpu/ggml-cpu.c
-@@ -17,7 +17,7 @@
-
- #if defined(_MSC_VER) || defined(__MINGW32__)
- #include // using malloc.h with MSC/MINGW
--#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
-+#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) && !defined(__MVS__)
- #include
- #endif
-
-diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
-index c9daa4c3..63839d07 100644
---- a/ggml/src/ggml-cpu/ggml-cpu.cpp
-+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
-@@ -33,6 +33,38 @@
- #    include
- #endif
-
-+#define CVT_PTR 0x10
-+#define CVTRCEP_OFFSET 0x490
-+#define RCEAFC_OFFSET 0x088
-+
-+typedef unsigned data_area_ptr_assign_type;
-+
-+typedef union {
-+    struct {
-+#if defined(_LP64)
-+        data_area_ptr_assign_type lower;
-+#endif
-+        data_area_ptr_assign_type assign;
-+    };
-+    char* deref;
-+} data_area_ptr;
-+
-+uint64_t get_free_memory(void) {
-+    uint64_t freeram;
-+    data_area_ptr cvt = {0};
-+    data_area_ptr rcep = {0};
-+    cvt.assign = *(data_area_ptr_assign_type*)(CVT_PTR);
-+    rcep.assign = *(data_area_ptr_assign_type*)(cvt.deref + CVTRCEP_OFFSET);
-+    freeram = (uint64_t)*((uint32_t*)(rcep.deref + RCEAFC_OFFSET)) * 4096;
-+
-+    return freeram;
-+}
-+
-+uint64_t get_total_memory(void) {
-+    /* Use CVTRLSTG to get the size of actual real storage online at IPL in K.
*/
-+    return (uint64_t)((int)((char *__ptr32 *__ptr32 *)0)[4][214]) * 1024;
-+}
-+
- // ggml-backend interface
-
- std::vector& ggml_backend_cpu_get_extra_buffers_type() {
-@@ -43,7 +75,7 @@ std::vector& ggml_backend_cpu_get_extra_buffers_type
-     if (ggml_backend_amx_buffer_type()) {
-         bufts.push_back(ggml_backend_amx_buffer_type());
-     }
--#endif
-+#endif
-
- #ifdef GGML_USE_CPU_KLEIDIAI
-     if (ggml_backend_cpu_kleidiai_buffer_type()) {
-@@ -340,6 +372,9 @@ static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t *
-     GlobalMemoryStatusEx(&status);
-     *total = status.ullTotalPhys;
-     *free = status.ullAvailPhys;
-+#elifdef __MVS__
-+    *total = get_total_memory();
-+    *free = get_free_memory();
- #else
-     long pages = sysconf(_SC_PHYS_PAGES);
-     long page_size = sysconf(_SC_PAGE_SIZE);
-diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
-index ed61869a..1bb64b0f 100644
---- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
-+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
-@@ -194,7 +194,7 @@ inline float hsum(float16x8_t x) {
-
- #if defined(__VXE__) || defined(__VXE2__)
- inline float hsum(float32x4_t x) {
--    float32x4_t tmp = x + vec_reve(x);
-+    float32x4_t tmp = x + (float32x4_t)vec_reve((__vector int)x);
-     return tmp[0] + tmp[1];
- }
- #endif
-@@ -256,10 +256,12 @@ template <> inline float32x4_t load(const ggml_fp16_t * p) {
-         tmp[i] = GGML_CPU_FP16_TO_FP32(p[i]);
-     }
-
--    return vec_xl(0, (const float *)(tmp));
-+    //return vec_xl(0, (const float *)(tmp));
-+    return GGML_VEC_LOAD(tmp);
- }
- template <> inline float32x4_t load(const float * p) {
--    return vec_xl(0, p);
-+    //return vec_xl(0, p);
-+    return GGML_VEC_LOAD(p);
- }
- #endif
-
-diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.h b/ggml/src/ggml-cpu/llamafile/sgemm.h
-index 729e8853..5a35157b 100644
---- a/ggml/src/ggml-cpu/llamafile/sgemm.h
-+++ b/ggml/src/ggml-cpu/llamafile/sgemm.h
-@@ -3,7 +3,7 @@
- #include
-
- #if defined(__VXE__) || defined(__VXE2__)
--#include
-+//#include
- #endif
-
- #ifdef __cplusplus
-diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
-index 2f153419..3a459782 100644
---- a/ggml/src/ggml-cpu/ops.cpp
-+++ b/ggml/src/ggml-cpu/ops.cpp
-@@ -1,3 +1,9 @@
-+#ifdef GGML_MASS
-+#include
-+#define cosf(a) cosd2(a)
-+#define sinf(a) sind2(a)
-+#endif
-+
- #include "ops.h"
-
- #include "ggml-cpu.h"
-diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h
-index b4ad68c9..a25387d4 100644
---- a/ggml/src/ggml-cpu/simd-mappings.h
-+++ b/ggml/src/ggml-cpu/simd-mappings.h
-@@ -2,6 +2,18 @@
-
- #include "ggml-cpu-impl.h"
-
-+#if defined(__MVS__)
-+    #define GGML_VEC_LOAD(ptr) (vec_xl(0, (int32_t *)(ptr)))
-+    #define GGML_VEC_STORE(vec, ptr) vec_xst((__vector int)(vec), 0, (int32_t *)(ptr))
-+#else
-+    #define GGML_VEC_LOAD(ptr) vec_xl(0, (const float *)(ptr))
-+    #define GGML_VEC_STORE(vec, ptr) vec_xst((vec), 0, (float *)(ptr))
-+#endif
-+
-+#if defined(__lcbb)
-+# undef __lcbb
-+#endif
-+
- #ifdef __ARM_FEATURE_SVE
- #include
- #endif // __ARM_FEATURE_SVE
-@@ -1080,10 +1092,10 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
- #define GGML_F32_EPR 4
-
- #define GGML_F32x4 float32x4_t
--#define GGML_F32x4_ZERO vec_splats(0.0f)
-+#define GGML_F32x4_ZERO (float32x4_t)vec_splats((int32_t)0.0f)
- #define GGML_F32x4_SET1 vec_splats
--#define GGML_F32x4_LOAD(p) vec_xl(0, p)
--#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
-+#define GGML_F32x4_LOAD(p) GGML_VEC_LOAD(p)
-+#define GGML_F32x4_STORE(p, r) GGML_VEC_STORE(r, p)
-
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
- #define GGML_F32x4_ADD vec_add
- #define GGML_F32x4_MUL vec_mul
-@@ -1101,8 +1113,8 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
-         for (int i = 0; i < offset; ++i) { \
-             x[i] = vec_add(x[i], x[offset + i]); \
-         } \
--        float32x4_t tmp = x[0] + vec_reve(x[0]); \
--        res = tmp[0] + tmp[1]; \
-+        float32x4_t tmp = x[0] + (float32x4_t)vec_reve((__vector int)x[0]); \
-+        res = (ggml_float)tmp[0] + (ggml_float)tmp[1]; \
-     }
-
- #define GGML_F32_VEC GGML_F32x4
-@@ -1133,7 +1145,7 @@ static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
-
-     // note: keep type-cast here to prevent compiler bugs
-     // see: https://github.com/ggml-org/llama.cpp/issues/12846
--    return vec_xl(0, (const float *)(tmp));
-+    return GGML_VEC_LOAD(tmp);
- #endif
- }
-
-@@ -1152,7 +1164,7 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
-
-     // note: keep type-cast here to prevent compiler bugs
-     // see: https://github.com/ggml-org/llama.cpp/issues/12846
--    vec_xst(v_y, 0, (float *)(arr));
-+    GGML_VEC_STORE(v_y, arr);
-
-     for (int i = 0; i < 4; i++) {
-         x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
-diff --git a/ggml/src/ggml-cpu/unary-ops.cpp b/ggml/src/ggml-cpu/unary-ops.cpp
-index 4fce569b..9da79838 100644
---- a/ggml/src/ggml-cpu/unary-ops.cpp
-+++ b/ggml/src/ggml-cpu/unary-ops.cpp
-@@ -1,3 +1,7 @@
-+#ifdef GGML_MASS
-+#include
-+#endif
-+
- #include "unary-ops.h"
-
- static inline float op_abs(float x) {
-diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp
-index a8156011..7640e3b1 100644
---- a/ggml/src/ggml-cpu/vec.cpp
-+++ b/ggml/src/ggml-cpu/vec.cpp
-@@ -1,3 +1,7 @@
-+#ifdef GGML_MASS
-+#include
-+#endif
-+
- #include "vec.h"
-
- #include
diff --git a/patches/stb_image.h.patch b/patches/stb_image.h.patch
index ea445df..62bb590 100644
--- a/patches/stb_image.h.patch
+++ b/patches/stb_image.h.patch
@@ -1,11 +1,11 @@
 diff --git a/vendor/stb/stb_image.h b/vendor/stb/stb_image.h
-index 9eedabe..92d5251 100644
+index 9eedabed..92d52515 100644
 --- a/vendor/stb/stb_image.h
 +++ b/vendor/stb/stb_image.h
 @@ -620,6 +620,10 @@ STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch
  #define stbi_inline __forceinline
  #endif
-
+
 +#ifdef __MVS__
 +    #define STBI_NO_THREAD_LOCALS
 +#endif
@@ -16,7 +16,7 @@ index 9eedabe..92d5251 100644
 @@ -721,6 +725,10 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ?
1 : -1];
  #define STBI_NO_SIMD
  #endif
-
+
 +#if defined(__MVS__)
 +#include
 +#endif
@@ -27,7 +27,7 @@ index 9eedabe..92d5251 100644
 @@ -963,11 +971,33 @@ static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
  static int stbi__pnm_is16(stbi__context *s);
  #endif
-
+
 +#if defined(__MVS__)
 +static __tlssim stbi__g_failure_reason_tls("");
 +#define stbi__g_failure_reason (*stbi__g_failure_reason_tls.access())
@@ -55,6 +55,6 @@ index 9eedabe..92d5251 100644
  #endif
  const char *stbi__g_failure_reason;
 +#endif
-
+
  STBIDEF const char *stbi_failure_reason(void)
- {
\ No newline at end of file
+ {
diff --git a/patches/test-gguf.cpp.patch b/patches/test-gguf.cpp.patch
new file mode 100644
index 0000000..fb945f2
--- /dev/null
+++ b/patches/test-gguf.cpp.patch
@@ -0,0 +1,20 @@
+diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp
+index 3f0c312e..f7c66fb6 100644
+--- a/tests/test-gguf.cpp
++++ b/tests/test-gguf.cpp
+@@ -715,6 +715,7 @@ static std::pair test_handcrafted_file(const unsigned int seed) {
+     struct gguf_init_params gguf_params = {
+         /*no_alloc =*/ false,
+         /*ctx      =*/ hft >= offset_has_data ? &ctx : nullptr,
++        /*.allow_byteswapping = */ true,
+     };
+
+     struct gguf_context * gguf_ctx = gguf_init_from_file_impl(file, gguf_params);
+@@ -1111,6 +1112,7 @@ static std::pair test_roundtrip(ggml_backend_dev_t dev, const unsigned
+     struct gguf_init_params gguf_params = {
+         /*no_alloc =*/ false,
+         /*ctx      =*/ only_meta ? nullptr : &ctx_1,
++        /*.allow_byteswapping = */ true,
+     };
+     struct gguf_context * gguf_ctx_1 = gguf_init_from_file_impl(file, gguf_params);
+
diff --git a/patches/unary-ops.h.patch b/patches/unary-ops.h.patch
new file mode 100644
index 0000000..87a7b8e
--- /dev/null
+++ b/patches/unary-ops.h.patch
@@ -0,0 +1,12 @@
+diff --git a/ggml/src/ggml-cpu/unary-ops.h b/ggml/src/ggml-cpu/unary-ops.h
+index b1ade2c8..eb525682 100644
+--- a/ggml/src/ggml-cpu/unary-ops.h
++++ b/ggml/src/ggml-cpu/unary-ops.h
+@@ -1,5 +1,7 @@
+ #pragma once
+
++#include
++
+ #include "common.h"
+
+ #ifdef __cplusplus
diff --git a/patches/unicode.h.patch b/patches/unicode.h.patch
new file mode 100644
index 0000000..ef33778
--- /dev/null
+++ b/patches/unicode.h.patch
@@ -0,0 +1,65 @@
+diff --git a/src/unicode.h b/src/unicode.h
+index 0a5fa2a7..ca8a9011 100644
+--- a/src/unicode.h
++++ b/src/unicode.h
+@@ -15,6 +15,10 @@ struct unicode_cpt_flags {
+         SYMBOL = 0x0040, // regex: \p{S}
+         CONTROL = 0x0080, // regex: \p{C}
+         MASK_CATEGORIES = 0x00FF,
++        WHITESPACE = 0x0100,
++        LOWERCASE = 0x0200,
++        UPPERCASE = 0x0400,
++        NFD = 0x0800,
+     };
+
+     // codepoint type
+@@ -34,11 +38,49 @@ struct unicode_cpt_flags {
+
+     // decode from uint16
+     inline unicode_cpt_flags(const uint16_t flags = 0) {
++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+         *reinterpret_cast(this) = flags;
++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
++        is_undefined = (flags & UNDEFINED) ? 1 : 0;
++        is_number = (flags & NUMBER) ? 1 : 0;
++        is_letter = (flags & LETTER) ? 1 : 0;
++        is_separator = (flags & SEPARATOR) ? 1 : 0;
++        is_accent_mark = (flags & ACCENT_MARK) ? 1 : 0;
++        is_punctuation = (flags & PUNCTUATION) ? 1 : 0;
++        is_symbol = (flags & SYMBOL) ? 1 : 0;
++        is_control = (flags & CONTROL) ? 1 : 0;
++        is_whitespace = (flags & WHITESPACE) ? 1 : 0;
++        is_lowercase = (flags & LOWERCASE) ? 1 : 0;
++        is_uppercase = (flags & UPPERCASE) ? 1 : 0;
++        is_nfd = (flags & NFD) ?
1 : 0;
++#else // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
++#error Unexpected or undefined __BYTE_ORDER__
++#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+     }
+
+     inline uint16_t as_uint() const {
++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+         return *reinterpret_cast(this);
++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
++        uint16_t result =
++            is_undefined * UNDEFINED
++            + is_number * NUMBER
++            + is_letter * LETTER
++            + is_separator * SEPARATOR
++            + is_accent_mark * ACCENT_MARK
++            + is_punctuation * PUNCTUATION
++            + is_symbol * SYMBOL
++            + is_control * CONTROL
++            + is_whitespace * WHITESPACE
++            + is_lowercase * LOWERCASE
++            + is_uppercase * UPPERCASE
++            + is_nfd * NFD
++        ;
++
++        return result;
++#else // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
++#error Unexpected or undefined __BYTE_ORDER__
++#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+     }
+
+     inline uint16_t category_flag() const {
diff --git a/patches/vec.h.patch b/patches/vec.h.patch
new file mode 100644
index 0000000..b7b53f5
--- /dev/null
+++ b/patches/vec.h.patch
@@ -0,0 +1,10 @@
+diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h
+index c432c990..b144a518 100644
+--- a/ggml/src/ggml-cpu/vec.h
++++ b/ggml/src/ggml-cpu/vec.h
+@@ -1,4 +1,5 @@
+ // Vectorized functions for fundamental operations
++#include
+
+ #pragma once
+