diff --git a/README.md b/README.md
index 70d5085..ed27e80 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 # llama.cpp
 
-Enable AI inferencing on z/os
+Enable AI inferencing on z/OS
 
 # Installation and Usage
@@ -27,8 +27,9 @@ See the [zopen porting guide](https://zopen.community/#/Guides/Porting) for more
 
 # Documentation
 
-
 # Troubleshooting
+If an error is encountered in the `ggml-cpu.cpp` file while building (possibly pthread-related), run `zopen upgrade zoslib -y` and try building again.
+
 # Contributing
 
-Contributions are welcome! Please follow the [zopen contribution guidelines](https://github.com/zopencommunity/meta/blob/main/CONTRIBUTING.md).
\ No newline at end of file
+Contributions are welcome! Please follow the [zopen contribution guidelines](https://github.com/zopencommunity/meta/blob/main/CONTRIBUTING.md).
diff --git a/buildenv b/buildenv
index 68766ee..a6f6c1d 100644
--- a/buildenv
+++ b/buildenv
@@ -3,21 +3,23 @@ export ZOPEN_STABLE_DEPS="zoslib make cmake curl"
 export ZOPEN_DEV_URL="https://github.com/ggerganov/llama.cpp.git"
 export ZOPEN_DEV_DEPS="zoslib make cmake curl openssl libssh2 zlib libpsl"
 export ZOPEN_CATEGORIES="ai"
-export ZOPEN_DEV_TAG="master"
+export ZOPEN_DEV_TAG="b6027"
 export ZOPEN_NAME="llamacpp-master"
 export ZOPEN_RUNTIME_DEPS="ncurses"
-# rm -f "llama"
-# ln -s "llama.cpp" "llama"
-# ln -s "llama.cpp" $ZOPEN_NAME
+# export ZOPEN_SKIP_ZOSLIB_ENV_HOOK=1
+
+rm -f "llama"
+ln -s "llama.cpp" "llama"
+ln -s "llama.cpp" $ZOPEN_NAME
 
 export ZOPEN_COMP="CLANG"
 
 # set env variables
 # export CURL_HOME="/data/zopen/usr/local/zopen/curl/curl"
-# export BLAS_HOME="/usr/lpp/cbclib"
+export BLAS_HOME="/usr/lpp/cbclib"
 
 export ZOPEN_CONFIGURE="cmake"
-export ZOPEN_CONFIGURE_OPTS="-B ../build --install-prefix \"\$ZOPEN_INSTALL_DIR/\" -DCURL_LIBRARY=\$CURL_HOME/lib/libcurl.a -DCURL_INCLUDE_DIR=\$CURL_HOME/include -DBUILD_SHARED_LIBS_DEFAULT=OFF -DBUILD_SHARED_LIBS=OFF -DGGML_STATIC=ON -DGGML_BACKEND_DL=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS=\$BLAS_HOME/include/openblas -DBLAS_LIBRARIES=\$BLAS_HOME/lib/libopenblas.so -DLLAMA_BUILD_TESTS=ON ."
+export ZOPEN_CONFIGURE_OPTS="-B ../build --install-prefix \"\$ZOPEN_INSTALL_DIR/\" -DCMAKE_C_FLAGS=\"-fzvector -m64 -march=z15 -lmass.arch13\" -DCMAKE_C_STANDARD=11 -DCMAKE_C_STANDARD_REQUIRED=ON -DCMAKE_C_EXTENSIONS=OFF -DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_SSL=ON -DOPENSSL_ROOT_DIR=\$OPENSSL_HOME -DCURL_LIBRARY=\$CURL_HOME/lib/libcurl.a -DCURL_INCLUDE_DIR=\$CURL_HOME/include -DBUILD_SHARED_LIBS_DEFAULT=OFF -DBUILD_SHARED_LIBS=OFF -DGGML_STATIC=ON -DGGML_BACKEND_DL=OFF -DGGML_OPENBLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS=/usr/lpp/cbclib/include/openblas -DBLAS_LIBRARIES=/usr/lpp/cbclib/lib/libopenblas.so -DLLAMA_BUILD_TESTS=ON -DLLAMA_CURL=ON ."
export ZOPEN_MAKE="cmake" export ZOPEN_MAKE_OPTS="--build ../build --parallel \$ZOPEN_NUM_JOBS --config Release" @@ -39,10 +41,10 @@ zopen_check_results() dir="$1" pfx="$2" chk="$1/$2_check.log" - + if [[ -f "$chk" ]]; then - total=$(grep -cE "Test #[0-9]+" "$chk") - failed=$(grep -cE "Failed|Subprocess aborted" "$chk") + total=$(grep -cE "Test +#" "$chk") + failed=$(grep -cE "\*\*\*Failed|Subprocess aborted\*\*\*" "$chk") skipped=$(grep -c "Skipped" "$chk") passed=$((total - failed - skipped)) else @@ -57,15 +59,32 @@ zopen_check_results() echo "actualPassed:$passed" echo "actualSkipped:$skipped" echo "totalTests:$total" - echo "expectedFailures:0" + echo "expectedFailures:3" echo "expectedTotalTests:$total" } -zopen_append_to_env() +zopen_pre_check() { - # echo envars outside of PATH, MANPATH, LIBPATH +# unset SSL_CERT_FILE +unset SSL_CERT_PATH +export SSL_CERT_PATH=$(curl-config --ca) +export CFLAGS="$CFLAGS -march=z15 -mzvector" +export CPPFLAGS="$CPPFLAGS -march=z15" } +# zopen_append_to_env() +# { +# export SSL_CERT_PATH=$(curl-config --ca) +# # echo envars outside of PATH, MANPATH, LIBPATH +# } + +# zopen_append_to_zoslib_env() +# { +# cat< { onClick={handleChatSubmit} disabled={isTyping} > - ➤ + ➤ diff --git a/patches/CMakeLists.txt.patch b/patches/CMakeLists.txt.patch new file mode 100644 index 0000000..f6b3e81 --- /dev/null +++ b/patches/CMakeLists.txt.patch @@ -0,0 +1,53 @@ +diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt +index 66a5ad8d..f71c7dec 100644 +--- a/ggml/src/ggml-cpu/CMakeLists.txt ++++ b/ggml/src/ggml-cpu/CMakeLists.txt +@@ -51,7 +51,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) + + target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17) + target_include_directories(${GGML_CPU_NAME} PRIVATE . ggml-cpu) +- ++ list(APPEND ARCH_FLAGS -fzvector -m64 -march=z15) + if (APPLE AND GGML_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate) + if (ACCELERATE_FRAMEWORK) +@@ -94,7 +94,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_HBM) + + target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind) +- endif() ++ endif() + + if (GGML_SYSTEM_ARCH STREQUAL "ARM") + message(STATUS "ARM detected") +@@ -463,11 +463,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name) + message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.") + list(APPEND ARCH_FLAGS -march=native -mtune=native) + endif() +- + if (GGML_VXE) + message(STATUS "VX/VXE/VXE2 enabled") + list(APPEND ARCH_FLAGS -mvx -mzvector) +- list(APPEND ARCH_DEFINITIONS GGML_VXE) ++ list(APPEND ARCH_DEFINITIONS GGML_VXE) + endif() + + if (GGML_NNPA) +@@ -480,6 +479,17 @@ function(ggml_add_cpu_backend_variant_impl tag_name) + else() + message(WARNING "Unknown CPU architecture. 
Falling back to generic implementations.") + list(APPEND ARCH_FLAGS -DGGML_CPU_GENERIC) ++ list(APPEND ARCH_DEFINITIONS GGML_VXE) ++ message(STATUS "Added GGML_VXE Flag") ++ if(NOT DEFINED TARGET_ARCH) ++ set(TARGET_ARCH 13) ++ endif() ++ if(TARGET_ARCH GREATER 13) ++ target_compile_options(${GGML_CPU_NAME} PRIVATE "-qarch=${TARGET_ARCH}") ++ endif() ++ target_include_directories(${GGML_CPU_NAME} PRIVATE /usr/include) ++ target_link_libraries(${GGML_CPU_NAME} PRIVATE "/usr/lpp/cbclib/lib/libmass.arch${TARGET_ARCH}.a") ++ message(STATUS "Found MASS: /usr/lpp/cbclib/lib/libmass.arch${TARGET_ARCH}.a") + endif() + + if (GGML_CPU_REPACK) diff --git a/patches/arg.cpp.patch b/patches/arg.cpp.patch index 8878008..e586009 100644 --- a/patches/arg.cpp.patch +++ b/patches/arg.cpp.patch @@ -1,5 +1,5 @@ diff --git a/common/arg.cpp b/common/arg.cpp -index 40af7e57..46fec792 100644 +index 06005359..28220aac 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -36,6 +36,9 @@ @@ -10,7 +10,7 @@ index 40af7e57..46fec792 100644 +# include +# endif #endif - + using json = nlohmann::ordered_json; @@ -195,6 +198,8 @@ bool common_has_curl() { # endif @@ -30,7 +30,17 @@ index 40af7e57..46fec792 100644 http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp"); // Check if hf-token or bearer-token was specified if (!bearer_token.empty()) { -@@ -569,6 +574,7 @@ std::pair> common_remote_get_content(const std::string & +@@ -506,7 +511,8 @@ static bool common_download_model( + struct gguf_init_params gguf_params = { + /*.no_alloc = */ true, + /*.ctx = */ NULL, +- }; ++ /* .allow_byteswapping = */ true, ++ }; + auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params); + if (!ctx_gguf) { + LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.path.c_str()); +@@ -569,6 +575,7 @@ std::pair> common_remote_get_content(const std::string & curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L); diff --git a/patches/clip.cpp.patch b/patches/clip.cpp.patch new file mode 100644 index 0000000..fa3ccf9 --- /dev/null +++ b/patches/clip.cpp.patch @@ -0,0 +1,12 @@ +diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp +index a4b62f9a..2d89d4b1 100644 +--- a/tools/mtmd/clip.cpp ++++ b/tools/mtmd/clip.cpp +@@ -2028,6 +2028,7 @@ struct clip_model_loader { + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &meta, ++ /*.allow_byteswapping = */ true, + }; + + ctx_gguf = gguf_context_ptr(gguf_init_from_file(fname, params)); diff --git a/patches/common.cpp.patch b/patches/common.cpp.patch index 872bd9f..ee69674 100644 --- a/patches/common.cpp.patch +++ b/patches/common.cpp.patch @@ -1,8 +1,8 @@ diff --git a/common/common.cpp b/common/common.cpp -index 4cc40ed..234ad95 100644 +index d8c4d988..00aa7d43 100644 --- a/common/common.cpp +++ b/common/common.cpp -@@ -851,7 +851,7 @@ std::string fs_get_cache_directory() { +@@ -874,7 +874,7 @@ std::string fs_get_cache_directory() { if (getenv("LLAMA_CACHE")) { cache_directory = std::getenv("LLAMA_CACHE"); } else { @@ -11,3 +11,11 @@ index 4cc40ed..234ad95 100644 if (std::getenv("XDG_CACHE_HOME")) { cache_directory = std::getenv("XDG_CACHE_HOME"); } else { +@@ -1436,6 +1436,7 @@ static common_control_vector_data common_control_vector_load_one(const common_co + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ false, + /* .ctx = */ &ctx, ++ /* .allow_byteswapping = */ true, + }; + struct gguf_context * 
ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params); + if (!ctx_gguf) { diff --git a/patches/convert-llama2c-to-ggml.cpp.patch b/patches/convert-llama2c-to-ggml.cpp.patch new file mode 100644 index 0000000..a80a89c --- /dev/null +++ b/patches/convert-llama2c-to-ggml.cpp.patch @@ -0,0 +1,12 @@ +diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +index bdf0eed2..b36170bc 100644 +--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ++++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +@@ -535,6 +535,7 @@ static void load_vocab(const char * filename, const Config * config, struct my_l + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ &ctx_data, ++ /*.allow_byteswapping = */ true, + }; + + struct gguf_context * ctx = gguf_init_from_file(filename, params); diff --git a/patches/examples_gguf.cpp.patch b/patches/examples_gguf.cpp.patch new file mode 100644 index 0000000..49e6b43 --- /dev/null +++ b/patches/examples_gguf.cpp.patch @@ -0,0 +1,12 @@ +diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp +index f31989c8..94ac3e5a 100644 +--- a/examples/gguf/gguf.cpp ++++ b/examples/gguf/gguf.cpp +@@ -87,6 +87,7 @@ static bool gguf_ex_read_0(const std::string & fname) { + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ NULL, ++ /*.allow_byteswapping = */ true, + }; + + struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); diff --git a/patches/export-lora.cpp.patch b/patches/export-lora.cpp.patch new file mode 100644 index 0000000..44efe88 --- /dev/null +++ b/patches/export-lora.cpp.patch @@ -0,0 +1,12 @@ +diff --git a/tools/export-lora/export-lora.cpp b/tools/export-lora/export-lora.cpp +index f038019b..bf750ab9 100644 +--- a/tools/export-lora/export-lora.cpp ++++ b/tools/export-lora/export-lora.cpp +@@ -50,6 +50,7 @@ static struct gguf_context * load_gguf(std::string & fname, struct ggml_context + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ ctx_ggml, ++ /*.allow_byteswapping = */ true, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params); + if (!ctx_gguf) { diff --git a/patches/ggml-backend-reg.cpp.patch b/patches/ggml-backend-reg.cpp.patch index 964827c..466e10e 100644 --- a/patches/ggml-backend-reg.cpp.patch +++ b/patches/ggml-backend-reg.cpp.patch @@ -1,15 +1,15 @@ diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp -index 405d8e3..b3682a9 100644 +index f0cdac31..29247c6a 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp -@@ -556,7 +556,9 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, +@@ -561,7 +561,9 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, } - + void ggml_backend_load_all() { - ggml_backend_load_all_from_path(nullptr); +#ifdef GGML_BACKEND_DL + ggml_backend_load_all_from_path(nullptr); +#endif } - + void ggml_backend_load_all_from_path(const char * dir_path) { diff --git a/patches/ggml-cpu-impl.h.patch b/patches/ggml-cpu-impl.h.patch new file mode 100644 index 0000000..eb0c5f2 --- /dev/null +++ b/patches/ggml-cpu-impl.h.patch @@ -0,0 +1,31 @@ +diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h +index d839cf5c..3f8532f7 100644 +--- a/ggml/src/ggml-cpu/ggml-cpu-impl.h ++++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h +@@ -68,6 +68,15 @@ struct ggml_compute_params { + #endif 
// __VXE2__ + #endif // __s390x__ && __VEC__ + ++#if defined(__MVS__) && defined(__VEC__) ++#ifndef __VXE__ ++#define __VXE__ ++#endif // __VXE__ ++#ifndef __VXE2__ ++#define __VXE2__ ++#endif // __VXE2__ ++#endif // __MVS__ && __VEC__ ++ + #if defined(__s390x__) && defined(GGML_NNPA) + #ifndef __NNPA__ + #define __NNPA__ +@@ -352,8 +361,9 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) + #endif + + #if defined(__VXE__) || defined(__VXE2__) ++#ifndef __VEC__ + #include +- ++#endif + #define vec_neg(a) (-(a)) // Vector Negate + #define vec_add(a, b) ((a) + (b)) // Vector Add + #define vec_sub(a, b) ((a) - (b)) // Vector Subtract diff --git a/patches/ggml-cpu.c.patch b/patches/ggml-cpu.c.patch index 0592551..b545a79 100644 --- a/patches/ggml-cpu.c.patch +++ b/patches/ggml-cpu.c.patch @@ -1,12 +1,13 @@ diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c -index c7426df..e074799 100644 +index c5271b77..6a547e4b 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -17,7 +17,7 @@ - + #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW -#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) +#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) && !defined(__MVS__) #include #endif + diff --git a/patches/ggml-cpu.cpp.patch b/patches/ggml-cpu.cpp.patch index 81540ae..e7aa9e1 100644 --- a/patches/ggml-cpu.cpp.patch +++ b/patches/ggml-cpu.cpp.patch @@ -1,14 +1,14 @@ diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp -index e013e8b..6290b22 100644 +index c9daa4c3..63839d07 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -33,6 +33,38 @@ # include #endif - -+#define CVT_PTR 0x10 -+#define CVTRCEP_OFFSET 0x490 -+#define RCEAFC_OFFSET 0x088 + ++#define CVT_PTR 0x10 ++#define CVTRCEP_OFFSET 0x490 ++#define RCEAFC_OFFSET 0x088 + +typedef unsigned data_area_ptr_assign_type; + @@ -17,7 +17,7 @@ index e013e8b..6290b22 100644 +#if defined(_LP64) + data_area_ptr_assign_type lower; +#endif -+ data_area_ptr_assign_type assign; ++ data_area_ptr_assign_type assign; + }; + char* deref; +} data_area_ptr; @@ -39,15 +39,15 @@ index e013e8b..6290b22 100644 +} + // ggml-backend interface - + std::vector& ggml_backend_cpu_get_extra_buffers_type() { @@ -43,7 +75,7 @@ std::vector& ggml_backend_cpu_get_extra_buffers_type if (ggml_backend_amx_buffer_type()) { bufts.push_back(ggml_backend_amx_buffer_type()); } -#endif -+#endif - ++#endif + #ifdef GGML_USE_CPU_KLEIDIAI if (ggml_backend_cpu_kleidiai_buffer_type()) { @@ -340,6 +372,9 @@ static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * @@ -59,4 +59,4 @@ index e013e8b..6290b22 100644 + *free = get_free_memory(); #else long pages = sysconf(_SC_PHYS_PAGES); - long page_size = sysconf(_SC_PAGE_SIZE); \ No newline at end of file + long page_size = sysconf(_SC_PAGE_SIZE); diff --git a/patches/ggml-impl.h.patch b/patches/ggml-impl.h.patch new file mode 100644 index 0000000..4de578e --- /dev/null +++ b/patches/ggml-impl.h.patch @@ -0,0 +1,74 @@ +diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h +index a2e30994..d7500273 100644 +--- a/ggml/src/ggml-impl.h ++++ b/ggml/src/ggml-impl.h +@@ -28,6 +28,18 @@ + #include + #endif + ++#if defined(__gnu_linux__) ++#include ++#elif defined(__MVS__) ++#define bswap_16(x) __builtin_bswap16(x) ++#define bswap_32(x) __builtin_bswap32(x) ++#define bswap_64(x) __builtin_bswap64(x) ++#else ++#define 
bswap_16(x) (x) ++#define bswap_32(x) (x) ++#define bswap_64(x) (x) ++#endif // defined(__gnu_linux__) ++ + #ifdef __cplusplus + extern "C" { + #endif +@@ -484,6 +496,18 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { + #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x) + #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x) + ++static inline void ggml_bswap16(void * value) { ++ *((uint16_t*)value) = bswap_16(*((uint16_t*)value)); ++} ++ ++static inline void ggml_bswap32(void * value) { ++ *((uint32_t*)value) = bswap_32(*((uint32_t*)value)); ++} ++ ++static inline void ggml_bswap64(void * value) { ++ *((uint64_t*)value) = bswap_64(*((uint64_t*)value)); ++} ++ + // return true if the node's results are only used by N other nodes + // and can be fused into their calculations. + static inline bool ggml_node_has_n_uses(const struct ggml_cgraph * cgraph, int node_idx, int32_t n_uses) { +@@ -548,6 +572,31 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx + #ifdef __cplusplus + #include + #include ++#include ++ ++template = 0> ++static inline void ggml_bswap(T * value) ++{ ++ GGML_UNUSED(value); ++} ++ ++template = 0> ++static inline void ggml_bswap(T * value) ++{ ++ ggml_bswap16(value); ++} ++ ++template = 0> ++static inline void ggml_bswap(T * value) ++{ ++ ggml_bswap32(value); ++} ++ ++template = 0> ++static inline void ggml_bswap(T * value) ++{ ++ ggml_bswap64(value); ++} + + // nicer C++ syntax for ggml_can_fuse + inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list ops) { diff --git a/patches/ggml.c.patch b/patches/ggml.c.patch index cc37a78..40fa28a 100644 --- a/patches/ggml.c.patch +++ b/patches/ggml.c.patch @@ -1,11 +1,11 @@ diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c -index 97da26b3..212faf9c 100644 +index 124cf3e8..14487858 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -14,9 +14,13 @@ #include #endif - + +#if defined(__MVS__) +#include +#endif @@ -16,19 +16,19 @@ index 97da26b3..212faf9c 100644 +#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) && !defined(__MVS__) #include #endif - + @@ -299,6 +303,15 @@ void * ggml_aligned_malloc(size_t size) { - + #if defined(_MSC_VER) || defined(__MINGW32__) return _aligned_malloc(size, alignment); +#elif defined(__MVS__) +// void * aligned_memory = NULL; +// if (size ==0) -+// size = 1; ++// size = 1; +// aligned_memory = malloc(size); +// int result = 0; +// if (aligned_memory == NULL) -+// result = errno; ++// result = errno; + return __aligned_malloc(size, alignment); #else if (size == 0) { @@ -41,4 +41,505 @@ index 97da26b3..212faf9c 100644 + __aligned_free(ptr); #elif GGML_USE_CPU_HBM if (ptr != NULL) { - hbw_free(ptr); \ No newline at end of file + hbw_free(ptr); +@@ -586,12 +601,43 @@ static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const fl + static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc); + static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc); + ++static void ggml_byteswap_i16 (void * restrict buffer, size_t elements); ++static void ggml_byteswap_i32 (void * restrict buffer, size_t elements); ++static void ggml_byteswap_i64 (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q4_0 (void * restrict buffer, size_t elements); 
++static void ggml_byteswap_q4_1 (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q5_0 (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q5_1 (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q8_0 (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q8_1 (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q2_k (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q3_k (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q4_k (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q5_k (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q6_k (void * restrict buffer, size_t elements); ++static void ggml_byteswap_iq2_xxs (void * restrict buffer, size_t elements); ++static void ggml_byteswap_iq2_xs (void * restrict buffer, size_t elements); ++static void ggml_byteswap_iq3_xxs (void * restrict buffer, size_t elements); ++static void ggml_byteswap_iq3_s (void * restrict buffer, size_t elements); ++static void ggml_byteswap_iq2_s (void * restrict buffer, size_t elements); ++static void ggml_byteswap_iq1_s (void * restrict buffer, size_t elements); ++static void ggml_byteswap_iq4_nl (void * restrict buffer, size_t elements); ++static void ggml_byteswap_iq4_xs (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q8_k (void * restrict buffer, size_t elements); ++static void ggml_byteswap_q4_0_4x4(void * restrict buffer, size_t elements); ++static void ggml_byteswap_q4_0_4x8(void * restrict buffer, size_t elements); ++static void ggml_byteswap_q4_0_8x8(void * restrict buffer, size_t elements); ++static void ggml_byteswap_tq1_0 (void * restrict buffer, size_t elements); ++static void ggml_byteswap_tq2_0 (void * restrict buffer, size_t elements); ++ ++//byteswap functions enabled for all except i16 and iq1_m + static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + [GGML_TYPE_I8] = { + .type_name = "i8", + .blck_size = 1, + .type_size = sizeof(int8_t), + .is_quantized = false, ++ .byteswap = ggml_byteswap_i16, + }, + [GGML_TYPE_I16] = { + .type_name = "i16", +@@ -604,24 +650,28 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .blck_size = 1, + .type_size = sizeof(int32_t), + .is_quantized = false, ++ .byteswap = ggml_byteswap_i32, + }, + [GGML_TYPE_I64] = { + .type_name = "i64", + .blck_size = 1, + .type_size = sizeof(int64_t), + .is_quantized = false, ++ .byteswap = ggml_byteswap_i64, + }, + [GGML_TYPE_F64] = { + .type_name = "f64", + .blck_size = 1, + .type_size = sizeof(double), + .is_quantized = false, ++ .byteswap = ggml_byteswap_i64, + }, + [GGML_TYPE_F32] = { + .type_name = "f32", + .blck_size = 1, + .type_size = sizeof(float), + .is_quantized = false, ++ .byteswap = ggml_byteswap_i32, + }, + [GGML_TYPE_F16] = { + .type_name = "f16", +@@ -630,6 +680,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = false, + .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row, + .from_float_ref = (ggml_from_float_t) ggml_fp32_to_fp16_row, ++ .byteswap = ggml_byteswap_i16, + }, + [GGML_TYPE_Q4_0] = { + .type_name = "q4_0", +@@ -638,6 +689,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q4_0, + .from_float_ref = (ggml_from_float_t) quantize_row_q4_0_ref, ++ .byteswap = ggml_byteswap_q4_0, + }, + [GGML_TYPE_Q4_1] = { + .type_name = "q4_1", +@@ -646,6 
+698,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q4_1, + .from_float_ref = (ggml_from_float_t) quantize_row_q4_1_ref, ++ .byteswap = ggml_byteswap_q4_1, + }, + [4] = { // GGML_TYPE_Q4_2 + .type_name = "DEPRECATED", +@@ -666,6 +719,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q5_0, + .from_float_ref = (ggml_from_float_t) quantize_row_q5_0_ref, ++ .byteswap = ggml_byteswap_q5_0, + }, + [GGML_TYPE_Q5_1] = { + .type_name = "q5_1", +@@ -674,6 +728,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q5_1, + .from_float_ref = (ggml_from_float_t) quantize_row_q5_1_ref, ++ .byteswap = ggml_byteswap_q5_1, + }, + [GGML_TYPE_Q8_0] = { + .type_name = "q8_0", +@@ -682,6 +737,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q8_0, + .from_float_ref = (ggml_from_float_t) quantize_row_q8_0_ref, ++ .byteswap = ggml_byteswap_q8_0, + }, + [GGML_TYPE_Q8_1] = { + .type_name = "q8_1", +@@ -689,6 +745,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .type_size = sizeof(block_q8_1), + .is_quantized = true, + .from_float_ref = (ggml_from_float_t) quantize_row_q8_1_ref, ++ .byteswap = ggml_byteswap_q8_1, + }, + [GGML_TYPE_Q2_K] = { + .type_name = "q2_K", +@@ -697,6 +754,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q2_K, + .from_float_ref = (ggml_from_float_t) quantize_row_q2_K_ref, ++ .byteswap = ggml_byteswap_q2_k, + }, + [GGML_TYPE_Q3_K] = { + .type_name = "q3_K", +@@ -705,6 +763,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q3_K, + .from_float_ref = (ggml_from_float_t) quantize_row_q3_K_ref, ++ .byteswap = ggml_byteswap_q3_k, + }, + [GGML_TYPE_Q4_K] = { + .type_name = "q4_K", +@@ -713,6 +772,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q4_K, + .from_float_ref = (ggml_from_float_t) quantize_row_q4_K_ref, ++ .byteswap = ggml_byteswap_q4_k, + }, + [GGML_TYPE_Q5_K] = { + .type_name = "q5_K", +@@ -721,6 +781,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q5_K, + .from_float_ref = (ggml_from_float_t) quantize_row_q5_K_ref, ++ .byteswap = ggml_byteswap_q5_k, + }, + [GGML_TYPE_Q6_K] = { + .type_name = "q6_K", +@@ -729,6 +790,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q6_K, + .from_float_ref = (ggml_from_float_t) quantize_row_q6_K_ref, ++ .byteswap = ggml_byteswap_q6_k, + }, + [GGML_TYPE_IQ2_XXS] = { + .type_name = "iq2_xxs", +@@ -737,6 +799,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq2_xxs, + .from_float_ref = NULL, ++ .byteswap = ggml_byteswap_iq2_xxs, + }, + [GGML_TYPE_IQ2_XS] = { + .type_name = "iq2_xs", +@@ -745,6 +808,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized 
= true, + .to_float = (ggml_to_float_t) dequantize_row_iq2_xs, + .from_float_ref = NULL, ++ .byteswap = ggml_byteswap_iq2_xs, + }, + [GGML_TYPE_IQ3_XXS] = { + .type_name = "iq3_xxs", +@@ -753,6 +817,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq3_xxs, + .from_float_ref = (ggml_from_float_t)quantize_row_iq3_xxs_ref, ++ .byteswap = ggml_byteswap_iq3_xxs, + }, + [GGML_TYPE_IQ3_S] = { + .type_name = "iq3_s", +@@ -761,6 +826,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq3_s, + .from_float_ref = (ggml_from_float_t)quantize_row_iq3_s_ref, ++ .byteswap = ggml_byteswap_iq3_s, + }, + [GGML_TYPE_IQ2_S] = { + .type_name = "iq2_s", +@@ -769,6 +835,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq2_s, + .from_float_ref = (ggml_from_float_t)quantize_row_iq2_s_ref, ++ .byteswap = ggml_byteswap_iq2_s, + }, + [GGML_TYPE_IQ1_S] = { + .type_name = "iq1_s", +@@ -777,6 +844,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq1_s, + .from_float_ref = NULL, ++ .byteswap = ggml_byteswap_iq1_s, + }, + [GGML_TYPE_IQ1_M] = { + .type_name = "iq1_m", +@@ -793,6 +861,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq4_nl, + .from_float_ref = (ggml_from_float_t)quantize_row_iq4_nl_ref, ++ .byteswap = ggml_byteswap_iq4_nl, + }, + [GGML_TYPE_IQ4_XS] = { + .type_name = "iq4_xs", +@@ -801,12 +870,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq4_xs, + .from_float_ref = (ggml_from_float_t)quantize_row_iq4_xs_ref, ++ .byteswap = ggml_byteswap_iq4_xs, + }, + [GGML_TYPE_Q8_K] = { + .type_name = "q8_K", + .blck_size = QK_K, + .type_size = sizeof(block_q8_K), + .is_quantized = true, ++ .byteswap = ggml_byteswap_q8_k, + }, + [GGML_TYPE_BF16] = { + .type_name = "bf16", +@@ -815,24 +886,28 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = false, + .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row, + .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref, ++ .byteswap = ggml_byteswap_i16, + }, + [31] = { // GGML_TYPE_Q4_0_4_4 + .type_name = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, ++ .byteswap = ggml_byteswap_q4_0_4x4, + }, + [32] = { // GGML_TYPE_Q4_0_4_8 + .type_name = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, ++ .byteswap = ggml_byteswap_q4_0_4x8, + }, + [33] = { // GGML_TYPE_Q4_0_8_8 + .type_name = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, ++ .byteswap = ggml_byteswap_q4_0_8x8, + }, + [GGML_TYPE_TQ1_0] = { + .type_name = "tq1_0", +@@ -841,6 +916,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_tq1_0, + .from_float_ref = (ggml_from_float_t) quantize_row_tq1_0_ref, ++ .byteswap = ggml_byteswap_tq1_0, + }, + [GGML_TYPE_TQ2_0] = { + .type_name = "tq2_0", +@@ -849,6 +925,7 @@ 
static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_tq2_0, + .from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref, ++ .byteswap = ggml_byteswap_tq2_0, + }, + [36] = { // GGML_TYPE_IQ4_NL_4_4 + .type_name = "TYPE_IQ4_NL_4_4 REMOVED, use IQ4_NL with runtime repacking", +@@ -6944,3 +7021,215 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons + if (p0->strict_cpu != p1->strict_cpu ) return false; + return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; + } ++ ++static void ggml_byteswap_i16(void * restrict buffer, size_t elements) { ++ uint16_t *data_ptr = (uint16_t*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(data_ptr + i); ++ } ++} ++ ++static void ggml_byteswap_i32(void * restrict buffer, size_t elements) { ++ uint32_t *data_ptr = (uint32_t*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap32(data_ptr + i); ++ } ++} ++ ++static void ggml_byteswap_i64(void * restrict buffer, size_t elements) { ++ uint64_t *data_ptr = (uint64_t*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap64(data_ptr + i); ++ } ++} ++ ++static void ggml_byteswap_q4_0(void * restrict buffer, size_t elements) { ++ block_q4_0 *data_ptr = (block_q4_0*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ } ++} ++ ++static void ggml_byteswap_q4_1(void * restrict buffer, size_t elements) { ++ block_q4_1 *data_ptr = (block_q4_1*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ ggml_bswap16(&(data_ptr[i].m)); ++ } ++} ++ ++static void ggml_byteswap_q5_0(void * restrict buffer, size_t elements) { ++ block_q5_0 *data_ptr = (block_q5_0*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ } ++} ++ ++static void ggml_byteswap_q5_1(void * restrict buffer, size_t elements) { ++ block_q5_1 *data_ptr = (block_q5_1*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ ggml_bswap16(&(data_ptr[i].m)); ++ } ++} ++ ++static void ggml_byteswap_q8_0(void * restrict buffer, size_t elements) { ++ block_q8_0 *data_ptr = (block_q8_0*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ } ++} ++ ++static void ggml_byteswap_q8_1(void * restrict buffer, size_t elements) { ++ block_q8_1 *data_ptr = (block_q8_1*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ ggml_bswap16(&(data_ptr[i].s)); ++ } ++} ++ ++static void ggml_byteswap_q2_k(void * restrict buffer, size_t elements) { ++ block_q2_K *data_ptr = (block_q2_K*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ ggml_bswap16(&(data_ptr[i].dmin)); ++ } ++} ++ ++static void ggml_byteswap_q3_k(void * restrict buffer, size_t elements) { ++ block_q3_K *data_ptr = (block_q3_K*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ } ++} ++ ++static void ggml_byteswap_q4_k(void * restrict buffer, size_t elements) { ++ block_q4_K *data_ptr = (block_q4_K*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ ggml_bswap16(&(data_ptr[i].dmin)); ++ } ++} ++ ++static void ggml_byteswap_q5_k(void * restrict buffer, size_t elements) { ++ block_q5_K *data_ptr = (block_q5_K*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ 
ggml_bswap16(&(data_ptr[i].dmin)); ++ } ++} ++ ++static void ggml_byteswap_q6_k(void * restrict buffer, size_t elements) { ++ block_q6_K *data_ptr = (block_q6_K*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ } ++} ++ ++static void ggml_byteswap_iq2_xxs(void * restrict buffer, size_t elements) { ++ block_iq2_xxs *data_ptr = (block_iq2_xxs*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ for (size_t j = 0; j < QK_K/8; ++j) { ++ ggml_bswap16(&(data_ptr[i].qs[j])); ++ } ++ } ++} ++ ++static void ggml_byteswap_iq2_xs(void * restrict buffer, size_t elements) { ++ block_iq2_xs *data_ptr = (block_iq2_xs*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ for (size_t j = 0; j < QK_K/8; ++j) { ++ ggml_bswap16(&(data_ptr[i].qs[j])); ++ } ++ } ++} ++ ++static void ggml_byteswap_iq3_xxs(void * restrict buffer, size_t elements) { ++ block_iq3_xxs *data_ptr = (block_iq3_xxs*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ } ++} ++ ++static void ggml_byteswap_iq3_s(void * restrict buffer, size_t elements) { ++ block_iq3_s *data_ptr = (block_iq3_s*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ } ++} ++ ++static void ggml_byteswap_iq2_s(void * restrict buffer, size_t elements) { ++ block_iq2_s *data_ptr = (block_iq2_s*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ } ++} ++ ++static void ggml_byteswap_iq1_s(void * restrict buffer, size_t elements) { ++ block_iq1_s *data_ptr = (block_iq1_s*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ for (size_t j = 0; j < QK_K/32; ++j) { ++ ggml_bswap16(&(data_ptr[i].qh[j])); ++ } ++ } ++} ++ ++static void ggml_byteswap_iq4_nl(void * restrict buffer, size_t elements) { ++ block_iq4_nl *data_ptr = (block_iq4_nl*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ } ++} ++ ++static void ggml_byteswap_iq4_xs(void * restrict buffer, size_t elements) { ++ block_iq4_xs *data_ptr = (block_iq4_xs*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ ggml_bswap16(&(data_ptr[i].scales_h)); ++ } ++} ++ ++static void ggml_byteswap_q8_k(void * restrict buffer, size_t elements) { ++ block_q8_K *data_ptr = (block_q8_K*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap32(&(data_ptr[i].d)); ++ for (size_t j = 0; j < QK_K/16; ++j) { ++ ggml_bswap16(&(data_ptr[i].bsums[j])); ++ } ++ } ++} ++ ++static void ggml_byteswap_q4_0_4x4(void * restrict buffer, size_t elements) { ++ GGML_ASSERT(false && "function ggml_byteswap_q4_0_4x4 is not implemented yet"); ++ UNUSED(buffer); ++ UNUSED(elements); ++} ++ ++static void ggml_byteswap_q4_0_4x8(void * restrict buffer, size_t elements) { ++ GGML_ASSERT(false && "function ggml_byteswap_q4_0_4x8 is not implemented yet"); ++ UNUSED(buffer); ++ UNUSED(elements); ++} ++ ++static void ggml_byteswap_q4_0_8x8(void * restrict buffer, size_t elements) { ++ GGML_ASSERT(false && "function ggml_byteswap_q4_0_8x8 is not implemented yet"); ++ UNUSED(buffer); ++ UNUSED(elements); ++} ++ ++static void ggml_byteswap_tq1_0(void * restrict buffer, size_t elements) { ++ block_tq1_0 *data_ptr = (block_tq1_0*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ } ++} ++ ++static void ggml_byteswap_tq2_0(void * restrict buffer, size_t elements) { ++ 
block_tq2_0 *data_ptr = (block_tq2_0*) buffer; ++ for (size_t i = 0; i < elements; ++i) { ++ ggml_bswap16(&(data_ptr[i].d)); ++ } ++} diff --git a/patches/ggml.h.patch b/patches/ggml.h.patch new file mode 100644 index 0000000..e4c985f --- /dev/null +++ b/patches/ggml.h.patch @@ -0,0 +1,20 @@ +diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h +index 8a8775be..cfada317 100644 +--- a/ggml/include/ggml.h ++++ b/ggml/include/ggml.h +@@ -2355,6 +2355,7 @@ extern "C" { + #endif + typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); + typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); ++ typedef void (*ggml_byteswap_t) (void * GGML_RESTRICT buffer, size_t elements); + + struct ggml_type_traits { + const char * type_name; +@@ -2364,6 +2365,7 @@ extern "C" { + bool is_quantized; + ggml_to_float_t to_float; + ggml_from_float_t from_float_ref; ++ ggml_byteswap_t byteswap; + }; + + GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type); diff --git a/patches/gguf-hash.cpp.patch b/patches/gguf-hash.cpp.patch new file mode 100644 index 0000000..a8e9629 --- /dev/null +++ b/patches/gguf-hash.cpp.patch @@ -0,0 +1,12 @@ +diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp +index 9523ec12..952d03f8 100644 +--- a/examples/gguf-hash/gguf-hash.cpp ++++ b/examples/gguf-hash/gguf-hash.cpp +@@ -290,6 +290,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) { + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ &ctx_data, ++ /*.allow_byteswapping = */ true, + }; + + // xxh64 init diff --git a/patches/gguf-split.cpp.patch b/patches/gguf-split.cpp.patch new file mode 100644 index 0000000..7622e14 --- /dev/null +++ b/patches/gguf-split.cpp.patch @@ -0,0 +1,80 @@ +diff --git a/tools/gguf-split/gguf-split.cpp b/tools/gguf-split/gguf-split.cpp +index 30e77156..0472e387 100644 +--- a/tools/gguf-split/gguf-split.cpp ++++ b/tools/gguf-split/gguf-split.cpp +@@ -328,14 +328,20 @@ struct split_strategy { + const char * t_name = gguf_get_tensor_name(ctx_out, i); + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); + auto n_bytes = ggml_nbytes(t); +- read_buf.resize(n_bytes); ++ auto n_elements = ggml_nelements(t) / ggml_blck_size(t->type); ++ read_buf.resize(n_bytes); + + // calculate offset + auto i_tensor_in = gguf_find_tensor(ctx_gguf, t_name); // idx of tensor in the input file + auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in); + ++ ggml_byteswap_t byteswap_func = nullptr; ++ if (gguf_needs_byteswap(ctx_gguf)) { ++ byteswap_func = ggml_get_type_traits(t->type)->byteswap; ++ } ++ + // copy tensor from input to output file +- copy_file_to_file(f_input, fout, offset, n_bytes); ++ copy_file_to_file(f_input, fout, offset, n_bytes, n_elements, byteswap_func); + zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes); + } + +@@ -346,13 +352,18 @@ struct split_strategy { + } + } + +- void copy_file_to_file(std::ifstream & f_in, std::ofstream & f_out, const size_t in_offset, const size_t len) { ++ void copy_file_to_file(std::ifstream & f_in, std::ofstream & f_out, const size_t in_offset, const size_t len, const size_t elements, ggml_byteswap_t byteswap_func) { + // TODO: detect OS and use copy_file_range() here for better performance + if (read_buf.size() < len) { + read_buf.resize(len); + } + f_in.seekg(in_offset); + f_in.read((char *)read_buf.data(), len); ++ ++ if 
(byteswap_func != nullptr) { ++ byteswap_func(read_buf.data(), elements); ++ } ++ + f_out.write((const char *)read_buf.data(), len); + } + }; +@@ -363,6 +374,7 @@ static void gguf_split(const split_params & split_params) { + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx_meta, ++ /*.allow_byteswapping = */ true, + }; + + std::ifstream f_input(split_params.input.c_str(), std::ios::binary); +@@ -426,6 +438,7 @@ static void gguf_merge(const split_params & split_params) { + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx_meta, ++ /*.allow_byteswapping = */ true, + }; + + if (i_split > 0) { +@@ -539,7 +552,15 @@ static void gguf_merge(const split_params & split_params) { + auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor); + f_input.seekg(offset); + f_input.read((char *)read_data.data(), n_bytes); +- if (!split_params.dry_run) { ++ ++ if (gguf_needs_byteswap(ctx_gguf)) { ++ auto byteswap = ggml_get_type_traits(t->type)->byteswap; ++ if (byteswap != nullptr) { ++ byteswap(read_data.data(), ggml_nelements(t) / ggml_blck_size(t->type)); ++ } ++ } ++ ++ if (!split_params.dry_run) { + // write tensor data + padding + fout.write((const char *)read_data.data(), n_bytes); + zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes); diff --git a/patches/gguf.cpp.patch b/patches/gguf.cpp.patch new file mode 100644 index 0000000..299677c --- /dev/null +++ b/patches/gguf.cpp.patch @@ -0,0 +1,86 @@ +diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp +index 53504399..e71bd790 100644 +--- a/ggml/src/gguf.cpp ++++ b/ggml/src/gguf.cpp +@@ -214,16 +214,22 @@ struct gguf_context { + size_t size = 0; // size of `data` in bytes + + void * data = nullptr; ++ bool needs_byteswap = false; //only for reading, writing in non-native endianness is not supported + }; + + struct gguf_reader { + FILE * file; ++ bool do_byteswap = false; + + gguf_reader(FILE * file) : file(file) {} + + template + bool read(T & dst) const { +- return fread(&dst, 1, sizeof(dst), file) == sizeof(dst); ++ auto res = fread(&dst, 1, sizeof(dst), file); ++ if (do_byteswap) { ++ ggml_bswap(&dst); ++ } ++ return res == sizeof(dst); + } + + template +@@ -278,7 +284,13 @@ struct gguf_reader { + return false; + } + dst.resize(size); +- return fread(dst.data(), 1, dst.length(), file) == dst.length(); ++ // return fread(dst.data(), 1, dst.length(), file) == dst.length(); ++ std::vector temp(size); ++ if (fread(temp.data(), 1, size, file) != size) { ++ return false; ++ } ++ dst.assign(temp.begin(), temp.end()); ++ return true; + } + + bool read(void * dst, const size_t size) const { +@@ -317,7 +329,7 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vectorversion)) { ++ uint32_t original_version = ctx->version; ++ ++ if ((params.allow_byteswapping) && ((ctx->version & 0x0000FFFF) == 0) && ((ctx->version & 0xFFFF0000) != 0)) { ++ // most likely different endianness, do byteswapping ++ gr.do_byteswap = true; ++ ctx->needs_byteswap = true; ++ ggml_bswap(&(ctx->version)); ++ GGML_LOG_INFO("%s: Entered byteswapping, corrected version from %" PRIu32 "to new GGUF file version %" PRIu32 "\n", __func__, original_version, ctx->version); ++ } ++ + if (ok && ctx->version == 0) { + GGML_LOG_ERROR("%s: bad GGUF version: %" PRIu32 "\n", __func__, ctx->version); + ok = false; +@@ -363,7 +385,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par + * the last 4 hexadecimal digits to check if the model is the same + * 
endianness as the host system. + */ +- if (ok && (ctx->version & 0x0000FFFF) == 0x00000000) { ++ if (ok && !ctx->needs_byteswap && (ctx->version & 0x0000FFFF) == 0x00000000) { + GGML_LOG_ERROR("%s: failed to load model: this GGUF file version %" PRIu32 " is extremely large, is there a mismatch between the host and model endianness?\n", __func__, ctx->version); + ok = false; + } +@@ -1356,3 +1378,7 @@ void gguf_get_meta_data(const struct gguf_context * ctx, void * data) { + gguf_write_to_buf(ctx, buf, /*only_meta =*/ true); + memcpy(data, buf.data(), buf.size()); + } ++ ++bool gguf_needs_byteswap(const struct gguf_context * ctx) { ++ return ctx->needs_byteswap; ++} diff --git a/patches/gguf.h.patch b/patches/gguf.h.patch new file mode 100644 index 0000000..540d4e1 --- /dev/null +++ b/patches/gguf.h.patch @@ -0,0 +1,23 @@ +diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h +index 79ee2020..236205c4 100644 +--- a/ggml/include/gguf.h ++++ b/ggml/include/gguf.h +@@ -74,6 +74,8 @@ extern "C" { + + // if not NULL, create a ggml_context and allocate the tensor data in it + struct ggml_context ** ctx; ++ ++ bool allow_byteswapping; + }; + + GGML_API struct gguf_context * gguf_init_empty(void); +@@ -197,6 +199,9 @@ extern "C" { + // writes the meta data to pointer "data" + GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data); + ++ // returns true if gguf file needs byteswapping when reading. Byteswapping for writing not implemented ++ GGML_API bool gguf_needs_byteswap(const struct gguf_context * ctx); ++ + #ifdef __cplusplus + } + #endif diff --git a/patches/gguf_writer.py.patch b/patches/gguf_writer.py.patch new file mode 100644 index 0000000..32e745d --- /dev/null +++ b/patches/gguf_writer.py.patch @@ -0,0 +1,24 @@ +diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py +index 4f23f9b0..d1fd51e0 100644 +--- a/gguf-py/gguf/gguf_writer.py ++++ b/gguf-py/gguf/gguf_writer.py +@@ -4,6 +4,7 @@ import logging + import os + import shutil + import struct ++import sys + import tempfile + from dataclasses import dataclass + from enum import Enum, auto +@@ -452,6 +453,11 @@ class GGUFWriter: + for ti in tensors.values(): + assert ti.tensor is not None # can only iterate once over the tensors + assert ti.tensor.nbytes == ti.nbytes ++ ++ if (self.endianess == GGUFEndian.BIG and sys.byteorder == 'little') or (self.endianess == GGUFEndian.LITTLE and sys.byteorder == 'big'): ++ # ti.tensor.byteswap(inplace=True) just didn't work here ++ ti.tensor = ti.tensor.byteswap() ++ + ti.tensor.tofile(fout) + if shard_bar is not None: + shard_bar.update(ti.nbytes) diff --git a/patches/httplib.h.patch b/patches/httplib.h.patch index 6f0f621..bd943d3 100644 --- a/patches/httplib.h.patch +++ b/patches/httplib.h.patch @@ -1,5 +1,5 @@ diff --git a/vendor/cpp-httplib/httplib.h b/vendor/cpp-httplib/httplib.h -index 0aa4e62..cbc6ece 100644 +index 0aa4e627..cbc6ece2 100644 --- a/vendor/cpp-httplib/httplib.h +++ b/vendor/cpp-httplib/httplib.h @@ -7,6 +7,7 @@ diff --git a/patches/imatrix.cpp.patch b/patches/imatrix.cpp.patch new file mode 100644 index 0000000..031ad06 --- /dev/null +++ b/patches/imatrix.cpp.patch @@ -0,0 +1,12 @@ +diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp +index 9aad3711..a11c2548 100644 +--- a/tools/imatrix/imatrix.cpp ++++ b/tools/imatrix/imatrix.cpp +@@ -700,6 +700,7 @@ bool IMatrixCollector::load_imatrix(const char * file_name) { + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ false, // the data is needed + /* .ctx = */ &ctx, 
++ /* .allow_byteswapping = */ true, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(file_name, meta_gguf_params); + if (!ctx_gguf) { diff --git a/patches/llama-adapter.cpp.patch b/patches/llama-adapter.cpp.patch new file mode 100644 index 0000000..c2465dd --- /dev/null +++ b/patches/llama-adapter.cpp.patch @@ -0,0 +1,12 @@ +diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp +index 8d94034a..ef21c55b 100644 +--- a/src/llama-adapter.cpp ++++ b/src/llama-adapter.cpp +@@ -152,6 +152,7 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_ + gguf_init_params meta_gguf_params = { + /* .no_alloc = */ true, + /* .ctx = */ &ctx_init, ++ /* .allow_byteswapping = */ true, + }; + + gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) }; diff --git a/patches/llama-context.cpp.patch b/patches/llama-context.cpp.patch index 187c22c..55c0d9b 100644 --- a/patches/llama-context.cpp.patch +++ b/patches/llama-context.cpp.patch @@ -1,37 +1,37 @@ diff --git a/src/llama-context.cpp b/src/llama-context.cpp -index 06e93b19..0db8530f 100644 +index 9e77fe6d..6b40f7fd 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp -@@ -1178,12 +1178,17 @@ int llama_context::decode(const llama_batch & batch_inp) { - } - } +@@ -1223,12 +1223,17 @@ int llama_context::decode(const llama_batch & batch_inp) { + // remember the swaps and apply them lazily upon logits/embeddings access + output_swaps.push_back({ i, j_min }); } - -+ ++ + #ifndef __MVS__ std::fill(output_ids.begin(), output_ids.end(), -1); - -+ #else -+ for (auto& id: output_ids) { -+ id = -1; -+ } ++ #else ++ for (auto& id: output_ids) { ++ id = -1; ++ } for (uint32_t i = 0; i < n_outputs; ++i) { output_ids[out_ids[i]] = i; } -+ #endif ++ #endif } } - -@@ -1264,8 +1269,10 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { + +@@ -1311,8 +1316,10 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { embd = has_embd ? 
output_base + logits_size : nullptr; - + // set all ids as invalid (negative) - std::fill(output_ids.begin(), output_ids.end(), -1); - + //std::fill(output_ids.begin(), output_ids.end(), -1); + for (auto& id: output_ids) { -+ id = -1; ++ id = -1; + } this->n_outputs = 0; - - return n_outputs_max; \ No newline at end of file + + return n_outputs_max; diff --git a/patches/llama-hparams.h.patch b/patches/llama-hparams.h.patch new file mode 100644 index 0000000..a49ab7e --- /dev/null +++ b/patches/llama-hparams.h.patch @@ -0,0 +1,12 @@ +diff --git a/src/llama-hparams.h b/src/llama-hparams.h +index 8b7e2a11..530dfea2 100644 +--- a/src/llama-hparams.h ++++ b/src/llama-hparams.h +@@ -3,6 +3,7 @@ + #include "llama.h" + + #include ++#include + + // bump if necessary + #define LLAMA_MAX_LAYERS 512 diff --git a/patches/llama-model-loader.cpp.patch b/patches/llama-model-loader.cpp.patch new file mode 100644 index 0000000..b42e3dd --- /dev/null +++ b/patches/llama-model-loader.cpp.patch @@ -0,0 +1,94 @@ +diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp +index bd9e6da8..cacb935b 100644 +--- a/src/llama-model-loader.cpp ++++ b/src/llama-model-loader.cpp +@@ -490,6 +490,7 @@ llama_model_loader::llama_model_loader( + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx, ++ /*.allow_byteswapping = */ true, + }; + + meta.reset(gguf_init_from_file(fname.c_str(), params)); +@@ -550,6 +551,7 @@ llama_model_loader::llama_model_loader( + struct gguf_init_params split_params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx, ++ /*.allow_byteswapping = */ true, + }; + gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) }; + if (!ctx_gguf) { +@@ -709,6 +711,9 @@ llama_model_loader::llama_model_loader( + if (!llama_mmap::SUPPORTED) { + LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__); + use_mmap = false; ++ } else if (gguf_needs_byteswap(meta.get())) { ++ LLAMA_LOG_WARN("%s: gguf file needs byteswapping, mmap is disabled. This may affect performance.\n", __func__); ++ use_mmap = false; + } + + this->use_mmap = use_mmap; +@@ -908,6 +913,13 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const { + const auto & file = files.at(w.idx); + file->seek(w.offs, SEEK_SET); + file->read_raw(cur->data, ggml_nbytes(cur)); ++ ++ if (gguf_needs_byteswap(meta.get())) { ++ auto byteswap = ggml_get_type_traits(cur->type)->byteswap; ++ if (byteswap != nullptr) { ++ byteswap(cur->data, ggml_nelements(cur) / ggml_blck_size(cur->type)); ++ } ++ } + } + + if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) { +@@ -1063,6 +1075,14 @@ bool llama_model_loader::load_all_data( + if (ggml_backend_buffer_is_host(cur->buffer)) { + file->seek(weight->offs, SEEK_SET); + file->read_raw(cur->data, n_size); ++ ++ if (gguf_needs_byteswap(meta.get())) { ++ auto byteswap = ggml_get_type_traits(cur->type)->byteswap; ++ if (byteswap != nullptr) { ++ byteswap(cur->data, ggml_nelements(cur) / ggml_blck_size(cur->type)); ++ } ++ } ++ + if (check_tensors) { + validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { + return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); +@@ -1075,11 +1095,20 @@ bool llama_model_loader::load_all_data( + + size_t bytes_read = 0; + ++ // for byteswapping purposes ensure that there is whole number of elements in buffer ++ const size_t buf_size_aligned = gguf_needs_byteswap(meta.get()) ? 
buffer_size - (buffer_size % ggml_blck_size(cur->type)) : buffer_size; ++ + while (bytes_read < n_size) { +- size_t read_iteration = std::min(buffer_size, n_size - bytes_read); ++ size_t read_iteration = std::min(buf_size_aligned, n_size - bytes_read); + + ggml_backend_event_synchronize(events[buffer_idx]); + file->read_raw(host_ptrs[buffer_idx], read_iteration); ++ if (gguf_needs_byteswap(meta.get())) { ++ auto byteswap = ggml_get_type_traits(cur->type)->byteswap; ++ if (byteswap != nullptr) { ++ byteswap(cur->data, ggml_nelements(cur) / ggml_blck_size(cur->type)); ++ } ++ } + ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); + ggml_backend_event_record(events[buffer_idx], upload_backend); + +@@ -1091,6 +1120,12 @@ bool llama_model_loader::load_all_data( + read_buf.resize(n_size); + file->seek(weight->offs, SEEK_SET); + file->read_raw(read_buf.data(), n_size); ++ if (gguf_needs_byteswap(meta.get())) { ++ auto byteswap = ggml_get_type_traits(cur->type)->byteswap; ++ if (byteswap != nullptr) { ++ byteswap(cur->data, ggml_nelements(cur) / ggml_blck_size(cur->type)); ++ } ++ } + ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); + if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { + throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); diff --git a/patches/miniaudio.h.patch b/patches/miniaudio.h.patch index 0b3a155..ddb0ce8 100644 --- a/patches/miniaudio.h.patch +++ b/patches/miniaudio.h.patch @@ -1,10 +1,10 @@ diff --git a/vendor/miniaudio/miniaudio.h b/vendor/miniaudio/miniaudio.h -index c74bebe..72cbafd 100644 +index c74bebeb..72cbafd5 100644 --- a/vendor/miniaudio/miniaudio.h +++ b/vendor/miniaudio/miniaudio.h @@ -3873,8 +3873,8 @@ typedef ma_uint16 wchar_t; #define MA_POSIX - + /* - Use the MA_NO_PTHREAD_IN_HEADER option at your own risk. This is intentionally undocumented. - You can use this to avoid including pthread.h in the header section. The downside is that it @@ -16,9 +16,9 @@ index c74bebe..72cbafd 100644 @@ -16176,7 +16176,7 @@ static ma_result ma_thread_create__posix(ma_thread* pThread, ma_thread_priority int result; pthread_attr_t* pAttr = NULL; - + -#if !defined(__EMSCRIPTEN__) && !defined(__3DS__) +#if !defined(__EMSCRIPTEN__) && !defined(__3DS__) && !defined(__MVS__) /* Try setting the thread priority. It's not critical if anything fails here. 
*/ pthread_attr_t attr; - if (pthread_attr_init(&attr) == 0) { \ No newline at end of file + if (pthread_attr_init(&attr) == 0) { diff --git a/patches/ops.h.patch b/patches/ops.h.patch new file mode 100644 index 0000000..e26d1d1 --- /dev/null +++ b/patches/ops.h.patch @@ -0,0 +1,12 @@ +diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h +index 3a32ec20..3677919b 100644 +--- a/ggml/src/ggml-cpu/ops.h ++++ b/ggml/src/ggml-cpu/ops.h +@@ -1,5 +1,7 @@ + #pragma once + ++#include ++ + #include "ggml.h" + + // diff --git a/patches/quantize.cpp.patch b/patches/quantize.cpp.patch new file mode 100644 index 0000000..635c4a5 --- /dev/null +++ b/patches/quantize.cpp.patch @@ -0,0 +1,12 @@ +diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp +index 45c59ecb..b99ae61d 100644 +--- a/tools/quantize/quantize.cpp ++++ b/tools/quantize/quantize.cpp +@@ -219,6 +219,7 @@ static int load_imatrix(const std::string & imatrix_file, std::vector iq4_nl_4x4_q8_0; + ++#ifndef __MVS__ + if (cur->type == GGML_TYPE_Q4_0) { + if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) { + if (cur->ne[1] % 8 == 0) { +@@ -1453,6 +1454,9 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons + } + } + } ++#endif ++ ++ GGML_LOG_DEBUG("REPACK DEBUG: %s: Repacking disabled for tensor '%s' (type %s)\n", __func__, ggml_get_name(cur) ? ggml_get_name(cur) : "unnamed", ggml_type_name(cur->type)); + + return nullptr; + } diff --git a/patches/run.cpp.patch b/patches/run.cpp.patch index d9d7c03..3cb4626 100644 --- a/patches/run.cpp.patch +++ b/patches/run.cpp.patch @@ -3,14 +3,14 @@ index 6fe728c6..9ee09d80 100644 --- a/tools/run/run.cpp +++ b/tools/run/run.cpp @@ -22,6 +22,9 @@ - + #if defined(LLAMA_USE_CURL) # include +# if defined(__MVS__) +# include +# endif #endif - + #include @@ -507,6 +510,7 @@ class HttpClient { curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); @@ -19,4 +19,4 @@ index 6fe728c6..9ee09d80 100644 + curl_easy_setopt(curl, CURLOPT_CAINFO, std::getenv("SSL_CERT_PATH")); return curl_easy_perform(curl); } - + diff --git a/patches/sgemm.h.patch b/patches/sgemm.h.patch new file mode 100644 index 0000000..181c308 --- /dev/null +++ b/patches/sgemm.h.patch @@ -0,0 +1,15 @@ +diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.h b/ggml/src/ggml-cpu/llamafile/sgemm.h +index 729e8853..c85a67dd 100644 +--- a/ggml/src/ggml-cpu/llamafile/sgemm.h ++++ b/ggml/src/ggml-cpu/llamafile/sgemm.h +@@ -3,8 +3,10 @@ + #include + + #if defined(__VXE__) || defined(__VXE2__) ++#ifndef __VEC__ + #include + #endif ++#endif + + #ifdef __cplusplus + extern "C" { diff --git a/patches/simd-mappings.h.patch b/patches/simd-mappings.h.patch new file mode 100644 index 0000000..d25ff3e --- /dev/null +++ b/patches/simd-mappings.h.patch @@ -0,0 +1,13 @@ +diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h +index b4ad68c9..22657c86 100644 +--- a/ggml/src/ggml-cpu/simd-mappings.h ++++ b/ggml/src/ggml-cpu/simd-mappings.h +@@ -1070,7 +1070,7 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { + #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL + #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE + +-#elif defined(__VXE__) || defined(__VXE2__) ++#elif defined(__VXE__) || defined(__VXE2__) || defined(__MVS__) + + #define GGML_SIMD + diff --git a/patches/simd.inprogresspatch b/patches/simd.inprogresspatch deleted file mode 100644 index fb1c651..0000000 --- a/patches/simd.inprogresspatch +++ /dev/null @@ -1,251 +0,0 @@ 
-diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
-index 66a5ad8d..a67bce53 100644
---- a/ggml/src/ggml-cpu/CMakeLists.txt
-+++ b/ggml/src/ggml-cpu/CMakeLists.txt
-@@ -480,6 +480,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
-     else()
-         message(WARNING "Unknown CPU architecture. Falling back to generic implementations.")
-         list(APPEND ARCH_FLAGS -DGGML_CPU_GENERIC)
-+        #ifdef __MVS__
-+        list(APPEND ARCH_FLAGS -mzvector)
-+        message(STATUS "-mzvector enabled")
-+        #endif
-     endif()
-
-     if (GGML_CPU_REPACK)
-diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h
-index d839cf5c..9cc46cbf 100644
---- a/ggml/src/ggml-cpu/ggml-cpu-impl.h
-+++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h
-@@ -352,7 +352,11 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
- #endif
-
- #if defined(__VXE__) || defined(__VXE2__)
-+#ifdef __MVS__
-+#include
-+#elif
- #include
-+#endif
-
- #define vec_neg(a) (-(a)) // Vector Negate
- #define vec_add(a, b) ((a) + (b)) // Vector Add
-diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
-index 11ff228f..4a1e7f63 100644
---- a/ggml/src/ggml-cpu/ggml-cpu.c
-+++ b/ggml/src/ggml-cpu/ggml-cpu.c
-@@ -17,7 +17,7 @@
-
- #if defined(_MSC_VER) || defined(__MINGW32__)
- #include // using malloc.h with MSC/MINGW
--#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
-+#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) && !defined(__MVS__)
- #include
- #endif
-
-diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
-index c9daa4c3..63839d07 100644
---- a/ggml/src/ggml-cpu/ggml-cpu.cpp
-+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
-@@ -33,6 +33,38 @@
- #    include
- #endif
-
-+#define CVT_PTR 0x10
-+#define CVTRCEP_OFFSET 0x490
-+#define RCEAFC_OFFSET 0x088
-+
-+typedef unsigned data_area_ptr_assign_type;
-+
-+typedef union {
-+    struct {
-+#if defined(_LP64)
-+        data_area_ptr_assign_type lower;
-+#endif
-+        data_area_ptr_assign_type assign;
-+    };
-+    char* deref;
-+} data_area_ptr;
-+
-+uint64_t get_free_memory(void) {
-+    uint64_t freeram;
-+    data_area_ptr cvt = {0};
-+    data_area_ptr rcep = {0};
-+    cvt.assign = *(data_area_ptr_assign_type*)(CVT_PTR);
-+    rcep.assign = *(data_area_ptr_assign_type*)(cvt.deref + CVTRCEP_OFFSET);
-+    freeram = (uint64_t)*((uint32_t*)(rcep.deref + RCEAFC_OFFSET)) * 4096;
-+
-+    return freeram;
-+}
-+
-+uint64_t get_total_memory(void) {
-+    /* Use CVTRLSTG to get the size of actual real storage online at IPL in K.
*/
-+    return (uint64_t)((int)((char *__ptr32 *__ptr32 *)0)[4][214]) * 1024;
-+}
-+
- // ggml-backend interface
-
- std::vector& ggml_backend_cpu_get_extra_buffers_type() {
-@@ -43,7 +75,7 @@ std::vector& ggml_backend_cpu_get_extra_buffers_type
-     if (ggml_backend_amx_buffer_type()) {
-         bufts.push_back(ggml_backend_amx_buffer_type());
-     }
--#endif
-+#endif
-
- #ifdef GGML_USE_CPU_KLEIDIAI
-     if (ggml_backend_cpu_kleidiai_buffer_type()) {
-@@ -340,6 +372,9 @@ static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t *
-     GlobalMemoryStatusEx(&status);
-     *total = status.ullTotalPhys;
-     *free = status.ullAvailPhys;
-+#elifdef __MVS__
-+    *total = get_total_memory();
-+    *free = get_free_memory();
- #else
-     long pages = sysconf(_SC_PHYS_PAGES);
-     long page_size = sysconf(_SC_PAGE_SIZE);
-diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
-index ed61869a..1bb64b0f 100644
---- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
-+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
-@@ -194,7 +194,7 @@ inline float hsum(float16x8_t x) {
-
- #if defined(__VXE__) || defined(__VXE2__)
- inline float hsum(float32x4_t x) {
--    float32x4_t tmp = x + vec_reve(x);
-+    float32x4_t tmp = x + (float32x4_t)vec_reve((__vector int)x);
-     return tmp[0] + tmp[1];
- }
- #endif
-@@ -256,10 +256,12 @@ template <> inline float32x4_t load(const ggml_fp16_t * p) {
-         tmp[i] = GGML_CPU_FP16_TO_FP32(p[i]);
-     }
-
--    return vec_xl(0, (const float *)(tmp));
-+    //return vec_xl(0, (const float *)(tmp));
-+    return GGML_VEC_LOAD(tmp);
- }
- template <> inline float32x4_t load(const float * p) {
--    return vec_xl(0, p);
-+    //return vec_xl(0, p);
-+    return GGML_VEC_LOAD(p);
- }
- #endif
-
-diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.h b/ggml/src/ggml-cpu/llamafile/sgemm.h
-index 729e8853..5a35157b 100644
---- a/ggml/src/ggml-cpu/llamafile/sgemm.h
-+++ b/ggml/src/ggml-cpu/llamafile/sgemm.h
-@@ -3,7 +3,7 @@
- #include
-
- #if defined(__VXE__) || defined(__VXE2__)
--#include
-+//#include
- #endif
-
- #ifdef __cplusplus
-diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
-index 2f153419..3a459782 100644
---- a/ggml/src/ggml-cpu/ops.cpp
-+++ b/ggml/src/ggml-cpu/ops.cpp
-@@ -1,3 +1,9 @@
-+#ifdef GGML_MASS
-+#include
-+#define cosf(a) cosd2(a)
-+#define sinf(a) sind2(a)
-+#endif
-+
- #include "ops.h"
-
- #include "ggml-cpu.h"
-diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h
-index b4ad68c9..a25387d4 100644
---- a/ggml/src/ggml-cpu/simd-mappings.h
-+++ b/ggml/src/ggml-cpu/simd-mappings.h
-@@ -2,6 +2,18 @@
-
- #include "ggml-cpu-impl.h"
-
-+#if defined(__MVS__)
-+    #define GGML_VEC_LOAD(ptr) (vec_xl(0, (int32_t *)(ptr)))
-+    #define GGML_VEC_STORE(vec, ptr) vec_xst((__vector int)(vec), 0, (int32_t *)(ptr))
-+#else
-+    #define GGML_VEC_LOAD(ptr) vec_xl(0, (const float *)(ptr))
-+    #define GGML_VEC_STORE(vec, ptr) vec_xst((vec), 0, (float *)(ptr))
-+#endif
-+
-+#if defined(__lcbb)
-+# undef __lcbb
-+#endif
-+
- #ifdef __ARM_FEATURE_SVE
- #include
- #endif // __ARM_FEATURE_SVE
-@@ -1080,10 +1092,10 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
- #define GGML_F32_EPR 4
-
- #define GGML_F32x4 float32x4_t
--#define GGML_F32x4_ZERO vec_splats(0.0f)
-+#define GGML_F32x4_ZERO (float32x4_t)vec_splats((int32_t)0.0f)
- #define GGML_F32x4_SET1 vec_splats
--#define GGML_F32x4_LOAD(p) vec_xl(0, p)
--#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
-+#define GGML_F32x4_LOAD(p) GGML_VEC_LOAD(p)
-+#define GGML_F32x4_STORE(p, r) GGML_VEC_STORE(r, p)
-
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
- #define GGML_F32x4_ADD vec_add
- #define GGML_F32x4_MUL vec_mul
-@@ -1101,8 +1113,8 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
-         for (int i = 0; i < offset; ++i) { \
-             x[i] = vec_add(x[i], x[offset + i]); \
-         } \
--        float32x4_t tmp = x[0] + vec_reve(x[0]); \
--        res = tmp[0] + tmp[1]; \
-+        float32x4_t tmp = x[0] + (float32x4_t)vec_reve((__vector int)x[0]); \
-+        res = (ggml_float)tmp[0] + (ggml_float)tmp[1]; \
-     }
-
- #define GGML_F32_VEC GGML_F32x4
-@@ -1133,7 +1145,7 @@ static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
-
-     // note: keep type-cast here to prevent compiler bugs
-     // see: https://github.com/ggml-org/llama.cpp/issues/12846
--    return vec_xl(0, (const float *)(tmp));
-+    return GGML_VEC_LOAD(tmp);
- #endif
- }
-
-@@ -1152,7 +1164,7 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
-
-     // note: keep type-cast here to prevent compiler bugs
-     // see: https://github.com/ggml-org/llama.cpp/issues/12846
--    vec_xst(v_y, 0, (float *)(arr));
-+    GGML_VEC_STORE(v_y, arr);
-
-     for (int i = 0; i < 4; i++) {
-         x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
-diff --git a/ggml/src/ggml-cpu/unary-ops.cpp b/ggml/src/ggml-cpu/unary-ops.cpp
-index 4fce569b..9da79838 100644
---- a/ggml/src/ggml-cpu/unary-ops.cpp
-+++ b/ggml/src/ggml-cpu/unary-ops.cpp
-@@ -1,3 +1,7 @@
-+#ifdef GGML_MASS
-+#include
-+#endif
-+
- #include "unary-ops.h"
-
- static inline float op_abs(float x) {
-diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp
-index a8156011..7640e3b1 100644
---- a/ggml/src/ggml-cpu/vec.cpp
-+++ b/ggml/src/ggml-cpu/vec.cpp
-@@ -1,3 +1,7 @@
-+#ifdef GGML_MASS
-+#include
-+#endif
-+
- #include "vec.h"
-
- #include
diff --git a/patches/stb_image.h.patch b/patches/stb_image.h.patch
index ea445df..62bb590 100644
--- a/patches/stb_image.h.patch
+++ b/patches/stb_image.h.patch
@@ -1,11 +1,11 @@
 diff --git a/vendor/stb/stb_image.h b/vendor/stb/stb_image.h
-index 9eedabe..92d5251 100644
+index 9eedabed..92d52515 100644
 --- a/vendor/stb/stb_image.h
 +++ b/vendor/stb/stb_image.h
 @@ -620,6 +620,10 @@ STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch
  #define stbi_inline __forceinline
  #endif
-
+
 +#ifdef __MVS__
 +    #define STBI_NO_THREAD_LOCALS
 +#endif
@@ -16,7 +16,7 @@ index 9eedabe..92d5251 100644
 @@ -721,6 +725,10 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ?
1 : -1];
  #define STBI_NO_SIMD
  #endif
-
+
 +#if defined(__MVS__)
 +#include
 +#endif
@@ -27,7 +27,7 @@ index 9eedabe..92d5251 100644
 @@ -963,11 +971,33 @@ static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
  static int stbi__pnm_is16(stbi__context *s);
  #endif
-
+
 +#if defined(__MVS__)
 +static __tlssim stbi__g_failure_reason_tls("");
 +#define stbi__g_failure_reason (*stbi__g_failure_reason_tls.access())
@@ -55,6 +55,6 @@ index 9eedabe..92d5251 100644
  #endif
  const char *stbi__g_failure_reason;
 +#endif
-
+
  STBIDEF const char *stbi_failure_reason(void)
- {
\ No newline at end of file
+ {
diff --git a/patches/test-gguf.cpp.patch b/patches/test-gguf.cpp.patch
new file mode 100644
index 0000000..fb945f2
--- /dev/null
+++ b/patches/test-gguf.cpp.patch
@@ -0,0 +1,20 @@
+diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp
+index 3f0c312e..f7c66fb6 100644
+--- a/tests/test-gguf.cpp
++++ b/tests/test-gguf.cpp
+@@ -715,6 +715,7 @@ static std::pair test_handcrafted_file(const unsigned int seed) {
+     struct gguf_init_params gguf_params = {
+         /*no_alloc =*/ false,
+         /*ctx      =*/ hft >= offset_has_data ? &ctx : nullptr,
++        /*.allow_byteswapping = */ true,
+     };
+
+     struct gguf_context * gguf_ctx = gguf_init_from_file_impl(file, gguf_params);
+@@ -1111,6 +1112,7 @@ static std::pair test_roundtrip(ggml_backend_dev_t dev, const unsigned
+     struct gguf_init_params gguf_params = {
+         /*no_alloc =*/ false,
+         /*ctx      =*/ only_meta ? nullptr : &ctx_1,
++        /*.allow_byteswapping = */ true,
+     };
+     struct gguf_context * gguf_ctx_1 = gguf_init_from_file_impl(file, gguf_params);
+
diff --git a/patches/unary-ops.h.patch b/patches/unary-ops.h.patch
new file mode 100644
index 0000000..87a7b8e
--- /dev/null
+++ b/patches/unary-ops.h.patch
@@ -0,0 +1,12 @@
+diff --git a/ggml/src/ggml-cpu/unary-ops.h b/ggml/src/ggml-cpu/unary-ops.h
+index b1ade2c8..eb525682 100644
+--- a/ggml/src/ggml-cpu/unary-ops.h
++++ b/ggml/src/ggml-cpu/unary-ops.h
+@@ -1,5 +1,7 @@
+ #pragma once
+
++#include
++
+ #include "common.h"
+
+ #ifdef __cplusplus
diff --git a/patches/unicode.h.patch b/patches/unicode.h.patch
new file mode 100644
index 0000000..ef33778
--- /dev/null
+++ b/patches/unicode.h.patch
@@ -0,0 +1,65 @@
+diff --git a/src/unicode.h b/src/unicode.h
+index 0a5fa2a7..ca8a9011 100644
+--- a/src/unicode.h
++++ b/src/unicode.h
+@@ -15,6 +15,10 @@ struct unicode_cpt_flags {
+         SYMBOL = 0x0040, // regex: \p{S}
+         CONTROL = 0x0080, // regex: \p{C}
+         MASK_CATEGORIES = 0x00FF,
++        WHITESPACE = 0x0100,
++        LOWERCASE = 0x0200,
++        UPPERCASE = 0x0400,
++        NFD = 0x0800,
+     };
+
+     // codepoint type
+@@ -34,11 +38,49 @@ struct unicode_cpt_flags {
+
+     // decode from uint16
+     inline unicode_cpt_flags(const uint16_t flags = 0) {
++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+         *reinterpret_cast(this) = flags;
++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
++        is_undefined = (flags & UNDEFINED) ? 1 : 0;
++        is_number = (flags & NUMBER) ? 1 : 0;
++        is_letter = (flags & LETTER) ? 1 : 0;
++        is_separator = (flags & SEPARATOR) ? 1 : 0;
++        is_accent_mark = (flags & ACCENT_MARK) ? 1 : 0;
++        is_punctuation = (flags & PUNCTUATION) ? 1 : 0;
++        is_symbol = (flags & SYMBOL) ? 1 : 0;
++        is_control = (flags & CONTROL) ? 1 : 0;
++        is_whitespace = (flags & WHITESPACE) ? 1 : 0;
++        is_lowercase = (flags & LOWERCASE) ? 1 : 0;
++        is_uppercase = (flags & UPPERCASE) ? 1 : 0;
++        is_nfd = (flags & NFD) ?
1 : 0;
++#else // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
++#error Unexpected or undefined __BYTE_ORDER__
++#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+     }
+
+     inline uint16_t as_uint() const {
++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+         return *reinterpret_cast(this);
++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
++        uint16_t result =
++            is_undefined * UNDEFINED
++            + is_number * NUMBER
++            + is_letter * LETTER
++            + is_separator * SEPARATOR
++            + is_accent_mark * ACCENT_MARK
++            + is_punctuation * PUNCTUATION
++            + is_symbol * SYMBOL
++            + is_control * CONTROL
++            + is_whitespace * WHITESPACE
++            + is_lowercase * LOWERCASE
++            + is_uppercase * UPPERCASE
++            + is_nfd * NFD
++        ;
++
++        return result;
++#else // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
++#error Unexpected or undefined __BYTE_ORDER__
++#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+     }
+
+     inline uint16_t category_flag() const {
diff --git a/patches/vec.h.patch b/patches/vec.h.patch
new file mode 100644
index 0000000..b7b53f5
--- /dev/null
+++ b/patches/vec.h.patch
@@ -0,0 +1,10 @@
+diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h
+index c432c990..b144a518 100644
+--- a/ggml/src/ggml-cpu/vec.h
++++ b/ggml/src/ggml-cpu/vec.h
+@@ -1,4 +1,5 @@
+ // Vectorized functions for fundamental operations
++#include
+
+ #pragma once
+