diff --git a/CMakeLists.txt b/CMakeLists.txt
index d2becb04c6bb9..9e111c2247bb1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -193,6 +193,10 @@ if (LLAMA_BUILD_COMMON)
     add_subdirectory(common)
 endif()
 
+if(LLAMA_BUILD_EXAMPLES OR LLAMA_BUILD_TESTS)
+    add_subdirectory(common_test)
+endif()
+
 if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
     include(CTest)
     add_subdirectory(tests)
diff --git a/common/common.cpp b/common/common.cpp
index e4e71ad13fb59..c5d8cc0f8ff29 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -899,15 +899,7 @@ std::string fs_get_cache_file(const std::string & filename) {
 // Model utils
 //
 
-struct common_init_result common_init_from_params(common_params & params) {
-    common_init_result iparams;
-    auto mparams = common_model_params_to_llama(params);
-
-    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
-    if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
-        return iparams;
-    }
+struct common_init_result common_init_from_model_and_params(llama_model* model, common_init_result iparams, common_params & params) {
 
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
@@ -1068,6 +1060,19 @@ struct common_init_result common_init_from_params(common_params & params) {
     return iparams;
 }
 
+struct common_init_result common_init_from_params(common_params & params) {
+    common_init_result iparams;
+    auto mparams = common_model_params_to_llama(params);
+
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
+    if (model == NULL) {
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
+        return iparams;
+    }
+
+    return common_init_from_model_and_params(model, std::move(iparams), params);
+}
+
 std::string get_model_endpoint() {
     const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
     // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
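A minimal sketch (not part of the patch) of what the split above enables: a caller that has already obtained a llama_model by some other means can finish the usual common initialization itself. Only the common_* and llama_* names come from this patch and the existing headers; the helper name is illustrative.

    #include "common.h"
    #include "llama.h"

    #include <utility>

    // Finish common initialization (context and related state) from a model that
    // the caller loaded itself, e.g. from a memory buffer instead of a file path.
    static common_init_result init_from_preloaded_model(llama_model * model, common_params & params) {
        common_init_result iparams;  // empty result; the callee fills in the rest
        return common_init_from_model_and_params(model, std::move(iparams), params);
    }
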
diff --git a/common/common.h b/common/common.h index e08a59eae7543..e3c13ac9f866f 100644 --- a/common/common.h +++ b/common/common.h @@ -551,6 +551,8 @@ struct common_init_result { }; struct common_init_result common_init_from_params(common_params & params); +struct common_init_result common_init_from_model_and_params(llama_model * model, common_init_result iparams, + common_params & params); struct llama_model_params common_model_params_to_llama ( common_params & params); struct llama_context_params common_context_params_to_llama(const common_params & params); diff --git a/common_test/CMakeLists.txt b/common_test/CMakeLists.txt new file mode 100644 index 0000000000000..44903612e534b --- /dev/null +++ b/common_test/CMakeLists.txt @@ -0,0 +1,15 @@ +# common_test library for load_into_memory.h and uint8-buff-stream.h + +set(TARGET llama-common-test) + +add_library(${TARGET} INTERFACE) + +target_include_directories(${TARGET} INTERFACE + ${CMAKE_CURRENT_SOURCE_DIR} +) + +target_compile_definitions(${TARGET} INTERFACE LLAMA_COMMON_TEST_HEADERS) + +target_compile_features(${TARGET} INTERFACE cxx_std_17) + +target_link_libraries(${TARGET} INTERFACE common) diff --git a/common_test/load_into_memory.h b/common_test/load_into_memory.h new file mode 100644 index 0000000000000..0ffd9228baa2a --- /dev/null +++ b/common_test/load_into_memory.h @@ -0,0 +1,220 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// header-only utilities to showcase how to directly load a model from memory +#include "uint8-buff-stream-wrapper.h" + +namespace { +bool is_split_file(const char * const model_path) { + if (!model_path) { + fprintf(stderr, "No model file provided\n"); + exit(EXIT_FAILURE); + } + + std::string path(model_path); + return path.find("-of-") != std::string::npos; +} + +std::vector load_file_into_buffer(const char * const model_path) { + std::ifstream file_stream(model_path, std::ios::binary | std::ios::ate); + if (!file_stream) { + fprintf(stderr, "Failed to open file %s for reading into streambuf\n", model_path); + exit(EXIT_FAILURE); + } + + const size_t file_size = file_stream.tellg(); + file_stream.seekg(0, std::ios::beg); + + static_assert(sizeof(std::uint8_t) == sizeof(char), "uint8_t must be same size as char"); + std::vector buffer(file_size); + if (!file_stream.read((char *) buffer.data(), file_size)) { + fprintf(stderr, "Failed to read entire file into buffer\n"); + exit(EXIT_FAILURE); + } + + return buffer; +} + +std::unique_ptr> load_file_into_streambuf(const char * const model_path) { + return std::make_unique(load_file_into_buffer(model_path)); +} + +struct file_entry { + std::string path; + std::unique_ptr> streambuf; +}; + +std::vector load_files_into_streambuf(const char * const model_path) { + std::vector files; + + // Extract pattern from first file path + std::string path(model_path); + + // Split by '-' + std::vector parts; + std::stringstream ss(path); + std::string item; + while (std::getline(ss, item, '-')) { + parts.push_back(item); + } + + // Split the last part by '.' 
+ std::string last_part = parts.back(); + parts.pop_back(); + size_t dot_pos = last_part.find('.'); + if (dot_pos != std::string::npos) { + parts.push_back(last_part.substr(0, dot_pos)); + parts.push_back(last_part.substr(dot_pos + 1)); // extension + } else { + parts.push_back(last_part); + } + + // Check if we have enough parts + if (parts.size() < 4) { + fprintf(stderr, "Model path does not contain expected pattern\n"); + exit(EXIT_FAILURE); + } + + // Get total files from [-2] position (before the extension) + int total_files = std::stoi(parts[parts.size() - 2]); + + // Get base path by joining all parts except -start-of-end.gguf + std::string base_path; + for (size_t i = 0; i < parts.size() - 4; i++) { + if (i > 0) { + base_path += "-"; + } + base_path += parts[i]; + } + + for (int i = 1; i <= total_files; i++) { + char numbered_path[1024]; + snprintf(numbered_path, sizeof(numbered_path), "%s-%05d-of-%05d.gguf", base_path.c_str(), i, total_files); + + files.push_back({ numbered_path, load_file_into_streambuf(numbered_path) }); + } + + return files; +} + +file_entry load_tensor_list_file(const char * const model_path) { + std::string path(model_path); + + // Split by '-' + std::vector parts; + std::stringstream ss(path); + std::string item; + while (std::getline(ss, item, '-')) { + parts.push_back(item); + } + + // Split the last part by '.' + std::string last_part = parts.back(); + parts.pop_back(); + size_t dot_pos = last_part.find('.'); + if (dot_pos != std::string::npos) { + parts.push_back(last_part.substr(0, dot_pos)); + parts.push_back(last_part.substr(dot_pos + 1)); // extension + } else { + parts.push_back(last_part); + } + + // Check if we have enough parts + if (parts.size() < 4) { + fprintf(stderr, "Model path does not contain expected pattern\n"); + exit(EXIT_FAILURE); + } + + // Get base path by joining all parts except -start-of-end.gguf + std::string base_path; + for (size_t i = 0; i < parts.size() - 4; i++) { + if (i > 0) { + base_path += "-"; + } + base_path += parts[i]; + } + + // Construct tensor list file path + std::string tensor_list_path = base_path + ".tensors.txt"; + + printf("Loading tensor list file: %s\n", tensor_list_path.c_str()); + return { tensor_list_path, load_file_into_streambuf(tensor_list_path.c_str()) }; +} + +llama_model * load_model_from_memory_configuration(const char * model_path, llama_model_params & model_params) { + llama_model * model; + std::chrono::steady_clock::time_point load_start_time; + if (getenv("LLAMA_EXAMPLE_MEMORY_BUFFER")) { + std::vector buffer = load_file_into_buffer(model_path); + fprintf(stdout, "%s: loading model from memory buffer\n", __func__); + load_start_time = std::chrono::steady_clock::now(); + model = llama_model_load_from_buffer(std::move(buffer), model_params); + } else if (getenv("LLAMA_EXAMPLE_MEMORY_BUFFER_SPLIT")) { + file_entry tensor_list_file = load_tensor_list_file(model_path); + std::vector files = load_files_into_streambuf(model_path); + fprintf(stdout, "%s: loading model from %zu file streambufs\n", __func__, files.size()); + + std::vector file_paths; + for (const auto & file : files) { + printf("Found file %s with streambuf\n", file.path.c_str()); + file_paths.push_back(file.path.c_str()); + } + + load_start_time = std::chrono::steady_clock::now(); + const char * async_load_context = "test-model-load"; + std::thread fulfill_thread([&files, &tensor_list_file, &async_load_context]() { + const bool success = llama_model_load_fulfill_split_future( + tensor_list_file.path.c_str(), async_load_context, 
std::move(tensor_list_file.streambuf)); + printf("Fulfilling tensor list file %s: %s\n", tensor_list_file.path.c_str(), + success ? "success" : "failure"); + if (!success) { + exit(EXIT_FAILURE); + } + + for (auto & file : files) { + const bool success = llama_model_load_fulfill_split_future(file.path.c_str(), async_load_context, + std::move(file.streambuf)); + printf("Fulfilling file %s with streambuf: %s\n", file.path.c_str(), success ? "success" : "failure"); + if (!success) { + exit(EXIT_FAILURE); + } + } + }); + fprintf(stderr, "Loading model from splits\n"); + model = llama_model_load_from_split_futures(file_paths.data(), file_paths.size(), async_load_context, + tensor_list_file.path.c_str(), model_params); + fulfill_thread.join(); + } else if (getenv("LLAMA_EXAMPLE_FROM_FILE")) { + load_start_time = std::chrono::steady_clock::now(); + model = llama_model_load_from_file(model_path, model_params); + } else { + return nullptr; + } + + if (model == NULL) { + fprintf(stderr, "%s: error: unable to load model\n", __func__); + exit(1); + } + std::chrono::steady_clock::time_point load_end_time = std::chrono::steady_clock::now(); + std::chrono::duration load_duration = load_end_time - load_start_time; + fprintf(stdout, "%s: loading model took %f seconds\n", __func__, load_duration.count()); + return model; +} + +bool memory_configuration_env_is_set() { + return getenv("LLAMA_EXAMPLE_MEMORY_BUFFER") || getenv("LLAMA_EXAMPLE_MEMORY_BUFFER_SPLIT") || + getenv("LLAMA_EXAMPLE_FROM_FILE"); +} +} // namespace diff --git a/common_test/uint8-buff-stream-wrapper.h b/common_test/uint8-buff-stream-wrapper.h new file mode 100644 index 0000000000000..3a03721b98c07 --- /dev/null +++ b/common_test/uint8-buff-stream-wrapper.h @@ -0,0 +1,5 @@ +#pragma once + +// Wrapper to include the specific header from src +#include "uint8-buff-stream.h" + diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt index 809040307d2c9..be3d7b17e1578 100644 --- a/examples/embedding/CMakeLists.txt +++ b/examples/embedding/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-embedding) add_executable(${TARGET} embedding.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama llama-common-test ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 0ec2999a0c8e9..63fb27fe98a46 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -1,15 +1,25 @@ +#include +#include +#include +#include +#include +#include +#include +#include + #include "arg.h" #include "common.h" +#include "llama-cpp.h" #include "log.h" -#include "llama.h" - -#include -#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif +#ifdef LLAMA_COMMON_TEST_HEADERS +#include "load_into_memory.h" +#endif + static std::vector split_lines(const std::string & s, const std::string & separator = "\n") { std::vector lines; size_t start = 0; @@ -94,7 +104,20 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the model - common_init_result llama_init = common_init_from_params(params); + common_init_result llama_init; + +#ifdef LLAMA_COMMON_TEST_HEADERS + if (memory_configuration_env_is_set()) { + llama_model_params mparams = common_model_params_to_llama(params); + common_init_result iparams; + llama_model * model = 
load_model_from_memory_configuration(params.model.path.c_str(), mparams); + llama_init = common_init_from_model_and_params(model, std::move(iparams), params); + } else { + llama_init = common_init_from_params(params); + } +#else + llama_init = common_init_from_params(params); +#endif llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt index 104ecabfd7236..5ada3fdd3de6a 100644 --- a/examples/simple/CMakeLists.txt +++ b/examples/simple/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-simple) add_executable(${TARGET} simple.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama llama-common-test ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 633b87e58406e..f35a34eede829 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -1,15 +1,20 @@ +#include "llama-cpp.h" #include "llama.h" #include #include #include -#include static void print_usage(int, char ** argv) { printf("\nexample usage:\n"); printf("\n %s -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [prompt]\n", argv[0]); + printf("\n Optional environment variables: LLAMA_EXAMPLE_MEMORY_BUFFER LLAMA_EXAMPLE_MEMORY_BUFFER_SPLIT"); printf("\n"); } +#ifdef LLAMA_COMMON_TEST_HEADERS +#include "load_into_memory.h" +#endif + int main(int argc, char ** argv) { // path to the model gguf file std::string model_path; @@ -83,12 +88,13 @@ int main(int argc, char ** argv) { llama_model_params model_params = llama_model_default_params(); model_params.n_gpu_layers = ngl; +#ifdef LLAMA_COMMON_TEST_HEADERS + llama_model * model = memory_configuration_env_is_set() ? 
+ load_model_from_memory_configuration(model_path.c_str(), model_params) : + llama_model_load_from_file(model_path.c_str(), model_params); +#else llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params); - - if (model == NULL) { - fprintf(stderr , "%s: error: unable to load model\n" , __func__); - return 1; - } +#endif const llama_vocab * vocab = llama_model_get_vocab(model); // tokenize the prompt diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h index 79ee202062b01..1cf020b2a64e7 100644 --- a/ggml/include/gguf.h +++ b/ggml/include/gguf.h @@ -78,7 +78,6 @@ extern "C" { GGML_API struct gguf_context * gguf_init_empty(void); GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); - //GGML_API struct gguf_context * gguf_init_from_buffer(..); GGML_API void gguf_free(struct gguf_context * ctx); @@ -200,3 +199,8 @@ extern "C" { #ifdef __cplusplus } #endif + +#ifdef __cplusplus +#include +GGML_API struct gguf_context * gguf_init_from_buffer(std::basic_streambuf& streambuf, struct gguf_init_params params); +#endif diff --git a/ggml/include/uint8-buff-stream.h b/ggml/include/uint8-buff-stream.h new file mode 100644 index 0000000000000..6d29d20dd52f4 --- /dev/null +++ b/ggml/include/uint8-buff-stream.h @@ -0,0 +1,200 @@ +#pragma once + +#include +#include +#include +#include +#include + +#ifdef __APPLE__ +# include + +/// @brief Custom ctype specialization for uint8_t to work around libc++ +/// limitation in macOS +template <> struct std::ctype : public std::ctype_base { + using char_type = uint8_t; + static std::locale::id id; + + ctype() : std::ctype_base() {} + + ctype([[maybe_unused]] const std::locale::facet & other) : std::ctype_base() {} + + ctype & operator=(const ctype & other) { + if (this != &other) { + std::ctype_base::operator=(other); + } + return *this; + } + + // Required public interface methods + bool is(mask m, [[maybe_unused]] char_type c) const { + return (m & space) != 0; // Treat all uint8_t as non-space + } + + const char_type * is(const char_type * low, const char_type * high, mask * vec) const { + for (; low != high; ++low, ++vec) { + *vec = 0; // No special character properties + } + return high; + } + + const char_type * scan_is(mask m, const char_type * low, const char_type * high) const { + for (; low != high; ++low) { + if (is(m, *low)) { + return low; + } + } + return high; + } + + const char_type * scan_not(mask m, const char_type * low, const char_type * high) const { + for (; low != high; ++low) { + if (!is(m, *low)) { + return low; + } + } + return high; + } + + char_type toupper(char_type c) const { + return c; // No case conversion for uint8_t + } + + const char_type * toupper([[maybe_unused]] char_type * low, const char_type * high) const { + return high; // No case conversion for uint8_t + } + + char_type tolower(char_type c) const { + return c; // No case conversion for uint8_t + } + + const char_type * tolower([[maybe_unused]] char_type * low, const char_type * high) const { + return high; // No case conversion for uint8_t + } + + char_type widen(char c) const { return static_cast(c); } + + const char * widen(const char * low, const char * high, char_type * dest) const { + for (; low != high; ++low, ++dest) { + *dest = static_cast(*low); + } + return high; + } + + char narrow(char_type c, [[maybe_unused]] char dfault) const { return static_cast(c); } + + const char_type * narrow(const char_type * low, const char_type * high, [[maybe_unused]] char dfault, + char * dest) const { + 
for (; low != high; ++low, ++dest) { + *dest = static_cast(*low); + } + return high; + } +}; +#endif + +/// @brief Custom traits for uint8_t for usage in std template classes that use char_traits (e.g. std::basic_streambuf) +template <> struct std::char_traits { + using char_type = uint8_t; + using int_type = int; + using off_type = std::streamoff; + using pos_type = std::streampos; + using state_type = std::mbstate_t; + + static void assign(char_type & c1, const char_type & c2) noexcept { c1 = c2; } + + static constexpr bool eq(char_type a, char_type b) noexcept { return a == b; } + + static constexpr bool lt(char_type a, char_type b) noexcept { return a < b; } + + static int compare(const char_type * s1, const char_type * s2, std::size_t n) { + for (std::size_t i = 0; i < n; ++i) { + if (lt(s1[i], s2[i])) { + return -1; + } + if (lt(s2[i], s1[i])) { + return 1; + } + } + return 0; + } + + static std::size_t length(const char_type * s) { + std::size_t i = 0; + while (!eq(s[i], char_type())) { + ++i; + } + return i; + } + + static const char_type * find(const char_type * s, std::size_t n, const char_type & c) { + for (std::size_t i = 0; i < n; ++i) { + if (eq(s[i], c)) { + return s + i; + } + } + return nullptr; + } + + static char_type * move(char_type * s1, const char_type * s2, std::size_t n) { + return static_cast(std::memmove(s1, s2, n)); + } + + static char_type * copy(char_type * s1, const char_type * s2, std::size_t n) { + return static_cast(std::memcpy(s1, s2, n)); + } + + static char_type * assign(char_type * s, std::size_t n, char_type c) { + for (std::size_t i = 0; i < n; ++i) { + s[i] = c; + } + return s; + } + + static constexpr int_type not_eof(int_type c) noexcept { return eq_int_type(c, eof()) ? 0 : c; } + + static constexpr char_type to_char_type(int_type c) noexcept { + return c >= 0 && c <= 255 ? static_cast(c) : char_type(); + } + + static constexpr int_type to_int_type(char_type c) noexcept { return static_cast(c); } + + static constexpr bool eq_int_type(int_type c1, int_type c2) noexcept { return c1 == c2; } + + static constexpr int_type eof() noexcept { return static_cast(-1); } +}; + +#ifdef GGML_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef GGML_BUILD +# define GGML_CLASS_API __declspec(dllexport) +# else +# define GGML_CLASS_API __declspec(dllimport) +# endif +# else +# define GGML_CLASS_API __attribute__((visibility("default"))) +# endif +#else +# define GGML_CLASS_API +#endif + +/// @brief Custom streambuf for uint8_t +class GGML_CLASS_API Uint8BufferStreamBuf : public std::basic_streambuf { + public: + Uint8BufferStreamBuf(std::vector && _data); + + protected: + int_type underflow() override; + + /// @brief Efficient bulk reading. The standard implementation specifies that this function can be overridden + /// to provide a more efficient implementation: sgetn will call this function if it is overridden. 
+ std::streamsize xsgetn(char_type * s, std::streamsize n) override; + + pos_type seekoff(off_type off, std::ios_base::seekdir dir, + std::ios_base::openmode which = std::ios_base::in) override; + + pos_type seekpos(pos_type pos, std::ios_base::openmode which = std::ios_base::in) override; + + private: + std::vector data; +}; diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 9cb2c228dcfb2..d0eb33eca851b 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -194,6 +194,7 @@ add_library(ggml-base ../include/ggml-cpp.h ../include/ggml-opt.h ../include/gguf.h + ../include/uint8-buff-stream.h ggml.c ggml.cpp ggml-alloc.c @@ -203,7 +204,8 @@ add_library(ggml-base ggml-threading.h ggml-quants.c ggml-quants.h - gguf.cpp) + gguf.cpp + uint8-buff-stream.cpp) target_include_directories(ggml-base PRIVATE .) if (GGML_BACKEND_DL) diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index a0a318a29f5b9..957c56153a9f5 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -2,6 +2,7 @@ #include "ggml-backend.h" #include "ggml-impl.h" #include "gguf.h" +#include "uint8-buff-stream.h" #include #include @@ -216,14 +217,79 @@ struct gguf_context { void * data = nullptr; }; -struct gguf_reader { +struct gguf_bytes_reader { + /// @brief Reads up to `count` objects into the array `buffer`. + /// The position of the underlying stream implementation is advanced + /// by the number of characters read. + /// + /// @note If an error occurs, the resulting value of the underlying stream + /// position indicator is indeterminate. + virtual size_t read(void * buffer, size_t size, size_t count) = 0; + + /// @brief Seeks to a position aligned to the given alignment boundary. + /// @return The current position after alignment, or 0 on error. + virtual size_t align(size_t alignment) = 0; + + virtual ~gguf_bytes_reader() = 0; +}; + +gguf_bytes_reader::~gguf_bytes_reader() {} + +struct gguf_bytes_buffer_reader : public gguf_bytes_reader { + gguf_bytes_buffer_reader(std::basic_streambuf & streambuf) : streambuf(streambuf), offset(0) {} + + ~gguf_bytes_buffer_reader() {} + + size_t read(void * buffer, size_t size, size_t count) override { + size_t total_size = size * count; + auto bytes_read = streambuf.sgetn(static_cast(buffer), total_size); + offset += bytes_read; + return bytes_read; + } + + size_t align(size_t alignment) override { + size_t new_offset = GGML_PAD(offset, alignment); + size_t seek_offset = new_offset - offset; + + auto result = streambuf.pubseekoff(seek_offset, std::ios_base::cur); + if (result == std::streampos(-1)) { + return 0; + } + offset = new_offset; + return offset; + } + + private: + std::basic_streambuf & streambuf; + size_t offset; +}; + +struct gguf_bytes_file_reader : public gguf_bytes_reader { + gguf_bytes_file_reader(FILE * file) : file(file) {} + + ~gguf_bytes_file_reader() {} + + size_t read(void * buffer, size_t size, size_t count) override { return fread(buffer, 1, size * count, file); } + + size_t align(size_t alignment) override { + if (fseek(file, GGML_PAD(ftell(file), alignment), SEEK_SET) != 0) { + return 0; + } + return ftell(file); + } + + private: FILE * file; +}; - gguf_reader(FILE * file) : file(file) {} +struct gguf_reader { + gguf_bytes_reader& bytes_reader; + + gguf_reader(gguf_bytes_reader& bytes_reader) : bytes_reader(bytes_reader) {} template bool read(T & dst) const { - return fread(&dst, 1, sizeof(dst), file) == sizeof(dst); + return bytes_reader.read(&dst, 1, sizeof(dst)) == sizeof(dst); } template @@ -278,11 +344,11 @@ struct gguf_reader 
{ return false; } dst.resize(size); - return fread(dst.data(), 1, dst.length(), file) == dst.length(); + return bytes_reader.read(dst.data(), 1, dst.length()) == dst.length(); } bool read(void * dst, const size_t size) const { - return fread(dst, 1, size, file) == size; + return bytes_reader.read(dst, 1, size) == size; } }; @@ -316,8 +382,8 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vectorinfo.size()) == n_tensors); // we require the data section to be aligned, so take into account any padding - if (fseek(file, GGML_PAD(ftell(file), ctx->alignment), SEEK_SET) != 0) { - GGML_LOG_ERROR("%s: failed to seek to beginning of data section\n", __func__); + // store the current file offset - this is where the data section starts + ctx->offset = gr.bytes_reader.align(ctx->alignment); + if (ctx->offset == 0) { + GGML_LOG_ERROR("%s: failed to align data section\n", __func__); gguf_free(ctx); return nullptr; } - // store the current file offset - this is where the data section starts - ctx->offset = ftell(file); - // compute the total size of the data section, taking into account the alignment { ctx->size = 0; @@ -718,6 +783,13 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par return ctx; } +} + +struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) { + gguf_bytes_file_reader bytes_reader(file); + gguf_reader reader(bytes_reader); + return gguf_init_from_reader_impl(reader, params); +} struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) { FILE * file = ggml_fopen(fname, "rb"); @@ -732,6 +804,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p return result; } +struct gguf_context * gguf_init_from_buffer(std::basic_streambuf & streambuf, struct gguf_init_params params) { + gguf_bytes_buffer_reader bytes_reader(streambuf); + gguf_reader reader(bytes_reader); + return gguf_init_from_reader_impl(reader, params); +} + void gguf_free(struct gguf_context * ctx) { if (ctx == nullptr) { return; diff --git a/ggml/src/uint8-buff-stream.cpp b/ggml/src/uint8-buff-stream.cpp new file mode 100644 index 0000000000000..14e8dbc20eac3 --- /dev/null +++ b/ggml/src/uint8-buff-stream.cpp @@ -0,0 +1,59 @@ +#include "uint8-buff-stream.h" + +#ifdef __APPLE__ +std::locale::id std::ctype::id; +#endif + +Uint8BufferStreamBuf::Uint8BufferStreamBuf(std::vector && _data) : data(std::move(_data)) { + setg(const_cast(data.data()), const_cast(data.data()), + const_cast(data.data()) + data.size()); +} + +Uint8BufferStreamBuf::int_type Uint8BufferStreamBuf::underflow() { + if (gptr() < egptr()) { + return traits_type::to_int_type(*gptr()); + } + return traits_type::eof(); +} + +std::streamsize Uint8BufferStreamBuf::xsgetn(char_type * s, std::streamsize n) { + std::streamsize available = egptr() - gptr(); + std::streamsize to_read = std::min(n, available); + if (to_read > 0) { + std::memcpy(s, gptr(), to_read); + setg(eback(), gptr() + to_read, egptr()); + } + return to_read; +} + +Uint8BufferStreamBuf::pos_type Uint8BufferStreamBuf::seekoff(off_type off, std::ios_base::seekdir dir, + std::ios_base::openmode which) { + if (!(which & std::ios_base::in)) { + return pos_type(off_type(-1)); + } + char_type * new_pos = nullptr; + if (dir == std::ios_base::beg) { + new_pos = eback() + off; + } else if (dir == std::ios_base::cur) { + new_pos = gptr() + off; + } else if (dir == std::ios_base::end) { + new_pos = egptr() + off; + } + if (new_pos >= eback() && new_pos <= 
egptr()) { + setg(eback(), new_pos, egptr()); + return new_pos - eback(); + } + return pos_type(off_type(-1)); +} + +Uint8BufferStreamBuf::pos_type Uint8BufferStreamBuf::seekpos(pos_type pos, std::ios_base::openmode which) { + if (!(which & std::ios_base::in)) { + return pos_type(off_type(-1)); + } + char_type * new_pos = eback() + pos; + if (new_pos >= eback() && new_pos <= egptr()) { + setg(eback(), new_pos, egptr()); + return pos; + } + return pos_type(off_type(-1)); +} diff --git a/include/llama-cpp.h b/include/llama-cpp.h index 8f6368177de09..18fb3ac0e1862 100644 --- a/include/llama-cpp.h +++ b/include/llama-cpp.h @@ -5,6 +5,7 @@ #endif #include +#include #include "llama.h" @@ -28,3 +29,8 @@ typedef std::unique_ptr llama_model_ptr; typedef std::unique_ptr llama_context_ptr; typedef std::unique_ptr llama_sampler_ptr; typedef std::unique_ptr llama_adapter_lora_ptr; + +LLAMA_API struct llama_model * llama_model_load_from_buffer(std::vector && data, + struct llama_model_params params); +LLAMA_API bool llama_model_load_fulfill_split_future(const char * path, const char * context, + std::unique_ptr> && streambuf); diff --git a/include/llama.h b/include/llama.h index 3eda9bc68608c..89edb619895b8 100644 --- a/include/llama.h +++ b/include/llama.h @@ -456,6 +456,11 @@ extern "C" { size_t n_paths, struct llama_model_params params); + LLAMA_API struct llama_model * llama_model_load_from_split_futures(const char ** paths, size_t n_paths, + const char * context, + const char * tensor_list_file, + struct llama_model_params params); + LLAMA_API void llama_model_save_to_file( const struct llama_model * model, const char * path_model); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8f9cd652447ab..6cbd4ac07da96 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -26,6 +26,8 @@ add_library(llama llama-memory-hybrid.cpp llama-memory-recurrent.cpp llama-mmap.cpp + llama-model-load-input.cpp + llama-model-load.cpp llama-model-loader.cpp llama-model-saver.cpp llama-model.cpp diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index 8d94034aed95d..96e4827732295 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -347,7 +347,7 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_ // set tensor data { - llama_file gguf_file(path_lora, "rb"); + llama_file_disk gguf_file(path_lora, "rb"); std::vector read_buf; auto set_tensor = [&](ggml_tensor * orig, ggml_tensor * dev) { size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name)); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 06e93b19cbf40..dee009b2c6882 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1614,7 +1614,7 @@ size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * sr } bool llama_context::state_load_file(const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - llama_file file(filepath, "rb"); + llama_file_disk file(filepath, "rb"); // sanity checks { @@ -1657,7 +1657,7 @@ bool llama_context::state_load_file(const char * filepath, llama_token * tokens_ } bool llama_context::state_save_file(const char * filepath, const llama_token * tokens, size_t n_token_count) { - llama_file file(filepath, "wb"); + llama_file_disk file(filepath, "wb"); file.write_u32(LLAMA_SESSION_MAGIC); file.write_u32(LLAMA_SESSION_VERSION); @@ -1674,7 +1674,7 @@ bool llama_context::state_save_file(const char * filepath, const 
llama_token * t } size_t llama_context::state_seq_load_file(llama_seq_id seq_id, const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - llama_file file(filepath, "rb"); + llama_file_disk file(filepath, "rb"); // version checks { @@ -1717,7 +1717,7 @@ size_t llama_context::state_seq_load_file(llama_seq_id seq_id, const char * file } size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * filepath, const llama_token * tokens, size_t n_token_count) { - llama_file file(filepath, "wb"); + llama_file_disk file(filepath, "wb"); file.write_u32(LLAMA_STATE_SEQ_MAGIC); file.write_u32(LLAMA_STATE_SEQ_VERSION); diff --git a/src/llama-impl.h b/src/llama-impl.h index 02b1d07f8400d..0a56d83846577 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -30,6 +30,13 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void * #define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) #define LLAMA_LOG_CONT(...) llama_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__) +// Debug-only logging macro that's only enabled in debug builds at compile time +#ifndef NDEBUG +#define LLAMA_LOG_CMAKE_DEBUG(...) LLAMA_LOG_DEBUG(__VA_ARGS__) +#else +#define LLAMA_LOG_CMAKE_DEBUG(...) +#endif + // // helpers // diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 47497cf953fd3..dbe6ad1f86a04 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -1,6 +1,7 @@ #include "llama-mmap.h" #include "llama-impl.h" +#include "uint8-buff-stream.h" #include "ggml.h" @@ -9,6 +10,7 @@ #include #include #include +#include #ifdef __has_include #if __has_include() @@ -54,9 +56,7 @@ static std::string llama_format_win_err(DWORD err) { } #endif -// llama_file - -struct llama_file::impl { +struct llama_file_disk::impl { #if defined(_WIN32) HANDLE fp_win32; std::string GetErrorMessageWin32(DWORD error_code) const { @@ -241,13 +241,13 @@ struct llama_file::impl { size_t size; }; -llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique(fname, mode)) {} -llama_file::~llama_file() = default; +llama_file_disk::llama_file_disk(const char * fname, const char * mode) : pimpl(std::make_unique(fname, mode)) {} +llama_file_disk::~llama_file_disk() = default; -size_t llama_file::tell() const { return pimpl->tell(); } -size_t llama_file::size() const { return pimpl->size; } +size_t llama_file_disk::tell() const { return pimpl->tell(); } +size_t llama_file_disk::size() const { return pimpl->size; } -int llama_file::file_id() const { +int llama_file_disk::file_id() const { #ifdef _WIN32 return _fileno(pimpl->fp); #else @@ -259,13 +259,193 @@ int llama_file::file_id() const { #endif } -void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); } -void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); } +void llama_file_disk::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); } +void llama_file_disk::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); } + +uint32_t llama_file_disk::read_u32() const { return pimpl->read_u32(); } + +void llama_file_disk::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); } +void llama_file_disk::write_u32(uint32_t val) const { pimpl->write_u32(val); } + +template +llama_file_buffer::llama_file_buffer(std::unique_ptr> && streambuf) : + streambuf(std::move(streambuf)) {} + +template llama_file_buffer::~llama_file_buffer() = default; + +template size_t 
llama_file_buffer::tell() const { + return streambuf->pubseekoff(0, std::ios_base::cur); +} + +template size_t llama_file_buffer::size() const { + auto current_pos = streambuf->pubseekoff(0, std::ios_base::cur); + auto end_pos = streambuf->pubseekoff(0, std::ios_base::end); + streambuf->pubseekpos(current_pos); + return end_pos; +} + +template int llama_file_buffer::file_id() const { + return -1; +} + +template void llama_file_buffer::seek(size_t offset, int whence) const { + static std::map whence_to_dir = { + { SEEK_SET, std::ios_base::beg }, + { SEEK_CUR, std::ios_base::cur }, + { SEEK_END, std::ios_base::end } + }; + auto result = streambuf->pubseekoff(offset, whence_to_dir.at(whence)); + if (result == std::streampos(-1)) { + throw std::runtime_error("seek failed"); + } +} + +template void llama_file_buffer::read_raw(void * ptr, size_t len) const { + auto bytes_read = streambuf->sgetn(static_cast(ptr), len); + if (bytes_read != static_cast(len)) { + throw std::runtime_error("read beyond end of buffer"); + } +} + +template uint32_t llama_file_buffer::read_u32() const { + uint32_t val; + read_raw(&val, sizeof(val)); + return val; +} + +template <> void llama_file_buffer::write_raw([[maybe_unused]] const void * ptr, size_t len) const { + if (len > 0) { + throw std::runtime_error("buffer is not writable"); + } +} + +template <> void llama_file_buffer::write_u32(uint32_t val) const { + if (val > 0) { + // Cannot directly set [[noreturn]] for a function since it was defined without it. + throw std::runtime_error("buffer is not writable"); + } +} + +template <> void llama_file_buffer::write_raw(const void * ptr, size_t len) const { + auto bytes_written = streambuf->sputn(static_cast(ptr), len); + if (bytes_written != static_cast(len)) { + throw std::runtime_error("write beyond end of buffer"); + } +} -uint32_t llama_file::read_u32() const { return pimpl->read_u32(); } +template <> void llama_file_buffer::write_u32(uint32_t val) const { + write_raw(&val, sizeof(val)); +} + +// Explicit instantiations +template struct llama_file_buffer; +template struct llama_file_buffer; + +// llama_future_file_buffer implementation + +namespace { +std::string final_key(const std::string & promise_key, const std::string & context) { + return promise_key + ":" + context; +} + +std::mutex promise_registry_mutex; + +std::map>>> promise_registry_ro; +std::map>>> promise_registry_rw; + +template +std::map>>> & promise_registry() { + if constexpr (Writable) { + return promise_registry_rw; + } else { + return promise_registry_ro; + } +} + +/// @brief Ensures a promise exists in the registry for the given key. +/// If it doesn't exist, creates it. Returns an iterator to the promise. +/// Thread-safe. 
+template +typename std::map>>>::iterator +ensure_promise_registry(const std::string & key) { + std::lock_guard lock(promise_registry_mutex); + auto it = promise_registry().find(key); + if (it != promise_registry().end()) { + return it; + } + auto result = + promise_registry().emplace(key, std::promise>>()); + LLAMA_LOG_CMAKE_DEBUG("%s: created future file buffer %p for %s\n", __func__, (void *) &(*it), key.c_str()); + return result.first; +} +} // namespace + +template +llama_future_file_buffer::llama_future_file_buffer(const std::string & promise_key, + const std::string & context) : + file_buffer_future(), + file_buffer() { + std::string key = final_key(promise_key, context); + file_buffer_promise_iterator = ensure_promise_registry(key); + file_buffer_future = file_buffer_promise_iterator->second.get_future(); +} + +template +llama_future_file_buffer::llama_future_file_buffer(llama_future_file_buffer && other) noexcept : + file_buffer_promise_iterator(std::move(other.file_buffer_promise_iterator)), + file_buffer_future(std::move(other.file_buffer_future)), + file_buffer(std::move(other.file_buffer)) { + // Set the other object's iterator to end() to mark it as moved from + // to avoid early erasure at destruction of the moved other object + other.file_buffer_promise_iterator = promise_registry().end(); +} + +template +llama_future_file_buffer & llama_future_file_buffer::operator=( + llama_future_file_buffer && other) noexcept { + if (this != &other) { + file_buffer_promise_iterator = std::move(other.file_buffer_promise_iterator); + file_buffer_future = std::move(other.file_buffer_future); + file_buffer = std::move(other.file_buffer); + other.file_buffer_promise_iterator = promise_registry().end(); + } + return *this; +} + +template llama_future_file_buffer::~llama_future_file_buffer() { + std::lock_guard lock(promise_registry_mutex); + if (file_buffer_promise_iterator != promise_registry().end()) { + promise_registry().erase(file_buffer_promise_iterator); + } +} + +template +bool llama_future_file_buffer::fulfill_promise(const std::string & promise_key, const std::string & context, + std::unique_ptr> && value) { + std::string key = final_key(promise_key, context); + auto it = ensure_promise_registry(key); + if (it != promise_registry().end()) { + LLAMA_LOG_CMAKE_DEBUG("fulfilling future file buffer %p for %s\n", (void *) &(*it), key.c_str()); + it->second.set_value(std::move(value)); + return true; + } + return false; +} + +template +std::unique_ptr> llama_future_file_buffer::extract() const { + if (file_buffer) { + return std::move(file_buffer); + } + + auto future_result = file_buffer_future.get(); + file_buffer = std::move(future_result); + return std::move(file_buffer); +} -void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); } -void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); } +// Explicit instantiations for llama_future_file_buffer +template struct llama_future_file_buffer; +template struct llama_future_file_buffer; // llama_mmap diff --git a/src/llama-mmap.h b/src/llama-mmap.h index 4e5aec3f440d7..9e71eba7ce195 100644 --- a/src/llama-mmap.h +++ b/src/llama-mmap.h @@ -3,6 +3,10 @@ #include #include #include +#include "uint8-buff-stream.h" +#include +#include +#include struct llama_file; struct llama_mmap; @@ -13,27 +17,105 @@ using llama_mmaps = std::vector>; using llama_mlocks = std::vector>; struct llama_file { - llama_file(const char * fname, const char * mode); - ~llama_file(); + virtual ~llama_file() = default; - 
size_t tell() const; - size_t size() const; + virtual size_t tell() const = 0; + virtual size_t size() const = 0; + virtual int file_id() const = 0; + + virtual void seek(size_t offset, int whence) const = 0; - int file_id() const; // fileno overload + virtual void read_raw(void * ptr, size_t len) const = 0; + virtual uint32_t read_u32() const = 0; + + virtual void write_raw(const void * ptr, size_t len) const = 0; + virtual void write_u32(uint32_t val) const = 0; +}; - void seek(size_t offset, int whence) const; +struct llama_file_disk : public llama_file { + llama_file_disk(const char * fname, const char * mode); + ~llama_file_disk() override; - void read_raw(void * ptr, size_t len) const; - uint32_t read_u32() const; + size_t tell() const override; + size_t size() const override; + int file_id() const override; - void write_raw(const void * ptr, size_t len) const; - void write_u32(uint32_t val) const; + void seek(size_t offset, int whence) const override; + + void read_raw(void * ptr, size_t len) const override; + uint32_t read_u32() const override; + + void write_raw(const void * ptr, size_t len) const override; + void write_u32(uint32_t val) const override; private: struct impl; std::unique_ptr pimpl; }; +template struct llama_file_buffer : public llama_file { + llama_file_buffer(std::unique_ptr> && streambuf); + + ~llama_file_buffer() override; + + size_t tell() const override; + size_t size() const override; + + /// @return -1 to indicate this is not a real file descriptor + int file_id() const override; + + void seek(size_t offset, int whence) const override; + + void read_raw(void * ptr, size_t len) const override; + uint32_t read_u32() const override; + + /// @throw std::runtime_error if the buffer is read-only + void write_raw(const void * ptr, size_t len) const override; + + /// @throw std::runtime_error if the buffer is read-only + void write_u32(uint32_t val) const override; + + std::unique_ptr> streambuf; +}; + +template struct llama_future_file_buffer { + /// @brief A file buffer object whose operations will block + /// until the given promise key is set with a file buffer. + /// @param promise_key The key to use for the promise (e.g. a file path). + /// @param context The context to use for the promise, used to distinguish same promise key (e.g. for a same file opened twice). + llama_future_file_buffer(const std::string & promise_key, const std::string & context); + + // Delete copy constructor and copy assignment operator + llama_future_file_buffer(const llama_future_file_buffer &) = delete; + llama_future_file_buffer & operator=(const llama_future_file_buffer &) = delete; + + llama_future_file_buffer(llama_future_file_buffer && other) noexcept; + llama_future_file_buffer & operator=(llama_future_file_buffer && other) noexcept; + + ~llama_future_file_buffer(); + + /// @brief Sets the given key and context with a file buffer so that + /// operations can resume/start. + static bool fulfill_promise(const std::string & promise_key, const std::string & context, + std::unique_ptr> && value); + + /// @brief Waits for future buffer or obtains current if already + /// fulfilled and moves the future contents outside the registry. 
+ std::unique_ptr> extract() const; + + private: + typename std::map>>>::iterator + file_buffer_promise_iterator; + mutable std::future>> file_buffer_future; + mutable std::unique_ptr> file_buffer; +}; + +// Type aliases for convenience +using llama_file_buffer_ro = llama_file_buffer; +using llama_file_buffer_rw = llama_file_buffer; +using llama_future_file_buffer_ro = llama_future_file_buffer; +using llama_future_file_buffer_rw = llama_future_file_buffer; + struct llama_mmap { llama_mmap(const llama_mmap &) = delete; llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false); diff --git a/src/llama-model-load-input.cpp b/src/llama-model-load-input.cpp new file mode 100644 index 0000000000000..e1f4086feec38 --- /dev/null +++ b/src/llama-model-load-input.cpp @@ -0,0 +1,64 @@ +#include "llama-model-load-input.h" +#include +#include "llama-mmap.h" + +namespace load_input_variant { + +const char * identifier(load_input_t & load_input) { + if (std::holds_alternative(load_input)) { + const auto & file_input = std::get(load_input); + return file_input.fname.c_str(); + } + static const char * buffer_id_str = "buffer"; + return buffer_id_str; +} + +fname_load_input split_name_from_variant(load_input_t & load_input) { + if (std::holds_alternative(load_input)) { + auto future_input = std::get(load_input); + return fname_load_input{ future_input.promise_key, future_input.splits }; + } + auto file_input = std::get(load_input); + return file_input; +} + +bool variant_supports_split_load(load_input_t & load_input) { + return std::holds_alternative(load_input) || + std::holds_alternative(load_input); +} + +bool variant_supports_split_load_from_memory(load_input_t & load_input) { + return std::holds_alternative(load_input); +} + +std::optional> parse_tensor_list_from_future(load_input_t & load_input) { + std::set tensor_names; + + if (!std::holds_alternative(load_input)) { + return std::nullopt; + } + + const auto & future_input = std::get(load_input); + + // Open and read the tensor list file + llama_future_file_buffer_ro tensor_file(future_input.tensor_list_file, future_input.context); + std::unique_ptr file_buffer = tensor_file.extract(); + + // Read the entire buffer as bytes and convert to string + std::vector buffer; + std::basic_istream stream(file_buffer->streambuf.get()); + std::istreambuf_iterator begin(stream), end; + buffer.assign(begin, end); + + // Convert bytes to string and split by newlines + std::string content(reinterpret_cast(buffer.data()), buffer.size()); + std::istringstream line_stream(content); + std::string line; + while (std::getline(line_stream, line)) { + tensor_names.insert(line); + } + + return tensor_names; +} + +} // namespace load_input_variant diff --git a/src/llama-model-load-input.h b/src/llama-model-load-input.h new file mode 100644 index 0000000000000..d7bb331c5f8ba --- /dev/null +++ b/src/llama-model-load-input.h @@ -0,0 +1,46 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +namespace load_input_variant { + +struct fname_load_input { + const std::string & fname; + std::vector & splits; // optional, only need if the split does not follow naming scheme +}; + +struct buffer_load_input { + std::unique_ptr> & streambuf; +}; + +struct buffer_future_load_input { + const std::string & promise_key; + const std::string & context; + std::vector & splits; + const std::string & tensor_list_file; +}; + +} // namespace load_input_variant + +using load_input_t = std::variant; + +namespace load_input_variant { 
+const char * identifier(load_input_t & load_input); + +fname_load_input split_name_from_variant(load_input_t & load_input); + +bool variant_supports_split_load(load_input_t & load_input); + +bool variant_supports_split_load_from_memory(load_input_t & load_input); + +/// @brief Parse tensor list from future file or nullopt if not a future file +std::optional> parse_tensor_list_from_future(load_input_t & load_input); +} // namespace load_input_variant diff --git a/src/llama-model-load.cpp b/src/llama-model-load.cpp new file mode 100644 index 0000000000000..15c3b367e0ac2 --- /dev/null +++ b/src/llama-model-load.cpp @@ -0,0 +1,234 @@ +#include "llama-model-load.h" + +#include +#include +#include +#include + +#include "llama-model-loader.h" + +gguf_file_load::gguf_file_load(struct ggml_context ** ctx, load_input_t load_input) : + params({ + /*.no_alloc = */ true, + /*.ctx = */ ctx, + }) { + using namespace load_input_variant; + if (std::holds_alternative(load_input)) { + const auto & file_input = std::get(load_input); + meta.reset(gguf_init_from_file(file_input.fname.c_str(), params)); + if (!meta) { + throw std::runtime_error(format("%s: failed to load model from %s", __func__, file_input.fname.c_str())); + } + file = std::make_unique(file_input.fname.c_str(), "ro"); + } else if (std::holds_alternative(load_input)) { + const auto & future_input = std::get(load_input); + auto future_file = + std::make_unique(future_input.promise_key, future_input.context); + std::unique_ptr file_buffer = future_file->extract(); + meta.reset(gguf_init_from_buffer(*file_buffer->streambuf, params)); + if (!meta) { + throw std::runtime_error(format("%s: failed to load model from buffer", __func__)); + } + file = std::move(file_buffer); + } else { + const auto & buffer_input = std::get(load_input); + meta.reset(gguf_init_from_buffer(*buffer_input.streambuf, params)); + if (!meta) { + throw std::runtime_error(format("%s: failed to load model from buffer", __func__)); + } + file = std::make_unique(std::move(buffer_input.streambuf)); + } +} + +gguf_file_load SplitLoad::load_split_gguf(struct ggml_context ** ctx, const char * fname_split, + load_input_t & load_input, std::vector & splits) { + using namespace load_input_variant; + if (std::holds_alternative(load_input)) { + return gguf_file_load(ctx, fname_load_input{ fname_split, splits }); + } + if (std::holds_alternative(load_input)) { + auto future_input = std::get(load_input); + return gguf_file_load( + ctx, buffer_future_load_input{ fname_split, future_input.context, splits, future_input.tensor_list_file }); + } + return gguf_file_load(ctx, load_input); +} + +SplitLoad::SplitLoad(load_input_t & load_input, load_input_variant::fname_load_input base_split, uint16_t idx, + std::string kv_split_no) : + load_input(load_input), + base_split(base_split), + idx(idx), + kv_split_no(std::move(kv_split_no)) {} + +IncrementalSplitsTensorLoad::IncrementalSplitsTensorLoad(struct ggml_context * ctx, struct llama_model_loader & ml, + gguf_file_load & base_split, + std::set tensor_list) : + expected_tensors(std::move(tensor_list)) { + ml.process_loaded_gguf(ctx, base_split, 0); + _process_split(ctx, ml, 0); +} + +struct ggml_context * SplitLoad::load(llama_model_loader & ml) { + if (loaded) { + return ml.contexts[idx].get(); + } + + struct ggml_context * ctx = ml.contexts.back().get(); + + const char * fname_split = base_split.splits[idx].c_str(); + LLAMA_LOG_INFO("loading split-file %s\n", fname_split); + + gguf_file_load split_gguf = gguf_file_load(load_split_gguf(&ctx, 
fname_split, load_input, base_split.splits)); + gguf_context_ptr & split_meta = split_gguf.meta; + + if (idx > 0) { + const int kid = gguf_find_key(split_meta.get(), kv_split_no.c_str()); + if (kid < 0) { + throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split)); + } + int idx_gguf = gguf_get_val_u16(split_meta.get(), kid); + if (idx_gguf != idx) { + throw std::runtime_error( + format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx)); + } + } + + // Check that this split's idx matches the expected position in ml.files + if (!ml.files.empty() && idx != ml.files.size()) { + throw std::runtime_error( + format("invalid split file loading order: got idx %d but expected %zu based on ml.files size", idx, + ml.files.size())); + } + + ml.process_loaded_gguf(ctx, split_gguf, idx); + + loaded = true; + return ctx; +} + +void IncrementalSplitsTensorLoad::add_split(SplitLoad splitLoad) { + // +1 because first split is expected to have been already loaded (not delayed) + split_info[delayed_files.size() + 1] = SplitInfo(); + delayed_files.emplace_back(std::move(splitLoad)); +} + +void IncrementalSplitsTensorLoad::_load_split(struct llama_model_loader & ml, uint16_t idx) { + // -1 because first split is expected to have been already loaded (not delayed and not present in delayed_files) + const struct ggml_context * ctx = delayed_files[idx - 1].load(ml); + _process_split(ctx, ml, idx); +} + +void IncrementalSplitsTensorLoad::_process_split(const struct ggml_context * ctx, struct llama_model_loader & ml, + uint16_t idx) { + SplitInfo & split = split_info[idx]; + + for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + std::string tensor_name = std::string(cur->name); + split.total_tensor_count++; + + // Add tensor info with initial loaded state as false + tensor_info[tensor_name] = TensorInfo{ idx, false }; + + auto it = ml.weights_map.find(tensor_name); + if (it == ml.weights_map.end()) { + throw std::runtime_error(format("tensor '%s' not found in weights_map", tensor_name.c_str())); + } + split.data_size += ggml_nbytes(it->second.tensor); + } +} + +uint16_t IncrementalSplitsTensorLoad::load_tensor_metadata(struct llama_model_loader & ml, const char * tensor_name, + ggml_tensor ** out_tensor_metadata) { + LLAMA_LOG_CMAKE_DEBUG("%s: loading tensor %s (tensor_meta=%p, delayed_loaded=%zu, delayed_files.size=%zu)\n", + __func__, tensor_name, (void *) *out_tensor_metadata, delayed_loaded, delayed_files.size()); + if (expected_tensors.find(tensor_name) == expected_tensors.end()) { + throw std::runtime_error(format("unknown tensor not expected in split files: %s", tensor_name)); + } + while (!(*out_tensor_metadata) && delayed_loaded < delayed_files.size()) { + // +1 because first split is expected to have been already loaded (not delayed) + _load_split(ml, delayed_loaded + 1); + *out_tensor_metadata = ml.get_tensor_meta(tensor_name); + delayed_loaded++; + if (*out_tensor_metadata) { + LLAMA_LOG_CMAKE_DEBUG("%s: tensor %s found in file %zu\n", __func__, tensor_name, delayed_loaded); + } + if (delayed_loaded == delayed_files.size() && ml.weights_map.size() != expected_n_tensors()) { + throw std::runtime_error( + format("finished incrementally loading all splits but expected %zu tensors, got %zu", + expected_n_tensors(), ml.weights_map.size())); + } + } + uint16_t split_idx = get_split_idx_for_tensor(tensor_name); + + // Mark tensor as loaded and increment split's loaded count + auto tensor_it 
= tensor_info.find(tensor_name); + if (!tensor_it->second.is_loaded) { + tensor_it->second.is_loaded = true; + split_info[split_idx].loaded_tensor_count++; + } + + return split_idx; +} + +uint16_t IncrementalSplitsTensorLoad::get_split_idx_for_tensor(const char * tensor_name) const { + return _get_tensor_info_iterator(tensor_name)->second.split_idx; +} + +std::size_t IncrementalSplitsTensorLoad::get_split_data_size(uint16_t split_idx) const { + return _get_split_info_iterator(split_idx)->second.data_size; +} + +void IncrementalSplitsTensorLoad::print_currently_known_tensors() const { + LLAMA_LOG_INFO("Current incremental loaded tensors:\n"); + for (const auto & it : tensor_info) { + LLAMA_LOG_INFO("Tensor '%s' in split %d (loaded: %s)\n", it.first.c_str(), it.second.split_idx, + it.second.is_loaded ? "yes" : "no"); + } +} + +bool IncrementalSplitsTensorLoad::all_tensors_are_loaded(uint16_t split_idx) const { + auto it = _get_split_info_iterator(split_idx); + const SplitInfo & split = it->second; + LLAMA_LOG_CMAKE_DEBUG("Loaded tensor count for split %d: %u/%u\n", split_idx, split.loaded_tensor_count, + split.total_tensor_count); + return split.all_tensors_loaded(); +} + +std::size_t IncrementalSplitsTensorLoad::expected_n_tensors() { + return expected_tensors.size(); +} + +void IncrementalSplitsTensorLoad::release_split(struct llama_model_loader & ml, uint16_t split_idx) { + // Let destructor of the smart pointer do the release of memory + ml.files[split_idx] = nullptr; +} + +std::map::const_iterator +IncrementalSplitsTensorLoad::_get_tensor_info_iterator(const char * tensor_name) const { + auto it = tensor_info.find(tensor_name); + if (it == tensor_info.end()) { + throw std::runtime_error(format("tensor '%s' not found in tensor_info map", tensor_name)); + } + return it; +} + +std::map::const_iterator +IncrementalSplitsTensorLoad::_get_split_info_iterator(uint16_t split_idx) const { + auto it = split_info.find(split_idx); + if (it == split_info.end()) { + throw std::runtime_error(format("split index %d not found in split_info map", split_idx)); + } + return it; +} + +bool IncrementalSplitsTensorLoad::SplitInfo::all_tensors_loaded() const { + return loaded_tensor_count >= total_tensor_count; +} + +bool IncrementalSplitsTensorLoad::tensor_ignored(const std::optional & splits_tensor_load, + const char * tensor_name) { + return !splits_tensor_load.has_value() || + (splits_tensor_load.has_value() && + splits_tensor_load->expected_tensors.find(tensor_name) == splits_tensor_load->expected_tensors.end()); +} diff --git a/src/llama-model-load.h b/src/llama-model-load.h new file mode 100644 index 0000000000000..1abc2053de4af --- /dev/null +++ b/src/llama-model-load.h @@ -0,0 +1,147 @@ +#pragma once + +#include +#include + +#include "ggml-cpp.h" +#include "llama-mmap.h" +#include "llama-model-load-input.h" + +struct llama_model_loader; + +/// @brief Immediately loads and stores relevant data in the struct fields. +struct gguf_file_load { + struct gguf_init_params params; + gguf_context_ptr meta; + std::unique_ptr file = nullptr; + + gguf_file_load(struct ggml_context ** ctx, load_input_t load_input); +}; + +/// @brief Stores relevant information to be able to loads a `.gguf` split file when load method is called. 
+struct SplitLoad { + load_input_t load_input; + load_input_variant::fname_load_input base_split; + uint16_t idx; + std::string kv_split_no; + bool loaded = false; + + SplitLoad(load_input_t & load_input, load_input_variant::fname_load_input base_split, uint16_t idx, + std::string kv_split_no); + + static gguf_file_load load_split_gguf(struct ggml_context ** ctx, const char * fname_split, + load_input_t & load_input, std::vector & splits); + + struct ggml_context * load(struct llama_model_loader & ml); +}; + +/// @brief Handles incremental loading of tensors and split files. +/// @note The first split file is expected to be already available at construction; the remaining split files are +/// incrementally loaded on demand by calling `load_tensor_metadata` +struct IncrementalSplitsTensorLoad { + IncrementalSplitsTensorLoad(struct ggml_context * ctx, struct llama_model_loader & ml, gguf_file_load & base_split, + std::set tensor_list); + + void add_split(SplitLoad splitLoad); + + /// @brief Incrementally loads file splits until the tensor metadata is found. + /// Also increments the loaded tensor count so that `all_tensors_are_loaded` returns true + /// when all tensors in a file split have been requested. + /// @returns Split idx where the tensor was found + /// @throw runtime_error if the tensor was not found + uint16_t load_tensor_metadata(struct llama_model_loader & ml, const char * tensor_name, + ggml_tensor ** out_tensor_metadata); + + /// @returns True if all tensors of a split have been loaded. + bool all_tensors_are_loaded(uint16_t split_idx) const; + + /// @returns Max number of tensors as described in the summary tensor-list file. + std::size_t expected_n_tensors(); + + /// @brief Release file memory for a split. + static void release_split(struct llama_model_loader & ml, uint16_t split_idx); + + void print_currently_known_tensors() const; + + uint16_t get_split_idx_for_tensor(const char * tensor_name) const; + + std::size_t get_split_data_size(uint16_t split_idx) const; + + static bool tensor_ignored(const std::optional & splits_tensor_load, + const char * tensor_name); + + /// @brief Lazily get/allocate a context with enough capacity for all tensors of + /// the same type in an individual split. The context can be used to instantiate the + /// final model tensors and attach backend buffers to them. + /// @tparam impl The model implementation type where the context will be stored.
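+ /// @return An existing context for the (buft, split) pair, or a freshly created no-alloc context sized for the split's tensor count and registered in `ctx_split_map`.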
+ template + ggml_context * get_model_ctx_for_split_buft(ggml_backend_buffer_type_t buft, uint16_t split, impl * model_impl) { + auto key = std::make_pair(buft, split); + auto it = ctx_split_map.find(key); + if (it == ctx_split_map.end()) { + LLAMA_LOG_CMAKE_DEBUG("%s: creating context for split %d (buft=%s, existing=%zu)\n", __func__, split, + ggml_backend_buft_name(buft), ctx_split_map.size()); + + const size_t max_n_tensors = _get_split_info_iterator(split)->second.total_tensor_count; + const size_t ctx_size = ggml_tensor_overhead() * max_n_tensors; + + ggml_init_params params = { + /*.mem_size =*/ctx_size, + /*.mem_buffer =*/NULL, + /*.no_alloc =*/true, + }; + + ggml_context * ctx = ggml_init(params); + if (!ctx) { + throw std::runtime_error("failed to create ggml context for split-file"); + } + + ctx_split_map[key] = ctx; + model_impl->ctxs.emplace_back(ctx); + + return ctx; + } + return it->second; + } + + // public so that it can be processed by the backend storage allocator + std::map, ggml_context *> ctx_split_map; + + private: + struct TensorInfo { + uint16_t split_idx = 0; + bool is_loaded = false; + }; + + struct SplitInfo { + uint32_t total_tensor_count = 0, loaded_tensor_count = 0; + + /// @brief Total ggml tensor data size of this split + std::size_t data_size = 0; + + bool all_tensors_loaded() const; + }; + + void _load_split(struct llama_model_loader & ml, uint16_t idx); + void _process_split(const struct ggml_context * ctx, struct llama_model_loader & ml, uint16_t idx); + + /// @brief Get tensor info iterator or throw if not found + /// @throw runtime_error if tensor not found + std::map::const_iterator _get_tensor_info_iterator(const char * tensor_name) const; + + /// @brief Get split info iterator or throw if not found + /// @throw runtime_error if split not found + std::map::const_iterator _get_split_info_iterator(uint16_t split_idx) const; + + std::map tensor_info; + std::map split_info; + + /// @brief Number of delayed files that have been loaded + std::size_t delayed_loaded = 0; + + /// @brief Vector of split files to be loaded on demand + std::vector delayed_files; + + /// @brief Set of expected tensor names loaded from tensor list file + std::set expected_tensors; +}; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index bd9e6da8832b7..29eed9f96f1c8 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -1,11 +1,15 @@ #include "llama-model-loader.h" #include "ggml.h" +#include "llama-mmap.h" +#include "llama-model-load.h" #include #include +#include #include #include +#include static const size_t kiB = 1024; static const size_t MiB = 1024*kiB; @@ -463,11 +467,35 @@ namespace GGUFMeta { // TODO: this is not very clever - figure out something better template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); - template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); + template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, + std::array & result, + uint32_t n, bool required); + + // Save tensors data offset of the main file. + // For subsidiary files, `meta` tensor data offset must not be used, + // so we build a unified tensors index for weights. 
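+ // Called once for the main file and once per split; takes ownership of the GGUF's llama_file and indexes its tensors into weights_map.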
+ void llama_model_loader::process_loaded_gguf(struct ggml_context * ctx, gguf_file_load & gguf_load, uint16_t idx) { + contexts.emplace_back(ctx); + files.emplace_back(std::move(gguf_load.file)); + llama_file * raw_file_ptr = files.back().get(); + + // Save tensors data offset info of the shard. + for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + std::string tensor_name = std::string(cur->name); + LLAMA_LOG_CMAKE_DEBUG("%s: loaded tensor %s at split %d\n", __func__, tensor_name.c_str(), idx); + // make sure there are no duplicated tensor names + if (weights_map.find(tensor_name) != weights_map.end()) { + throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); + } + n_elements += ggml_nelements(cur); + n_bytes += ggml_nbytes(cur); + weights_map.emplace(tensor_name, + llama_model_loader::llama_tensor_weight(raw_file_ptr, idx, gguf_load.meta.get(), cur)); + } + } llama_model_loader::llama_model_loader( - const std::string & fname, - std::vector & splits, + load_input_t load_input, bool use_mmap, bool check_tensors, const llama_model_kv_override * param_overrides_p, @@ -485,58 +513,46 @@ llama_model_loader::llama_model_loader( tensor_buft_overrides = param_tensor_buft_overrides_p; - // Load the main GGUF + std::optional> tensor_list = load_input_variant::parse_tensor_list_from_future(load_input); + struct ggml_context * ctx = NULL; - struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &ctx, - }; + gguf_file_load main_gguf(&ctx, load_input); - meta.reset(gguf_init_from_file(fname.c_str(), params)); - if (!meta) { - throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str())); + if (load_input_variant::variant_supports_split_load_from_memory(load_input)) { + incremental_splits_tensor_load.emplace(ctx, *this, main_gguf, std::move(*tensor_list)); + } else { + process_loaded_gguf(ctx, main_gguf, 0); } + meta = std::move(main_gguf.meta); + get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); - files.emplace_back(new llama_file(fname.c_str(), "rb")); - contexts.emplace_back(ctx); - - // Save tensors data offset of the main file. - // For subsidiary files, `meta` tensor data offset must not be used, - // so we build a unified tensors index for weights.
- for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { - std::string tensor_name = std::string(cur->name); - // make sure there is no duplicated tensor names - if (weights_map.find(tensor_name) != weights_map.end()) { - throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); - } - n_elements += ggml_nelements(cur); - n_bytes += ggml_nbytes(cur); - weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur)); - } uint16_t n_split = 0; get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false); // Load additional GGML contexts - if (n_split > 1) { + if (load_input_variant::variant_supports_split_load(load_input) && n_split > 1) { + + load_input_variant::fname_load_input base_split = load_input_variant::split_name_from_variant(load_input); + // make sure the main file is loaded first uint16_t idx = 0; const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO); get_key(kv_split_no, idx); if (idx != 0) { - throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str())); + throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, base_split.fname.c_str())); } // generate list of splits if needed - if (splits.empty()) { - splits = llama_get_list_splits(fname, idx, n_split); + if (base_split.splits.empty()) { + base_split.splits = llama_get_list_splits(base_split.fname, idx, n_split); } // in case user give a custom list of splits, check if it matches the expected number - if (n_split != (uint16_t)splits.size()) { - throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split)); + if (n_split != (uint16_t)base_split.splits.size()) { + throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", base_split.splits.size(), n_split)); } if (trace > 0) { @@ -545,49 +561,20 @@ llama_model_loader::llama_model_loader( // load other splits for (idx = 1; idx < n_split; idx++) { - const char * fname_split = splits[idx].c_str(); - - struct gguf_init_params split_params = { - /*.no_alloc = */ true, - /*.ctx = */ &ctx, - }; - gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) }; - if (!ctx_gguf) { - throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split)); - } + SplitLoad split_load(load_input, base_split, idx, kv_split_no); - // check idx - { - const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str()); - if (kid < 0) { - throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split)); - } - int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid); - if (idx_gguf != idx) { - throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx)); - } + if(incremental_splits_tensor_load.has_value()) { + incremental_splits_tensor_load->add_split(std::move(split_load)); } - - files.emplace_back(new llama_file(fname_split, "rb")); - contexts.emplace_back(ctx); - - // Save tensors data offset info of the shard. 
- for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { - std::string tensor_name = std::string(cur->name); - // make sure there is no duplicated tensor names - if (weights_map.find(tensor_name) != weights_map.end()) { - throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); - } - n_elements += ggml_nelements(cur); - n_bytes += ggml_nbytes(cur); - weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur)); + else { + split_load.load(*this); } } get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); - // sanity check - { + // sanity check (the incremental loader does the check after loading the last split) + if(!incremental_splits_tensor_load.has_value()) { const int n_tensors_loaded = (int) weights_map.size(); if (n_tensors != n_tensors_loaded) { throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded)); @@ -598,16 +585,22 @@ llama_model_loader::llama_model_loader( } n_kv = gguf_get_n_kv(meta.get()); - n_tensors = weights_map.size(); + if (incremental_splits_tensor_load.has_value()) { + n_tensors = incremental_splits_tensor_load->expected_n_tensors(); + LLAMA_LOG_CMAKE_DEBUG("%s: n_tensors (expected from summary list): %d\n", __func__, n_tensors); + } else { + n_tensors = weights_map.size(); + LLAMA_LOG_CMAKE_DEBUG("%s: exact n_tensors: %d\n", __func__, n_tensors); + } fver = (enum llama_fver) gguf_get_version(meta.get()); LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", - __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver)); + __func__, n_kv, n_tensors, load_input_variant::identifier(load_input), llama_file_version_name(fver)); // determine file type based on the number of tensors for each quantization and print meta data // TODO: make optional - { + if(!incremental_splits_tensor_load.has_value()) { std::map n_type; uint32_t n_type_max = 0; @@ -915,12 +908,9 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const { } } -bool llama_model_loader::load_all_data( - struct ggml_context * ctx, - llama_buf_map & bufs, - llama_mlocks * lmlocks, - llama_progress_callback progress_callback, - void * progress_callback_user_data) { +bool llama_model_loader::load_all_data(size_t size_data, struct ggml_context * ctx, llama_buf_map & bufs, + llama_mlocks * lmlocks, llama_progress_callback progress_callback, + void * progress_callback_user_data) { GGML_ASSERT(size_data != 0 && "call init_mappings() first"); std::vector> read_buf; @@ -1060,6 +1050,12 @@ bool llama_model_loader::load_all_data( } } else { const auto & file = files.at(weight->idx); + if (file == nullptr) { + throw std::runtime_error( + format("file not found for tensor '%s' at split-index %d", ggml_get_name(cur), weight->idx)); + } + LLAMA_LOG_CMAKE_DEBUG("%s: uploading tensor %s from file at split-index %d\n", __func__, ggml_get_name(cur), + weight->idx); if (ggml_backend_buffer_is_host(cur->buffer)) { file->seek(weight->offs, SEEK_SET); file->read_raw(cur->data, n_size); diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index 0f52b011b6986..605a9784bcd03 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -5,6 +5,7 @@ #include "llama-impl.h" #include "llama-arch.h" #include "llama-mmap.h" +#include "llama-model-load.h" #include "ggml-cpp.h" @@ -78,6 +79,9 @@ struct llama_model_loader { llama_mmaps mappings; std::map weights_map; + + 
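+ /// @brief Engaged only when split files are loaded incrementally from in-memory buffers.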
std::optional incremental_splits_tensor_load; + std::unordered_map kv_overrides; const llama_model_tensor_buft_override * tensor_buft_overrides; @@ -91,9 +95,10 @@ struct llama_model_loader { size_t size_data = 0; std::vector> mmaps_used; + void process_loaded_gguf(struct ggml_context * ctx, gguf_file_load & gguf_load, uint16_t idx); + llama_model_loader( - const std::string & fname, - std::vector & splits, // optional, only need if the split does not follow naming scheme + load_input_t load_input, bool use_mmap, bool check_tensors, const llama_model_kv_override * param_overrides_p, @@ -156,12 +161,8 @@ struct llama_model_loader { void load_data_for(struct ggml_tensor * cur) const; // Returns false if cancelled by progress_callback - bool load_all_data( - struct ggml_context * ctx, - llama_buf_map & bufs, - llama_mlocks * lmlocks, - llama_progress_callback progress_callback, - void * progress_callback_user_data); + bool load_all_data(size_t size_data, struct ggml_context * ctx, llama_buf_map & bufs, llama_mlocks * lmlocks, + llama_progress_callback progress_callback, void * progress_callback_user_data); std::string ftype_name() const; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 9b19da984081e..700dfc6567850 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -17,10 +17,10 @@ #include #include #include +#include #include #include #include -#include #include #include #include @@ -1643,9 +1643,19 @@ bool llama_model::load_tensors(llama_model_loader & ml) { ggml_backend_buffer_type_t first_moved_to_buft = nullptr; auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list & ne, int flags) -> ggml_tensor * { - ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str()); - + const std::string& tensor_name = tn.str(); + ggml_tensor * t_meta = ml.get_tensor_meta(tensor_name.c_str()); + std::optional split_idx; + if (!t_meta && (flags & TENSOR_NOT_REQUIRED) && + IncrementalSplitsTensorLoad::tensor_ignored(ml.incremental_splits_tensor_load, tensor_name.c_str())) { + return nullptr; + } + if (ml.incremental_splits_tensor_load.has_value()) { + split_idx = ml.incremental_splits_tensor_load->load_tensor_metadata(ml, tn.str().c_str(), &t_meta); + LLAMA_LOG_CMAKE_DEBUG("split idx for tensor %s: %d\n", tn.str().c_str(), *split_idx); + } if (!t_meta) { + LLAMA_LOG_ERROR("%s: missing tensor %s\n", __func__, tn.str().c_str()); if (flags & TENSOR_NOT_REQUIRED) { return nullptr; } @@ -1758,16 +1768,32 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } - ggml_context * ctx = ctx_for_buft(buft); + ggml_context * ctx = + split_idx.has_value() ? + ml.incremental_splits_tensor_load->get_model_ctx_for_split_buft(buft, *split_idx, pimpl.get()) : + ctx_for_buft(buft); // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one if (flags & TENSOR_DUPLICATED) { - ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str()); + auto tn_str = tn.str(); + ggml_tensor * t = ggml_get_tensor(ctx, tn_str.c_str()); if (t) { return t; } + LLAMA_LOG_WARN("%s: duplicated tensor %s not found on existing context\n", __func__, tn_str.c_str()); + } + struct ggml_tensor * tensor = ml.create_tensor(ctx, tn, ne, flags); + + if (split_idx.has_value() && ml.incremental_splits_tensor_load->all_tensors_are_loaded(*split_idx)) { + // Upload right now.
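+ // All tensors of this split have been created, so its backend buffers can be allocated, the data uploaded, and the split's file buffer released.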
+ if (!create_split_backend_buffers(*split_idx, ml.incremental_splits_tensor_load->ctx_split_map, ml, + use_mmap_buffer, use_mlock, n_gpu_layers)) { + throw std::runtime_error("Failed to create incremental backend buffers"); + } + IncrementalSplitsTensorLoad::release_split(ml, *split_idx); } - return ml.create_tensor(ctx, tn, ne, flags); + + return tensor; }; layers.resize(n_layer); @@ -4285,9 +4311,43 @@ bool llama_model::load_tensors(llama_model_loader & ml) { ml.done_getting_tensors(); + if (ml.incremental_splits_tensor_load.has_value()) { + // Already did incremental load. + print_backend_buffers_info(n_gpu_layers); + return true; + } + ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr); pimpl->mappings.reserve(ml.mappings.size()); + return create_backend_buffers(ml.size_data, ctx_map, ml, use_mmap_buffer, use_mlock, n_gpu_layers); +} + +bool llama_model::create_split_backend_buffers( + const uint16_t idx, std::map, ggml_context *> & ctx_split_map, + llama_model_loader & ml, const bool use_mmap_buffer, const bool use_mlock, const int32_t n_gpu_layers) { + // Extract contexts for the given split index from ctx_split_map into a new map + std::map ctx_map; + for (const auto & [buft_split_idx, ctx] : ctx_split_map) { + const auto & [buft, split_idx] = buft_split_idx; + if (split_idx == idx) { + ctx_map[buft] = ctx; + } + } + + const std::size_t split_data_size = ml.incremental_splits_tensor_load->get_split_data_size(idx); + LLAMA_LOG_CMAKE_DEBUG("%s: creating backend buffers for split %d with size %zu\n", __func__, idx, split_data_size); + constexpr bool do_print_backend_buffers_info = false; + const bool creation_success = create_backend_buffers(split_data_size, ctx_map, ml, use_mmap_buffer, use_mlock, + n_gpu_layers, do_print_backend_buffers_info); + + return creation_success; +} + +bool llama_model::create_backend_buffers(std::size_t size_data, + const std::map & ctx_map, + llama_model_loader & ml, const bool use_mmap_buffer, const bool use_mlock, + const int32_t n_gpu_layers, bool do_print_backend_buffers_info) { // create the backend buffers std::vector> ctx_bufs; ctx_bufs.reserve(ctx_map.size()); @@ -4296,7 +4356,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size(); pimpl->bufs.reserve(n_max_backend_buffer); - for (auto & it : ctx_map) { + for (const auto & it : ctx_map) { ggml_backend_buffer_type_t buft = it.first; ggml_context * ctx = it.second; @@ -4372,23 +4432,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { ctx_bufs.emplace_back(ctx, buf_map); } - if (llama_supports_gpu_offload()) { - const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); - - LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); - if (n_gpu_layers > (int) hparams.n_layer) { - LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__); - } - - const int max_backend_supported_layers = hparams.n_layer + 1; - const int max_offloadable_layers = hparams.n_layer + 1; - - LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); - } - - // print memory requirements per buffer type - for (auto & buf : pimpl->bufs) { - LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0); + if(do_print_backend_buffers_info) { + print_backend_buffers_info(n_gpu_layers); } // populate tensors_by_name 
@@ -4402,7 +4447,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { for (auto & it : ctx_bufs) { ggml_context * ctx = it.first; auto & bufs = it.second; - if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) { + if (!ml.load_all_data(size_data, ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) { return false; } } @@ -4416,6 +4461,29 @@ bool llama_model::load_tensors(llama_model_loader & ml) { return true; } +void llama_model::print_backend_buffers_info(const int32_t n_gpu_layers) { + if (llama_supports_gpu_offload()) { + const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); + + LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); + if (n_gpu_layers > (int) hparams.n_layer) { + LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__); + } + + const int max_backend_supported_layers = hparams.n_layer + 1; + const int max_offloadable_layers = hparams.n_layer + 1; + + LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), + max_backend_supported_layers); + } + + // print memory requirements per buffer type + for (auto & buf : pimpl->bufs) { + LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), + ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0); + } +} + std::string llama_model::arch_name() const { return llm_arch_name(arch); } diff --git a/src/llama-model.h b/src/llama-model.h index 06e6c687943cc..98ba0d29da2d3 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -7,10 +7,12 @@ #include "llama-memory.h" #include "llama-vocab.h" +#include #include #include #include #include +#include struct llama_cparams; struct llama_ubatch; @@ -373,6 +375,19 @@ struct llama_model { explicit llama_model(const struct llama_model_params & params); ~llama_model(); + /// @brief Create backend buffers for all tensors + bool create_backend_buffers(std::size_t size_data, + const std::map & ctx_map, + llama_model_loader & ml, bool use_mmap_buffer, bool use_mlock, int32_t n_gpu_layers, + bool do_print_backend_buffers_info = true); + + /// @brief Create backend buffers for the tensors of a split file identified by `idx`. Removes the split from the map.
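+ /// @note Only used by the incremental split loader; the buffer size is taken from the split's accumulated tensor data size.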
+ bool create_split_backend_buffers( + uint16_t idx, std::map, ggml_context *> & ctx_split_map, + llama_model_loader & ml, bool use_mmap_buffer, bool use_mlock, int32_t n_gpu_layers); + + void print_backend_buffers_info(int32_t n_gpu_layers); + void load_stats (llama_model_loader & ml); void load_arch (llama_model_loader & ml); void load_hparams(llama_model_loader & ml); diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 43229e1938597..0cb6ebe238ef6 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -583,7 +583,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } std::vector splits = {}; - llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr); + load_input_variant::fname_load_input inp{fname_inp, splits}; + llama_model_loader ml(inp, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr); ml.init_mappings(false); // no prefetching llama_model model(llama_model_default_params()); diff --git a/src/llama.cpp b/src/llama.cpp index 34906cdb62844..3fe4e8f7e5013 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9,6 +9,7 @@ #include "ggml.h" #include "ggml-backend.h" +#include "uint8-buff-stream.h" #include #include @@ -16,11 +17,17 @@ #include #include #include +#include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif +#ifdef __cplusplus +#include "llama-cpp.h" +#endif + // // interface implementation // @@ -84,7 +91,7 @@ int64_t llama_time_us(void) { } // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback -static int llama_model_load(const std::string & fname, std::vector & splits, llama_model & model, llama_model_params & params) { +static int llama_model_load(llama_model_loader & ml, llama_model & model, llama_model_params & params) { // loading time will be recalculated after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = 0; @@ -93,8 +100,6 @@ static int llama_model_load(const std::string & fname, std::vector model.t_start_us = tm.t_start_us; try { - llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides); - ml.print_info(); model.hparams.vocab_only = params.vocab_only; @@ -135,8 +140,7 @@ static int llama_model_load(const std::string & fname, std::vector } static struct llama_model * llama_model_load_from_file_impl( - const std::string & path_model, - std::vector & splits, + llama_model_loader& ml, struct llama_model_params params) { ggml_time_init(); @@ -218,7 +222,7 @@ static struct llama_model * llama_model_load_from_file_impl( LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024); } - const int status = llama_model_load(path_model, splits, *model, params); + const int status = llama_model_load(ml, *model, params); GGML_ASSERT(status <= 0); if (status < 0) { if (status == -1) { @@ -241,26 +245,80 @@ struct llama_model * llama_load_model_from_file( return llama_model_load_from_file(path_model, params); } -struct llama_model * llama_model_load_from_file( - const char * path_model, - struct llama_model_params params) { +static llama_model_loader create_disk_fileloader(const char * path_model, std::vector & splits, + struct llama_model_params params) { + load_input_variant::fname_load_input loader_input{ path_model, splits }; + return llama_model_loader(loader_input, params.use_mmap, 
params.check_tensors, params.kv_overrides, + params.tensor_buft_overrides); +} + +struct llama_model * llama_model_load_from_file(const char * path_model, struct llama_model_params params) { std::vector splits = {}; - return llama_model_load_from_file_impl(path_model, splits, params); + llama_model_loader ml = create_disk_fileloader(path_model, splits, params); + return llama_model_load_from_file_impl(ml, params); } -struct llama_model * llama_model_load_from_splits( - const char ** paths, - size_t n_paths, - struct llama_model_params params) { +namespace { +void override_and_disable_mmap(struct llama_model_params & params) { + if (params.use_mmap) { + LLAMA_LOG_WARN("Overriding and disabling memory mapping when loading from memory buffer\n"); + params.use_mmap = false; + } +} +} // namespace + +struct llama_model * llama_model_load_from_buffer(std::vector && data, struct llama_model_params params) { + std::unique_ptr> streambuf = std::make_unique(std::move(data)); + override_and_disable_mmap(params); + llama_model_loader ml(load_input_variant::buffer_load_input{ streambuf }, params.use_mmap, params.check_tensors, + params.kv_overrides, params.tensor_buft_overrides); + return llama_model_load_from_file_impl(ml, params); +} + +namespace { +std::vector splits_from_c_paths(const char ** paths, size_t n_paths) { std::vector splits; if (n_paths == 0) { LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__); - return nullptr; + return splits; } for (size_t i = 0; i < n_paths; ++i) { splits.push_back(paths[i]); } - return llama_model_load_from_file_impl(splits.front(), splits, params); + return splits; +} +} // namespace + +struct llama_model * llama_model_load_from_splits(const char ** paths, size_t n_paths, + struct llama_model_params params) { + std::vector splits = splits_from_c_paths(paths, n_paths); + if (splits.empty()) { + return nullptr; + } + llama_model_loader ml = create_disk_fileloader(splits.front().c_str(), splits, params); + return llama_model_load_from_file_impl(ml, params); +} + +struct llama_model * llama_model_load_from_split_futures(const char ** paths, size_t n_paths, const char * context, + const char * tensor_list_file, + struct llama_model_params params) { + std::vector splits = splits_from_c_paths(paths, n_paths); + if (splits.empty()) { + return nullptr; + } + std::string tensor_list_file_str(tensor_list_file); + + load_input_variant::buffer_future_load_input loader_input{ splits.front(), context, splits, tensor_list_file_str }; + override_and_disable_mmap(params); + llama_model_loader ml(loader_input, params.use_mmap, params.check_tensors, params.kv_overrides, + params.tensor_buft_overrides); + return llama_model_load_from_file_impl(ml, params); +} + +bool llama_model_load_fulfill_split_future(const char * path, const char * context, + std::unique_ptr> && streambuf) { + return llama_future_file_buffer_ro::fulfill_promise(path, context, + std::make_unique(std::move(streambuf))); } void llama_model_save_to_file(const struct llama_model * model, const char * path_model) { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index fc1557a2d4065..cb7ebae0a6bae 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -8,7 +8,7 @@ function(llama_build source) endif() add_executable(${TEST_TARGET} ${source}) - target_link_libraries(${TEST_TARGET} PRIVATE common) + target_link_libraries(${TEST_TARGET} PRIVATE common llama-common-test) install(TARGETS ${TEST_TARGET} RUNTIME) endfunction() @@ -97,7 +97,7 @@ function(llama_build_and_test source) 
add_executable(${TEST_TARGET} ${source} get-model.cpp) install(TARGETS ${TEST_TARGET} RUNTIME) - target_link_libraries(${TEST_TARGET} PRIVATE common) + target_link_libraries(${TEST_TARGET} PRIVATE common llama-common-test) add_test( NAME ${TEST_TARGET} @@ -197,6 +197,9 @@ llama_build_and_test(test-gguf.cpp) llama_build_and_test(test-backend-ops.cpp) llama_build_and_test(test-model-load-cancel.cpp LABEL "model") +llama_build_and_test(test-model-load-disk.cpp LABEL "model") +llama_build_and_test(test-model-load-memory.cpp LABEL "model") +llama_build_and_test(test-model-load-memory-split.cpp LABEL "model") llama_build_and_test(test-autorelease.cpp LABEL "model") if (NOT GGML_BACKEND_DL) diff --git a/tests/test-model-load-disk.cpp b/tests/test-model-load-disk.cpp new file mode 100644 index 0000000000000..3310681200c0f --- /dev/null +++ b/tests/test-model-load-disk.cpp @@ -0,0 +1,41 @@ +#include + +#include "get-model.h" +#include "llama.h" + +int main(int argc, char * argv[]) { + auto * model_path = get_model_or_exit(argc, argv); + auto * file = fopen(model_path, "r"); + if (file == nullptr) { + fprintf(stderr, "no model at '%s' found\n", model_path); + return EXIT_FAILURE; + } + + fprintf(stderr, "using '%s'\n", model_path); + fclose(file); + + llama_backend_init(); + auto params = llama_model_params{}; + params.use_mmap = false; + params.progress_callback = [](float progress, void * ctx) { + (void) ctx; + fprintf(stderr, "%.2f%% ", progress * 100.0f); + // true means: Don't cancel the load + return true; + }; + auto * model = llama_model_load_from_file(model_path, params); + + // Add newline after progress output + fprintf(stderr, "\n"); + + if (model == nullptr) { + fprintf(stderr, "Failed to load model\n"); + llama_backend_free(); + return EXIT_FAILURE; + } + + fprintf(stderr, "Model loaded successfully\n"); + llama_model_free(model); + llama_backend_free(); + return EXIT_SUCCESS; +} diff --git a/tests/test-model-load-memory-split.cpp b/tests/test-model-load-memory-split.cpp new file mode 100644 index 0000000000000..5b87bcc9c5dbb --- /dev/null +++ b/tests/test-model-load-memory-split.cpp @@ -0,0 +1,74 @@ +#include +#include +#include + +#include "get-model.h" +#include "llama-cpp.h" +#include "load_into_memory.h" + +int main(int argc, char * argv[]) { + auto * model_path = get_model_or_exit(argc, argv); + + if (!is_split_file(model_path)) { + printf("Skipping not-split model %s\n", model_path); + return EXIT_SUCCESS; + } + + // Manually load into a memory buffer first + file_entry tensor_list_file = load_tensor_list_file(model_path); + std::vector files = load_files_into_streambuf(model_path); + + llama_backend_init(); + auto params = llama_model_params{}; + params.use_mmap = false; + params.progress_callback = [](float progress, void * ctx) { + (void) ctx; + fprintf(stderr, "%.2f%% ", progress * 100.0f); + // true means: Don't cancel the load + return true; + }; + + printf("Loading model from %zu files\n", files.size()); + + std::vector file_paths; + for (size_t i = 0; i < files.size(); i++) { + printf("Found file %s \n", files[i].path.c_str()); + file_paths.push_back(files[i].path.c_str()); + } + + const char * async_load_context = "test-model-load"; + std::thread fulfill_thread([&files, &tensor_list_file, &async_load_context]() { + const bool success = llama_model_load_fulfill_split_future(tensor_list_file.path.c_str(), async_load_context, + std::move(tensor_list_file.streambuf)); + printf("Fulfilling tensor list file %s: %s\n", tensor_list_file.path.c_str(), success ? 
"success" : "failure"); + if (!success) { + exit(EXIT_FAILURE); + } + for (size_t i = 0; i < files.size(); i++) { + const bool success = llama_model_load_fulfill_split_future(files[i].path.c_str(), async_load_context, + std::move(files[i].streambuf)); + printf("Fulfilling file %s: %s\n", files[i].path.c_str(), success ? "success" : "failure"); + if (!success) { + exit(EXIT_FAILURE); + } + } + }); + fprintf(stderr, "Loading model from splits\n"); + auto * model = llama_model_load_from_split_futures(file_paths.data(), file_paths.size(), async_load_context, + tensor_list_file.path.c_str(), params); + fulfill_thread.join(); + + fprintf(stderr, "\n"); + + if (model == nullptr) { + fprintf(stderr, "Failed to load model\n"); + llama_backend_free(); + return EXIT_FAILURE; + } + + fprintf(stderr, "Model loaded successfully\n"); + llama_model_free(model); + llama_backend_free(); + + return EXIT_SUCCESS; +} diff --git a/tests/test-model-load-memory.cpp b/tests/test-model-load-memory.cpp new file mode 100644 index 0000000000000..255abb46e499f --- /dev/null +++ b/tests/test-model-load-memory.cpp @@ -0,0 +1,47 @@ +#include +#include +#include + +#include "get-model.h" +#include "llama-cpp.h" +#include "load_into_memory.h" + +int main(int argc, char * argv[]) { + auto * model_path = get_model_or_exit(argc, argv); + + if (is_split_file(model_path)) { + printf("Skipping split model %s\n", model_path); + return EXIT_SUCCESS; + } + + // Manually load into a memory buffer first + std::vector buffer = load_file_into_buffer(model_path); + + llama_backend_init(); + auto params = llama_model_params{}; + params.use_mmap = false; + params.progress_callback = [](float progress, void * ctx) { + (void) ctx; + fprintf(stderr, "%.2f%% ", progress * 100.0f); + // true means: Don't cancel the load + return true; + }; + + // Test that it can load directly from a buffer + printf("Loading model from buffer of size %zu bytes\n", buffer.size()); + auto * model = llama_model_load_from_buffer(std::move(buffer), params); + + // Add newline after progress output + fprintf(stderr, "\n"); + + if (model == nullptr) { + fprintf(stderr, "Failed to load model\n"); + llama_backend_free(); + return EXIT_FAILURE; + } + + fprintf(stderr, "Model loaded successfully\n"); + llama_model_free(model); + llama_backend_free(); + return EXIT_SUCCESS; +} diff --git a/tools/gguf-split/gguf-split.cpp b/tools/gguf-split/gguf-split.cpp index 30e771564e808..5c45940cff4fe 100644 --- a/tools/gguf-split/gguf-split.cpp +++ b/tools/gguf-split/gguf-split.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #if defined(_WIN32) #include @@ -43,6 +44,8 @@ struct split_params { std::string output; bool no_tensor_first_split = false; bool dry_run = false; + bool verbose = false; + std::set must_be_followed_layers; }; static void split_print_usage(const char * executable) { @@ -50,7 +53,8 @@ static void split_print_usage(const char * executable) { printf("\n"); printf("usage: %s [options] GGUF_IN GGUF_OUT\n", executable); printf("\n"); - printf("Apply a GGUF operation on IN to OUT."); + printf("Apply a GGUF operation on IN to OUT.\n"); + printf("When splitting, also creates GGUF_OUT.tensors.txt with all tensor names.\n"); printf("\n"); printf("options:\n"); printf(" -h, --help show this help message and exit\n"); @@ -60,7 +64,9 @@ static void split_print_usage(const char * executable) { printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors); printf(" --split-max-size N(M|G) max size per split\n"); printf(" 
--no-tensor-first-split do not add tensors to the first split (disabled by default)\n"); + printf(" --must-be-followed LAYER ensure LAYER is not the last tensor in a split, so the split holding it is not released during incremental loading before the next tensor is created (can be used multiple times)\n"); printf(" --dry-run only print out a split plan and exit, without writing any new files\n"); + printf(" --verbose show tensor names for each split\n"); printf("\n"); } @@ -106,6 +112,9 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p } else if (arg == "--dry-run") { arg_found = true; params.dry_run = true; + } else if (arg == "--verbose") { + arg_found = true; + params.verbose = true; } else if (arg == "--no-tensor-first-split") { arg_found = true; params.no_tensor_first_split = true; @@ -143,6 +152,13 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p } params.mode = MODE_SIZE; params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]); + } else if (arg == "--must-be-followed") { + if (++arg_idx >= argc) { + invalid_param = true; + break; + } + arg_found = true; + params.must_be_followed_layers.insert(argv[arg_idx]); } if (!arg_found) { @@ -275,7 +291,19 @@ struct split_strategy { } } + bool must_be_followed(int i_tensor) { + if (i_tensor > 0 && i_tensor < n_tensors) { + const char * tensor_name = gguf_get_tensor_name(ctx_gguf, i_tensor); + return params.must_be_followed_layers.find(tensor_name) != params.must_be_followed_layers.end(); + } + return false; + } + bool should_split(int i_tensor, size_t next_size) { + if (must_be_followed(i_tensor) || must_be_followed(i_tensor - 1)) { + return false; + } + if (params.mode == MODE_SIZE) { // split by max size per file return next_size > params.n_bytes_split; @@ -299,10 +327,41 @@ struct split_strategy { } total_size = total_size / 1000 / 1000; // convert to megabytes printf("split %05d: n_tensors = %" PRIi64 ", total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); + + if (params.verbose) { + for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) { + const char * t_name = gguf_get_tensor_name(ctx_out, i); + printf(" - %s\n", t_name); + } + } i_split++; } } + void write_tensor_list() { + // Create a .txt file with all tensor names from all splits + std::string tensor_list_path = params.output + ".tensors.txt"; + std::ofstream tensor_file(tensor_list_path); + if (!tensor_file.is_open()) { + fprintf(stderr, "warning: failed to create tensor list file %s\n", tensor_list_path.c_str()); + return; + } + + printf("Writing tensor list to %s ... ", tensor_list_path.c_str()); + fflush(stdout); + + // Write all tensor names from all splits + for (auto & ctx_out : ctx_outs) { + for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) { + const char * t_name = gguf_get_tensor_name(ctx_out, i); + tensor_file << t_name << "\n"; + } + } + + tensor_file.close(); + printf("done\n"); + } + void write() { int i_split = 0; int n_split = ctx_outs.size(); @@ -382,6 +441,9 @@ static void gguf_split(const split_params & split_params) { int n_split = strategy.ctx_outs.size(); strategy.print_info(); + // Write tensor list file + strategy.write_tensor_list(); + if (!split_params.dry_run) { // write all output splits strategy.write();