diff --git a/CMakeLists.txt b/CMakeLists.txt
index d2becb04c6bb9..9e111c2247bb1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -193,6 +193,10 @@ if (LLAMA_BUILD_COMMON)
     add_subdirectory(common)
 endif()
 
+if(LLAMA_BUILD_EXAMPLES OR LLAMA_BUILD_TESTS)
+    add_subdirectory(common_test)
+endif()
+
 if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
     include(CTest)
     add_subdirectory(tests)
diff --git a/common/common.cpp b/common/common.cpp
index e4e71ad13fb59..c5d8cc0f8ff29 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -899,15 +899,7 @@ std::string fs_get_cache_file(const std::string & filename) {
 // Model utils
 //
 
-struct common_init_result common_init_from_params(common_params & params) {
-    common_init_result iparams;
-    auto mparams = common_model_params_to_llama(params);
-
-    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
-    if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
-        return iparams;
-    }
+struct common_init_result common_init_from_model_and_params(llama_model* model, common_init_result iparams, common_params & params) {
 
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
@@ -1068,6 +1060,19 @@ struct common_init_result common_init_from_params(common_params & params) {
     return iparams;
 }
 
+struct common_init_result common_init_from_params(common_params & params) {
+    common_init_result iparams;
+    auto mparams = common_model_params_to_llama(params);
+
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
+    if (model == NULL) {
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
+        return iparams;
+    }
+
+    return common_init_from_model_and_params(model, std::move(iparams), params);
+}
+
 std::string get_model_endpoint() {
     const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
     // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
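A minimal sketch (not part of the patch) of what the split above enables: a caller that has already obtained a llama_model by some other means can finish the usual common initialization itself. Only the common_* and llama_* names come from this patch and the existing headers; the helper name is illustrative.

    #include "common.h"
    #include "llama.h"

    #include <utility>

    // Finish common initialization (context and related state) from a model that
    // the caller loaded itself, e.g. from a memory buffer instead of a file path.
    static common_init_result init_from_preloaded_model(llama_model * model, common_params & params) {
        common_init_result iparams;  // empty result; the callee fills in the rest
        return common_init_from_model_and_params(model, std::move(iparams), params);
    }
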
diff --git a/common/common.h b/common/common.h index e08a59eae7543..e3c13ac9f866f 100644 --- a/common/common.h +++ b/common/common.h @@ -551,6 +551,8 @@ struct common_init_result { }; struct common_init_result common_init_from_params(common_params & params); +struct common_init_result common_init_from_model_and_params(llama_model * model, common_init_result iparams, + common_params & params); struct llama_model_params common_model_params_to_llama ( common_params & params); struct llama_context_params common_context_params_to_llama(const common_params & params); diff --git a/common_test/CMakeLists.txt b/common_test/CMakeLists.txt new file mode 100644 index 0000000000000..44903612e534b --- /dev/null +++ b/common_test/CMakeLists.txt @@ -0,0 +1,15 @@ +# common_test library for load_into_memory.h and uint8-buff-stream.h + +set(TARGET llama-common-test) + +add_library(${TARGET} INTERFACE) + +target_include_directories(${TARGET} INTERFACE + ${CMAKE_CURRENT_SOURCE_DIR} +) + +target_compile_definitions(${TARGET} INTERFACE LLAMA_COMMON_TEST_HEADERS) + +target_compile_features(${TARGET} INTERFACE cxx_std_17) + +target_link_libraries(${TARGET} INTERFACE common) diff --git a/common_test/load_into_memory.h b/common_test/load_into_memory.h new file mode 100644 index 0000000000000..0ffd9228baa2a --- /dev/null +++ b/common_test/load_into_memory.h @@ -0,0 +1,220 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// header-only utilities to showcase how to directly load a model from memory +#include "uint8-buff-stream-wrapper.h" + +namespace { +bool is_split_file(const char * const model_path) { + if (!model_path) { + fprintf(stderr, "No model file provided\n"); + exit(EXIT_FAILURE); + } + + std::string path(model_path); + return path.find("-of-") != std::string::npos; +} + +std::vector load_file_into_buffer(const char * const model_path) { + std::ifstream file_stream(model_path, std::ios::binary | std::ios::ate); + if (!file_stream) { + fprintf(stderr, "Failed to open file %s for reading into streambuf\n", model_path); + exit(EXIT_FAILURE); + } + + const size_t file_size = file_stream.tellg(); + file_stream.seekg(0, std::ios::beg); + + static_assert(sizeof(std::uint8_t) == sizeof(char), "uint8_t must be same size as char"); + std::vector buffer(file_size); + if (!file_stream.read((char *) buffer.data(), file_size)) { + fprintf(stderr, "Failed to read entire file into buffer\n"); + exit(EXIT_FAILURE); + } + + return buffer; +} + +std::unique_ptr> load_file_into_streambuf(const char * const model_path) { + return std::make_unique(load_file_into_buffer(model_path)); +} + +struct file_entry { + std::string path; + std::unique_ptr> streambuf; +}; + +std::vector load_files_into_streambuf(const char * const model_path) { + std::vector files; + + // Extract pattern from first file path + std::string path(model_path); + + // Split by '-' + std::vector parts; + std::stringstream ss(path); + std::string item; + while (std::getline(ss, item, '-')) { + parts.push_back(item); + } + + // Split the last part by '.' 
+ std::string last_part = parts.back(); + parts.pop_back(); + size_t dot_pos = last_part.find('.'); + if (dot_pos != std::string::npos) { + parts.push_back(last_part.substr(0, dot_pos)); + parts.push_back(last_part.substr(dot_pos + 1)); // extension + } else { + parts.push_back(last_part); + } + + // Check if we have enough parts + if (parts.size() < 4) { + fprintf(stderr, "Model path does not contain expected pattern\n"); + exit(EXIT_FAILURE); + } + + // Get total files from [-2] position (before the extension) + int total_files = std::stoi(parts[parts.size() - 2]); + + // Get base path by joining all parts except -start-of-end.gguf + std::string base_path; + for (size_t i = 0; i < parts.size() - 4; i++) { + if (i > 0) { + base_path += "-"; + } + base_path += parts[i]; + } + + for (int i = 1; i <= total_files; i++) { + char numbered_path[1024]; + snprintf(numbered_path, sizeof(numbered_path), "%s-%05d-of-%05d.gguf", base_path.c_str(), i, total_files); + + files.push_back({ numbered_path, load_file_into_streambuf(numbered_path) }); + } + + return files; +} + +file_entry load_tensor_list_file(const char * const model_path) { + std::string path(model_path); + + // Split by '-' + std::vector parts; + std::stringstream ss(path); + std::string item; + while (std::getline(ss, item, '-')) { + parts.push_back(item); + } + + // Split the last part by '.' + std::string last_part = parts.back(); + parts.pop_back(); + size_t dot_pos = last_part.find('.'); + if (dot_pos != std::string::npos) { + parts.push_back(last_part.substr(0, dot_pos)); + parts.push_back(last_part.substr(dot_pos + 1)); // extension + } else { + parts.push_back(last_part); + } + + // Check if we have enough parts + if (parts.size() < 4) { + fprintf(stderr, "Model path does not contain expected pattern\n"); + exit(EXIT_FAILURE); + } + + // Get base path by joining all parts except -start-of-end.gguf + std::string base_path; + for (size_t i = 0; i < parts.size() - 4; i++) { + if (i > 0) { + base_path += "-"; + } + base_path += parts[i]; + } + + // Construct tensor list file path + std::string tensor_list_path = base_path + ".tensors.txt"; + + printf("Loading tensor list file: %s\n", tensor_list_path.c_str()); + return { tensor_list_path, load_file_into_streambuf(tensor_list_path.c_str()) }; +} + +llama_model * load_model_from_memory_configuration(const char * model_path, llama_model_params & model_params) { + llama_model * model; + std::chrono::steady_clock::time_point load_start_time; + if (getenv("LLAMA_EXAMPLE_MEMORY_BUFFER")) { + std::vector buffer = load_file_into_buffer(model_path); + fprintf(stdout, "%s: loading model from memory buffer\n", __func__); + load_start_time = std::chrono::steady_clock::now(); + model = llama_model_load_from_buffer(std::move(buffer), model_params); + } else if (getenv("LLAMA_EXAMPLE_MEMORY_BUFFER_SPLIT")) { + file_entry tensor_list_file = load_tensor_list_file(model_path); + std::vector files = load_files_into_streambuf(model_path); + fprintf(stdout, "%s: loading model from %zu file streambufs\n", __func__, files.size()); + + std::vector file_paths; + for (const auto & file : files) { + printf("Found file %s with streambuf\n", file.path.c_str()); + file_paths.push_back(file.path.c_str()); + } + + load_start_time = std::chrono::steady_clock::now(); + const char * async_load_context = "test-model-load"; + std::thread fulfill_thread([&files, &tensor_list_file, &async_load_context]() { + const bool success = llama_model_load_fulfill_split_future( + tensor_list_file.path.c_str(), async_load_context, 
std::move(tensor_list_file.streambuf)); + printf("Fulfilling tensor list file %s: %s\n", tensor_list_file.path.c_str(), + success ? "success" : "failure"); + if (!success) { + exit(EXIT_FAILURE); + } + + for (auto & file : files) { + const bool success = llama_model_load_fulfill_split_future(file.path.c_str(), async_load_context, + std::move(file.streambuf)); + printf("Fulfilling file %s with streambuf: %s\n", file.path.c_str(), success ? "success" : "failure"); + if (!success) { + exit(EXIT_FAILURE); + } + } + }); + fprintf(stderr, "Loading model from splits\n"); + model = llama_model_load_from_split_futures(file_paths.data(), file_paths.size(), async_load_context, + tensor_list_file.path.c_str(), model_params); + fulfill_thread.join(); + } else if (getenv("LLAMA_EXAMPLE_FROM_FILE")) { + load_start_time = std::chrono::steady_clock::now(); + model = llama_model_load_from_file(model_path, model_params); + } else { + return nullptr; + } + + if (model == NULL) { + fprintf(stderr, "%s: error: unable to load model\n", __func__); + exit(1); + } + std::chrono::steady_clock::time_point load_end_time = std::chrono::steady_clock::now(); + std::chrono::duration load_duration = load_end_time - load_start_time; + fprintf(stdout, "%s: loading model took %f seconds\n", __func__, load_duration.count()); + return model; +} + +bool memory_configuration_env_is_set() { + return getenv("LLAMA_EXAMPLE_MEMORY_BUFFER") || getenv("LLAMA_EXAMPLE_MEMORY_BUFFER_SPLIT") || + getenv("LLAMA_EXAMPLE_FROM_FILE"); +} +} // namespace diff --git a/common_test/uint8-buff-stream-wrapper.h b/common_test/uint8-buff-stream-wrapper.h new file mode 100644 index 0000000000000..3a03721b98c07 --- /dev/null +++ b/common_test/uint8-buff-stream-wrapper.h @@ -0,0 +1,5 @@ +#pragma once + +// Wrapper to include the specific header from src +#include "uint8-buff-stream.h" + diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt index 809040307d2c9..be3d7b17e1578 100644 --- a/examples/embedding/CMakeLists.txt +++ b/examples/embedding/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-embedding) add_executable(${TARGET} embedding.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama llama-common-test ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 0ec2999a0c8e9..63fb27fe98a46 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -1,15 +1,25 @@ +#include +#include +#include +#include +#include +#include +#include +#include + #include "arg.h" #include "common.h" +#include "llama-cpp.h" #include "log.h" -#include "llama.h" - -#include -#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif +#ifdef LLAMA_COMMON_TEST_HEADERS +#include "load_into_memory.h" +#endif + static std::vector split_lines(const std::string & s, const std::string & separator = "\n") { std::vector lines; size_t start = 0; @@ -94,7 +104,20 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the model - common_init_result llama_init = common_init_from_params(params); + common_init_result llama_init; + +#ifdef LLAMA_COMMON_TEST_HEADERS + if (memory_configuration_env_is_set()) { + llama_model_params mparams = common_model_params_to_llama(params); + common_init_result iparams; + llama_model * model = 
load_model_from_memory_configuration(params.model.path.c_str(), mparams); + llama_init = common_init_from_model_and_params(model, std::move(iparams), params); + } else { + llama_init = common_init_from_params(params); + } +#else + llama_init = common_init_from_params(params); +#endif llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt index 104ecabfd7236..5ada3fdd3de6a 100644 --- a/examples/simple/CMakeLists.txt +++ b/examples/simple/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-simple) add_executable(${TARGET} simple.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama llama-common-test ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 633b87e58406e..f35a34eede829 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -1,15 +1,20 @@ +#include "llama-cpp.h" #include "llama.h" #include #include #include -#include static void print_usage(int, char ** argv) { printf("\nexample usage:\n"); printf("\n %s -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [prompt]\n", argv[0]); + printf("\n Optional environment variables: LLAMA_EXAMPLE_MEMORY_BUFFER LLAMA_EXAMPLE_MEMORY_BUFFER_SPLIT"); printf("\n"); } +#ifdef LLAMA_COMMON_TEST_HEADERS +#include "load_into_memory.h" +#endif + int main(int argc, char ** argv) { // path to the model gguf file std::string model_path; @@ -83,12 +88,13 @@ int main(int argc, char ** argv) { llama_model_params model_params = llama_model_default_params(); model_params.n_gpu_layers = ngl; +#ifdef LLAMA_COMMON_TEST_HEADERS + llama_model * model = memory_configuration_env_is_set() ? 
+ load_model_from_memory_configuration(model_path.c_str(), model_params) : + llama_model_load_from_file(model_path.c_str(), model_params); +#else llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params); - - if (model == NULL) { - fprintf(stderr , "%s: error: unable to load model\n" , __func__); - return 1; - } +#endif const llama_vocab * vocab = llama_model_get_vocab(model); // tokenize the prompt diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h index 79ee202062b01..1cf020b2a64e7 100644 --- a/ggml/include/gguf.h +++ b/ggml/include/gguf.h @@ -78,7 +78,6 @@ extern "C" { GGML_API struct gguf_context * gguf_init_empty(void); GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); - //GGML_API struct gguf_context * gguf_init_from_buffer(..); GGML_API void gguf_free(struct gguf_context * ctx); @@ -200,3 +199,8 @@ extern "C" { #ifdef __cplusplus } #endif + +#ifdef __cplusplus +#include +GGML_API struct gguf_context * gguf_init_from_buffer(std::basic_streambuf& streambuf, struct gguf_init_params params); +#endif diff --git a/ggml/include/uint8-buff-stream.h b/ggml/include/uint8-buff-stream.h new file mode 100644 index 0000000000000..6d29d20dd52f4 --- /dev/null +++ b/ggml/include/uint8-buff-stream.h @@ -0,0 +1,200 @@ +#pragma once + +#include +#include +#include +#include +#include + +#ifdef __APPLE__ +# include + +/// @brief Custom ctype specialization for uint8_t to work around libc++ +/// limitation in macOS +template <> struct std::ctype : public std::ctype_base { + using char_type = uint8_t; + static std::locale::id id; + + ctype() : std::ctype_base() {} + + ctype([[maybe_unused]] const std::locale::facet & other) : std::ctype_base() {} + + ctype & operator=(const ctype & other) { + if (this != &other) { + std::ctype_base::operator=(other); + } + return *this; + } + + // Required public interface methods + bool is(mask m, [[maybe_unused]] char_type c) const { + return (m & space) != 0; // Treat all uint8_t as non-space + } + + const char_type * is(const char_type * low, const char_type * high, mask * vec) const { + for (; low != high; ++low, ++vec) { + *vec = 0; // No special character properties + } + return high; + } + + const char_type * scan_is(mask m, const char_type * low, const char_type * high) const { + for (; low != high; ++low) { + if (is(m, *low)) { + return low; + } + } + return high; + } + + const char_type * scan_not(mask m, const char_type * low, const char_type * high) const { + for (; low != high; ++low) { + if (!is(m, *low)) { + return low; + } + } + return high; + } + + char_type toupper(char_type c) const { + return c; // No case conversion for uint8_t + } + + const char_type * toupper([[maybe_unused]] char_type * low, const char_type * high) const { + return high; // No case conversion for uint8_t + } + + char_type tolower(char_type c) const { + return c; // No case conversion for uint8_t + } + + const char_type * tolower([[maybe_unused]] char_type * low, const char_type * high) const { + return high; // No case conversion for uint8_t + } + + char_type widen(char c) const { return static_cast(c); } + + const char * widen(const char * low, const char * high, char_type * dest) const { + for (; low != high; ++low, ++dest) { + *dest = static_cast(*low); + } + return high; + } + + char narrow(char_type c, [[maybe_unused]] char dfault) const { return static_cast(c); } + + const char_type * narrow(const char_type * low, const char_type * high, [[maybe_unused]] char dfault, + char * dest) const { + 
for (; low != high; ++low, ++dest) { + *dest = static_cast(*low); + } + return high; + } +}; +#endif + +/// @brief Custom traits for uint8_t for usage in std template classes that use char_traits (e.g. std::basic_streambuf) +template <> struct std::char_traits { + using char_type = uint8_t; + using int_type = int; + using off_type = std::streamoff; + using pos_type = std::streampos; + using state_type = std::mbstate_t; + + static void assign(char_type & c1, const char_type & c2) noexcept { c1 = c2; } + + static constexpr bool eq(char_type a, char_type b) noexcept { return a == b; } + + static constexpr bool lt(char_type a, char_type b) noexcept { return a < b; } + + static int compare(const char_type * s1, const char_type * s2, std::size_t n) { + for (std::size_t i = 0; i < n; ++i) { + if (lt(s1[i], s2[i])) { + return -1; + } + if (lt(s2[i], s1[i])) { + return 1; + } + } + return 0; + } + + static std::size_t length(const char_type * s) { + std::size_t i = 0; + while (!eq(s[i], char_type())) { + ++i; + } + return i; + } + + static const char_type * find(const char_type * s, std::size_t n, const char_type & c) { + for (std::size_t i = 0; i < n; ++i) { + if (eq(s[i], c)) { + return s + i; + } + } + return nullptr; + } + + static char_type * move(char_type * s1, const char_type * s2, std::size_t n) { + return static_cast(std::memmove(s1, s2, n)); + } + + static char_type * copy(char_type * s1, const char_type * s2, std::size_t n) { + return static_cast(std::memcpy(s1, s2, n)); + } + + static char_type * assign(char_type * s, std::size_t n, char_type c) { + for (std::size_t i = 0; i < n; ++i) { + s[i] = c; + } + return s; + } + + static constexpr int_type not_eof(int_type c) noexcept { return eq_int_type(c, eof()) ? 0 : c; } + + static constexpr char_type to_char_type(int_type c) noexcept { + return c >= 0 && c <= 255 ? static_cast(c) : char_type(); + } + + static constexpr int_type to_int_type(char_type c) noexcept { return static_cast(c); } + + static constexpr bool eq_int_type(int_type c1, int_type c2) noexcept { return c1 == c2; } + + static constexpr int_type eof() noexcept { return static_cast(-1); } +}; + +#ifdef GGML_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef GGML_BUILD +# define GGML_CLASS_API __declspec(dllexport) +# else +# define GGML_CLASS_API __declspec(dllimport) +# endif +# else +# define GGML_CLASS_API __attribute__((visibility("default"))) +# endif +#else +# define GGML_CLASS_API +#endif + +/// @brief Custom streambuf for uint8_t +class GGML_CLASS_API Uint8BufferStreamBuf : public std::basic_streambuf { + public: + Uint8BufferStreamBuf(std::vector && _data); + + protected: + int_type underflow() override; + + /// @brief Efficient bulk reading. The standard implementation specifies that this function can be overridden + /// to provide a more efficient implementation: sgetn will call this function if it is overridden. 
+ std::streamsize xsgetn(char_type * s, std::streamsize n) override; + + pos_type seekoff(off_type off, std::ios_base::seekdir dir, + std::ios_base::openmode which = std::ios_base::in) override; + + pos_type seekpos(pos_type pos, std::ios_base::openmode which = std::ios_base::in) override; + + private: + std::vector data; +}; diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 9cb2c228dcfb2..d0eb33eca851b 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -194,6 +194,7 @@ add_library(ggml-base ../include/ggml-cpp.h ../include/ggml-opt.h ../include/gguf.h + ../include/uint8-buff-stream.h ggml.c ggml.cpp ggml-alloc.c @@ -203,7 +204,8 @@ add_library(ggml-base ggml-threading.h ggml-quants.c ggml-quants.h - gguf.cpp) + gguf.cpp + uint8-buff-stream.cpp) target_include_directories(ggml-base PRIVATE .) if (GGML_BACKEND_DL) diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index a0a318a29f5b9..957c56153a9f5 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -2,6 +2,7 @@ #include "ggml-backend.h" #include "ggml-impl.h" #include "gguf.h" +#include "uint8-buff-stream.h" #include #include @@ -216,14 +217,79 @@ struct gguf_context { void * data = nullptr; }; -struct gguf_reader { +struct gguf_bytes_reader { + /// @brief Reads up to `count` objects into the array `buffer`. + /// The position of the underlying stream implementation is advanced + /// by the number of characters read. + /// + /// @note If an error occurs, the resulting value of the underlying stream + /// position indicator is indeterminate. + virtual size_t read(void * buffer, size_t size, size_t count) = 0; + + /// @brief Seeks to a position aligned to the given alignment boundary. + /// @return The current position after alignment, or 0 on error. + virtual size_t align(size_t alignment) = 0; + + virtual ~gguf_bytes_reader() = 0; +}; + +gguf_bytes_reader::~gguf_bytes_reader() {} + +struct gguf_bytes_buffer_reader : public gguf_bytes_reader { + gguf_bytes_buffer_reader(std::basic_streambuf & streambuf) : streambuf(streambuf), offset(0) {} + + ~gguf_bytes_buffer_reader() {} + + size_t read(void * buffer, size_t size, size_t count) override { + size_t total_size = size * count; + auto bytes_read = streambuf.sgetn(static_cast(buffer), total_size); + offset += bytes_read; + return bytes_read; + } + + size_t align(size_t alignment) override { + size_t new_offset = GGML_PAD(offset, alignment); + size_t seek_offset = new_offset - offset; + + auto result = streambuf.pubseekoff(seek_offset, std::ios_base::cur); + if (result == std::streampos(-1)) { + return 0; + } + offset = new_offset; + return offset; + } + + private: + std::basic_streambuf & streambuf; + size_t offset; +}; + +struct gguf_bytes_file_reader : public gguf_bytes_reader { + gguf_bytes_file_reader(FILE * file) : file(file) {} + + ~gguf_bytes_file_reader() {} + + size_t read(void * buffer, size_t size, size_t count) override { return fread(buffer, 1, size * count, file); } + + size_t align(size_t alignment) override { + if (fseek(file, GGML_PAD(ftell(file), alignment), SEEK_SET) != 0) { + return 0; + } + return ftell(file); + } + + private: FILE * file; +}; - gguf_reader(FILE * file) : file(file) {} +struct gguf_reader { + gguf_bytes_reader& bytes_reader; + + gguf_reader(gguf_bytes_reader& bytes_reader) : bytes_reader(bytes_reader) {} template bool read(T & dst) const { - return fread(&dst, 1, sizeof(dst), file) == sizeof(dst); + return bytes_reader.read(&dst, 1, sizeof(dst)) == sizeof(dst); } template @@ -278,11 +344,11 @@ struct gguf_reader 
{ return false; } dst.resize(size); - return fread(dst.data(), 1, dst.length(), file) == dst.length(); + return bytes_reader.read(dst.data(), 1, dst.length()) == dst.length(); } bool read(void * dst, const size_t size) const { - return fread(dst, 1, size, file) == size; + return bytes_reader.read(dst, 1, size) == size; } }; @@ -316,8 +382,8 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vectorinfo.size()) == n_tensors); // we require the data section to be aligned, so take into account any padding - if (fseek(file, GGML_PAD(ftell(file), ctx->alignment), SEEK_SET) != 0) { - GGML_LOG_ERROR("%s: failed to seek to beginning of data section\n", __func__); + // store the current file offset - this is where the data section starts + ctx->offset = gr.bytes_reader.align(ctx->alignment); + if (ctx->offset == 0) { + GGML_LOG_ERROR("%s: failed to align data section\n", __func__); gguf_free(ctx); return nullptr; } - // store the current file offset - this is where the data section starts - ctx->offset = ftell(file); - // compute the total size of the data section, taking into account the alignment { ctx->size = 0; @@ -718,6 +783,13 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par return ctx; } +} + +struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) { + gguf_bytes_file_reader bytes_reader(file); + gguf_reader reader(bytes_reader); + return gguf_init_from_reader_impl(reader, params); +} struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) { FILE * file = ggml_fopen(fname, "rb"); @@ -732,6 +804,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p return result; } +struct gguf_context * gguf_init_from_buffer(std::basic_streambuf & streambuf, struct gguf_init_params params) { + gguf_bytes_buffer_reader bytes_reader(streambuf); + gguf_reader reader(bytes_reader); + return gguf_init_from_reader_impl(reader, params); +} + void gguf_free(struct gguf_context * ctx) { if (ctx == nullptr) { return; diff --git a/ggml/src/uint8-buff-stream.cpp b/ggml/src/uint8-buff-stream.cpp new file mode 100644 index 0000000000000..14e8dbc20eac3 --- /dev/null +++ b/ggml/src/uint8-buff-stream.cpp @@ -0,0 +1,59 @@ +#include "uint8-buff-stream.h" + +#ifdef __APPLE__ +std::locale::id std::ctype::id; +#endif + +Uint8BufferStreamBuf::Uint8BufferStreamBuf(std::vector && _data) : data(std::move(_data)) { + setg(const_cast(data.data()), const_cast(data.data()), + const_cast(data.data()) + data.size()); +} + +Uint8BufferStreamBuf::int_type Uint8BufferStreamBuf::underflow() { + if (gptr() < egptr()) { + return traits_type::to_int_type(*gptr()); + } + return traits_type::eof(); +} + +std::streamsize Uint8BufferStreamBuf::xsgetn(char_type * s, std::streamsize n) { + std::streamsize available = egptr() - gptr(); + std::streamsize to_read = std::min(n, available); + if (to_read > 0) { + std::memcpy(s, gptr(), to_read); + setg(eback(), gptr() + to_read, egptr()); + } + return to_read; +} + +Uint8BufferStreamBuf::pos_type Uint8BufferStreamBuf::seekoff(off_type off, std::ios_base::seekdir dir, + std::ios_base::openmode which) { + if (!(which & std::ios_base::in)) { + return pos_type(off_type(-1)); + } + char_type * new_pos = nullptr; + if (dir == std::ios_base::beg) { + new_pos = eback() + off; + } else if (dir == std::ios_base::cur) { + new_pos = gptr() + off; + } else if (dir == std::ios_base::end) { + new_pos = egptr() + off; + } + if (new_pos >= eback() && new_pos <= 
egptr()) { + setg(eback(), new_pos, egptr()); + return new_pos - eback(); + } + return pos_type(off_type(-1)); +} + +Uint8BufferStreamBuf::pos_type Uint8BufferStreamBuf::seekpos(pos_type pos, std::ios_base::openmode which) { + if (!(which & std::ios_base::in)) { + return pos_type(off_type(-1)); + } + char_type * new_pos = eback() + pos; + if (new_pos >= eback() && new_pos <= egptr()) { + setg(eback(), new_pos, egptr()); + return pos; + } + return pos_type(off_type(-1)); +} diff --git a/include/llama-cpp.h b/include/llama-cpp.h index 8f6368177de09..18fb3ac0e1862 100644 --- a/include/llama-cpp.h +++ b/include/llama-cpp.h @@ -5,6 +5,7 @@ #endif #include +#include #include "llama.h" @@ -28,3 +29,8 @@ typedef std::unique_ptr llama_model_ptr; typedef std::unique_ptr llama_context_ptr; typedef std::unique_ptr llama_sampler_ptr; typedef std::unique_ptr llama_adapter_lora_ptr; + +LLAMA_API struct llama_model * llama_model_load_from_buffer(std::vector && data, + struct llama_model_params params); +LLAMA_API bool llama_model_load_fulfill_split_future(const char * path, const char * context, + std::unique_ptr> && streambuf); diff --git a/include/llama.h b/include/llama.h index 3eda9bc68608c..89edb619895b8 100644 --- a/include/llama.h +++ b/include/llama.h @@ -456,6 +456,11 @@ extern "C" { size_t n_paths, struct llama_model_params params); + LLAMA_API struct llama_model * llama_model_load_from_split_futures(const char ** paths, size_t n_paths, + const char * context, + const char * tensor_list_file, + struct llama_model_params params); + LLAMA_API void llama_model_save_to_file( const struct llama_model * model, const char * path_model); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8f9cd652447ab..6cbd4ac07da96 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -26,6 +26,8 @@ add_library(llama llama-memory-hybrid.cpp llama-memory-recurrent.cpp llama-mmap.cpp + llama-model-load-input.cpp + llama-model-load.cpp llama-model-loader.cpp llama-model-saver.cpp llama-model.cpp diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index 8d94034aed95d..96e4827732295 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -347,7 +347,7 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_ // set tensor data { - llama_file gguf_file(path_lora, "rb"); + llama_file_disk gguf_file(path_lora, "rb"); std::vector read_buf; auto set_tensor = [&](ggml_tensor * orig, ggml_tensor * dev) { size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name)); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 06e93b19cbf40..dee009b2c6882 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1614,7 +1614,7 @@ size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * sr } bool llama_context::state_load_file(const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - llama_file file(filepath, "rb"); + llama_file_disk file(filepath, "rb"); // sanity checks { @@ -1657,7 +1657,7 @@ bool llama_context::state_load_file(const char * filepath, llama_token * tokens_ } bool llama_context::state_save_file(const char * filepath, const llama_token * tokens, size_t n_token_count) { - llama_file file(filepath, "wb"); + llama_file_disk file(filepath, "wb"); file.write_u32(LLAMA_SESSION_MAGIC); file.write_u32(LLAMA_SESSION_VERSION); @@ -1674,7 +1674,7 @@ bool llama_context::state_save_file(const char * filepath, const 
llama_token * t } size_t llama_context::state_seq_load_file(llama_seq_id seq_id, const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - llama_file file(filepath, "rb"); + llama_file_disk file(filepath, "rb"); // version checks { @@ -1717,7 +1717,7 @@ size_t llama_context::state_seq_load_file(llama_seq_id seq_id, const char * file } size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * filepath, const llama_token * tokens, size_t n_token_count) { - llama_file file(filepath, "wb"); + llama_file_disk file(filepath, "wb"); file.write_u32(LLAMA_STATE_SEQ_MAGIC); file.write_u32(LLAMA_STATE_SEQ_VERSION); diff --git a/src/llama-impl.h b/src/llama-impl.h index 02b1d07f8400d..0a56d83846577 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -30,6 +30,13 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void * #define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) #define LLAMA_LOG_CONT(...) llama_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__) +// Debug-only logging macro that's only enabled in debug builds at compile time +#ifndef NDEBUG +#define LLAMA_LOG_CMAKE_DEBUG(...) LLAMA_LOG_DEBUG(__VA_ARGS__) +#else +#define LLAMA_LOG_CMAKE_DEBUG(...) +#endif + // // helpers // diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 47497cf953fd3..dbe6ad1f86a04 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -1,6 +1,7 @@ #include "llama-mmap.h" #include "llama-impl.h" +#include "uint8-buff-stream.h" #include "ggml.h" @@ -9,6 +10,7 @@ #include #include #include +#include #ifdef __has_include #if __has_include() @@ -54,9 +56,7 @@ static std::string llama_format_win_err(DWORD err) { } #endif -// llama_file - -struct llama_file::impl { +struct llama_file_disk::impl { #if defined(_WIN32) HANDLE fp_win32; std::string GetErrorMessageWin32(DWORD error_code) const { @@ -241,13 +241,13 @@ struct llama_file::impl { size_t size; }; -llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique(fname, mode)) {} -llama_file::~llama_file() = default; +llama_file_disk::llama_file_disk(const char * fname, const char * mode) : pimpl(std::make_unique(fname, mode)) {} +llama_file_disk::~llama_file_disk() = default; -size_t llama_file::tell() const { return pimpl->tell(); } -size_t llama_file::size() const { return pimpl->size; } +size_t llama_file_disk::tell() const { return pimpl->tell(); } +size_t llama_file_disk::size() const { return pimpl->size; } -int llama_file::file_id() const { +int llama_file_disk::file_id() const { #ifdef _WIN32 return _fileno(pimpl->fp); #else @@ -259,13 +259,193 @@ int llama_file::file_id() const { #endif } -void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); } -void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); } +void llama_file_disk::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); } +void llama_file_disk::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); } + +uint32_t llama_file_disk::read_u32() const { return pimpl->read_u32(); } + +void llama_file_disk::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); } +void llama_file_disk::write_u32(uint32_t val) const { pimpl->write_u32(val); } + +template +llama_file_buffer::llama_file_buffer(std::unique_ptr> && streambuf) : + streambuf(std::move(streambuf)) {} + +template llama_file_buffer::~llama_file_buffer() = default; + +template size_t 
llama_file_buffer::tell() const { + return streambuf->pubseekoff(0, std::ios_base::cur); +} + +template size_t llama_file_buffer::size() const { + auto current_pos = streambuf->pubseekoff(0, std::ios_base::cur); + auto end_pos = streambuf->pubseekoff(0, std::ios_base::end); + streambuf->pubseekpos(current_pos); + return end_pos; +} + +template int llama_file_buffer::file_id() const { + return -1; +} + +template void llama_file_buffer::seek(size_t offset, int whence) const { + static std::map whence_to_dir = { + { SEEK_SET, std::ios_base::beg }, + { SEEK_CUR, std::ios_base::cur }, + { SEEK_END, std::ios_base::end } + }; + auto result = streambuf->pubseekoff(offset, whence_to_dir.at(whence)); + if (result == std::streampos(-1)) { + throw std::runtime_error("seek failed"); + } +} + +template void llama_file_buffer::read_raw(void * ptr, size_t len) const { + auto bytes_read = streambuf->sgetn(static_cast(ptr), len); + if (bytes_read != static_cast(len)) { + throw std::runtime_error("read beyond end of buffer"); + } +} + +template uint32_t llama_file_buffer::read_u32() const { + uint32_t val; + read_raw(&val, sizeof(val)); + return val; +} + +template <> void llama_file_buffer::write_raw([[maybe_unused]] const void * ptr, size_t len) const { + if (len > 0) { + throw std::runtime_error("buffer is not writable"); + } +} + +template <> void llama_file_buffer::write_u32(uint32_t val) const { + if (val > 0) { + // Cannot directly set [[noreturn]] for a function since it was defined without it. + throw std::runtime_error("buffer is not writable"); + } +} + +template <> void llama_file_buffer::write_raw(const void * ptr, size_t len) const { + auto bytes_written = streambuf->sputn(static_cast(ptr), len); + if (bytes_written != static_cast(len)) { + throw std::runtime_error("write beyond end of buffer"); + } +} -uint32_t llama_file::read_u32() const { return pimpl->read_u32(); } +template <> void llama_file_buffer::write_u32(uint32_t val) const { + write_raw(&val, sizeof(val)); +} + +// Explicit instantiations +template struct llama_file_buffer; +template struct llama_file_buffer; + +// llama_future_file_buffer implementation + +namespace { +std::string final_key(const std::string & promise_key, const std::string & context) { + return promise_key + ":" + context; +} + +std::mutex promise_registry_mutex; + +std::map>>> promise_registry_ro; +std::map>>> promise_registry_rw; + +template +std::map>>> & promise_registry() { + if constexpr (Writable) { + return promise_registry_rw; + } else { + return promise_registry_ro; + } +} + +/// @brief Ensures a promise exists in the registry for the given key. +/// If it doesn't exist, creates it. Returns an iterator to the promise. +/// Thread-safe. 
+template +typename std::map>>>::iterator +ensure_promise_registry(const std::string & key) { + std::lock_guard lock(promise_registry_mutex); + auto it = promise_registry().find(key); + if (it != promise_registry().end()) { + return it; + } + auto result = + promise_registry().emplace(key, std::promise>>()); + LLAMA_LOG_CMAKE_DEBUG("%s: created future file buffer %p for %s\n", __func__, (void *) &(*it), key.c_str()); + return result.first; +} +} // namespace + +template +llama_future_file_buffer::llama_future_file_buffer(const std::string & promise_key, + const std::string & context) : + file_buffer_future(), + file_buffer() { + std::string key = final_key(promise_key, context); + file_buffer_promise_iterator = ensure_promise_registry(key); + file_buffer_future = file_buffer_promise_iterator->second.get_future(); +} + +template +llama_future_file_buffer::llama_future_file_buffer(llama_future_file_buffer && other) noexcept : + file_buffer_promise_iterator(std::move(other.file_buffer_promise_iterator)), + file_buffer_future(std::move(other.file_buffer_future)), + file_buffer(std::move(other.file_buffer)) { + // Set the other object's iterator to end() to mark it as moved from + // to avoid early erasure at destruction of the moved other object + other.file_buffer_promise_iterator = promise_registry().end(); +} + +template +llama_future_file_buffer & llama_future_file_buffer::operator=( + llama_future_file_buffer && other) noexcept { + if (this != &other) { + file_buffer_promise_iterator = std::move(other.file_buffer_promise_iterator); + file_buffer_future = std::move(other.file_buffer_future); + file_buffer = std::move(other.file_buffer); + other.file_buffer_promise_iterator = promise_registry().end(); + } + return *this; +} + +template llama_future_file_buffer::~llama_future_file_buffer() { + std::lock_guard lock(promise_registry_mutex); + if (file_buffer_promise_iterator != promise_registry().end()) { + promise_registry().erase(file_buffer_promise_iterator); + } +} + +template +bool llama_future_file_buffer::fulfill_promise(const std::string & promise_key, const std::string & context, + std::unique_ptr> && value) { + std::string key = final_key(promise_key, context); + auto it = ensure_promise_registry(key); + if (it != promise_registry().end()) { + LLAMA_LOG_CMAKE_DEBUG("fulfilling future file buffer %p for %s\n", (void *) &(*it), key.c_str()); + it->second.set_value(std::move(value)); + return true; + } + return false; +} + +template +std::unique_ptr> llama_future_file_buffer::extract() const { + if (file_buffer) { + return std::move(file_buffer); + } + + auto future_result = file_buffer_future.get(); + file_buffer = std::move(future_result); + return std::move(file_buffer); +} -void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); } -void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); } +// Explicit instantiations for llama_future_file_buffer +template struct llama_future_file_buffer; +template struct llama_future_file_buffer; // llama_mmap diff --git a/src/llama-mmap.h b/src/llama-mmap.h index 4e5aec3f440d7..9e71eba7ce195 100644 --- a/src/llama-mmap.h +++ b/src/llama-mmap.h @@ -3,6 +3,10 @@ #include #include #include +#include "uint8-buff-stream.h" +#include +#include +#include struct llama_file; struct llama_mmap; @@ -13,27 +17,105 @@ using llama_mmaps = std::vector>; using llama_mlocks = std::vector>; struct llama_file { - llama_file(const char * fname, const char * mode); - ~llama_file(); + virtual ~llama_file() = default; - 
size_t tell() const; - size_t size() const; + virtual size_t tell() const = 0; + virtual size_t size() const = 0; + virtual int file_id() const = 0; + + virtual void seek(size_t offset, int whence) const = 0; - int file_id() const; // fileno overload + virtual void read_raw(void * ptr, size_t len) const = 0; + virtual uint32_t read_u32() const = 0; + + virtual void write_raw(const void * ptr, size_t len) const = 0; + virtual void write_u32(uint32_t val) const = 0; +}; - void seek(size_t offset, int whence) const; +struct llama_file_disk : public llama_file { + llama_file_disk(const char * fname, const char * mode); + ~llama_file_disk() override; - void read_raw(void * ptr, size_t len) const; - uint32_t read_u32() const; + size_t tell() const override; + size_t size() const override; + int file_id() const override; - void write_raw(const void * ptr, size_t len) const; - void write_u32(uint32_t val) const; + void seek(size_t offset, int whence) const override; + + void read_raw(void * ptr, size_t len) const override; + uint32_t read_u32() const override; + + void write_raw(const void * ptr, size_t len) const override; + void write_u32(uint32_t val) const override; private: struct impl; std::unique_ptr pimpl; }; +template struct llama_file_buffer : public llama_file { + llama_file_buffer(std::unique_ptr> && streambuf); + + ~llama_file_buffer() override; + + size_t tell() const override; + size_t size() const override; + + /// @return -1 to indicate this is not a real file descriptor + int file_id() const override; + + void seek(size_t offset, int whence) const override; + + void read_raw(void * ptr, size_t len) const override; + uint32_t read_u32() const override; + + /// @throw std::runtime_error if the buffer is read-only + void write_raw(const void * ptr, size_t len) const override; + + /// @throw std::runtime_error if the buffer is read-only + void write_u32(uint32_t val) const override; + + std::unique_ptr> streambuf; +}; + +template struct llama_future_file_buffer { + /// @brief A file buffer object whose operations will block + /// until the given promise key is set with a file buffer. + /// @param promise_key The key to use for the promise (e.g. a file path). + /// @param context The context to use for the promise, used to distinguish same promise key (e.g. for a same file opened twice). + llama_future_file_buffer(const std::string & promise_key, const std::string & context); + + // Delete copy constructor and copy assignment operator + llama_future_file_buffer(const llama_future_file_buffer &) = delete; + llama_future_file_buffer & operator=(const llama_future_file_buffer &) = delete; + + llama_future_file_buffer(llama_future_file_buffer && other) noexcept; + llama_future_file_buffer & operator=(llama_future_file_buffer && other) noexcept; + + ~llama_future_file_buffer(); + + /// @brief Sets the given key and context with a file buffer so that + /// operations can resume/start. + static bool fulfill_promise(const std::string & promise_key, const std::string & context, + std::unique_ptr> && value); + + /// @brief Waits for future buffer or obtains current if already + /// fulfilled and moves the future contents outside the registry. 
+ std::unique_ptr> extract() const; + + private: + typename std::map>>>::iterator + file_buffer_promise_iterator; + mutable std::future>> file_buffer_future; + mutable std::unique_ptr> file_buffer; +}; + +// Type aliases for convenience +using llama_file_buffer_ro = llama_file_buffer; +using llama_file_buffer_rw = llama_file_buffer; +using llama_future_file_buffer_ro = llama_future_file_buffer; +using llama_future_file_buffer_rw = llama_future_file_buffer; + struct llama_mmap { llama_mmap(const llama_mmap &) = delete; llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false); diff --git a/src/llama-model-load-input.cpp b/src/llama-model-load-input.cpp new file mode 100644 index 0000000000000..e1f4086feec38 --- /dev/null +++ b/src/llama-model-load-input.cpp @@ -0,0 +1,64 @@ +#include "llama-model-load-input.h" +#include +#include "llama-mmap.h" + +namespace load_input_variant { + +const char * identifier(load_input_t & load_input) { + if (std::holds_alternative(load_input)) { + const auto & file_input = std::get(load_input); + return file_input.fname.c_str(); + } + static const char * buffer_id_str = "buffer"; + return buffer_id_str; +} + +fname_load_input split_name_from_variant(load_input_t & load_input) { + if (std::holds_alternative(load_input)) { + auto future_input = std::get(load_input); + return fname_load_input{ future_input.promise_key, future_input.splits }; + } + auto file_input = std::get(load_input); + return file_input; +} + +bool variant_supports_split_load(load_input_t & load_input) { + return std::holds_alternative(load_input) || + std::holds_alternative(load_input); +} + +bool variant_supports_split_load_from_memory(load_input_t & load_input) { + return std::holds_alternative(load_input); +} + +std::optional> parse_tensor_list_from_future(load_input_t & load_input) { + std::set tensor_names; + + if (!std::holds_alternative(load_input)) { + return std::nullopt; + } + + const auto & future_input = std::get(load_input); + + // Open and read the tensor list file + llama_future_file_buffer_ro tensor_file(future_input.tensor_list_file, future_input.context); + std::unique_ptr file_buffer = tensor_file.extract(); + + // Read the entire buffer as bytes and convert to string + std::vector buffer; + std::basic_istream stream(file_buffer->streambuf.get()); + std::istreambuf_iterator begin(stream), end; + buffer.assign(begin, end); + + // Convert bytes to string and split by newlines + std::string content(reinterpret_cast(buffer.data()), buffer.size()); + std::istringstream line_stream(content); + std::string line; + while (std::getline(line_stream, line)) { + tensor_names.insert(line); + } + + return tensor_names; +} + +} // namespace load_input_variant diff --git a/src/llama-model-load-input.h b/src/llama-model-load-input.h new file mode 100644 index 0000000000000..d7bb331c5f8ba --- /dev/null +++ b/src/llama-model-load-input.h @@ -0,0 +1,46 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +namespace load_input_variant { + +struct fname_load_input { + const std::string & fname; + std::vector & splits; // optional, only need if the split does not follow naming scheme +}; + +struct buffer_load_input { + std::unique_ptr> & streambuf; +}; + +struct buffer_future_load_input { + const std::string & promise_key; + const std::string & context; + std::vector & splits; + const std::string & tensor_list_file; +}; + +} // namespace load_input_variant + +using load_input_t = std::variant; + +namespace load_input_variant { 
+const char * identifier(load_input_t & load_input); + +fname_load_input split_name_from_variant(load_input_t & load_input); + +bool variant_supports_split_load(load_input_t & load_input); + +bool variant_supports_split_load_from_memory(load_input_t & load_input); + +/// @brief Parse tensor list from future file or nullopt if not a future file +std::optional> parse_tensor_list_from_future(load_input_t & load_input); +} // namespace load_input_variant diff --git a/src/llama-model-load.cpp b/src/llama-model-load.cpp new file mode 100644 index 0000000000000..15c3b367e0ac2 --- /dev/null +++ b/src/llama-model-load.cpp @@ -0,0 +1,234 @@ +#include "llama-model-load.h" + +#include +#include +#include +#include + +#include "llama-model-loader.h" + +gguf_file_load::gguf_file_load(struct ggml_context ** ctx, load_input_t load_input) : + params({ + /*.no_alloc = */ true, + /*.ctx = */ ctx, + }) { + using namespace load_input_variant; + if (std::holds_alternative(load_input)) { + const auto & file_input = std::get(load_input); + meta.reset(gguf_init_from_file(file_input.fname.c_str(), params)); + if (!meta) { + throw std::runtime_error(format("%s: failed to load model from %s", __func__, file_input.fname.c_str())); + } + file = std::make_unique(file_input.fname.c_str(), "ro"); + } else if (std::holds_alternative(load_input)) { + const auto & future_input = std::get(load_input); + auto future_file = + std::make_unique(future_input.promise_key, future_input.context); + std::unique_ptr file_buffer = future_file->extract(); + meta.reset(gguf_init_from_buffer(*file_buffer->streambuf, params)); + if (!meta) { + throw std::runtime_error(format("%s: failed to load model from buffer", __func__)); + } + file = std::move(file_buffer); + } else { + const auto & buffer_input = std::get(load_input); + meta.reset(gguf_init_from_buffer(*buffer_input.streambuf, params)); + if (!meta) { + throw std::runtime_error(format("%s: failed to load model from buffer", __func__)); + } + file = std::make_unique(std::move(buffer_input.streambuf)); + } +} + +gguf_file_load SplitLoad::load_split_gguf(struct ggml_context ** ctx, const char * fname_split, + load_input_t & load_input, std::vector & splits) { + using namespace load_input_variant; + if (std::holds_alternative(load_input)) { + return gguf_file_load(ctx, fname_load_input{ fname_split, splits }); + } + if (std::holds_alternative(load_input)) { + auto future_input = std::get(load_input); + return gguf_file_load( + ctx, buffer_future_load_input{ fname_split, future_input.context, splits, future_input.tensor_list_file }); + } + return gguf_file_load(ctx, load_input); +} + +SplitLoad::SplitLoad(load_input_t & load_input, load_input_variant::fname_load_input base_split, uint16_t idx, + std::string kv_split_no) : + load_input(load_input), + base_split(base_split), + idx(idx), + kv_split_no(std::move(kv_split_no)) {} + +IncrementalSplitsTensorLoad::IncrementalSplitsTensorLoad(struct ggml_context * ctx, struct llama_model_loader & ml, + gguf_file_load & base_split, + std::set tensor_list) : + expected_tensors(std::move(tensor_list)) { + ml.process_loaded_gguf(ctx, base_split, 0); + _process_split(ctx, ml, 0); +} + +struct ggml_context * SplitLoad::load(llama_model_loader & ml) { + if (loaded) { + return ml.contexts[idx].get(); + } + + struct ggml_context * ctx = ml.contexts.back().get(); + + const char * fname_split = base_split.splits[idx].c_str(); + LLAMA_LOG_INFO("loading split-file %s\n", fname_split); + + gguf_file_load split_gguf = gguf_file_load(load_split_gguf(&ctx, 
fname_split, load_input, base_split.splits)); + gguf_context_ptr & split_meta = split_gguf.meta; + + if (idx > 0) { + const int kid = gguf_find_key(split_meta.get(), kv_split_no.c_str()); + if (kid < 0) { + throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split)); + } + int idx_gguf = gguf_get_val_u16(split_meta.get(), kid); + if (idx_gguf != idx) { + throw std::runtime_error( + format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx)); + } + } + + // Check that this split's idx matches the expected position in ml.files + if (!ml.files.empty() && idx != ml.files.size()) { + throw std::runtime_error( + format("invalid split file loading order: got idx %d but expected %zu based on ml.files size", idx, + ml.files.size())); + } + + ml.process_loaded_gguf(ctx, split_gguf, idx); + + loaded = true; + return ctx; +} + +void IncrementalSplitsTensorLoad::add_split(SplitLoad splitLoad) { + // +1 because first split is expected to have been already loaded (not delayed) + split_info[delayed_files.size() + 1] = SplitInfo(); + delayed_files.emplace_back(std::move(splitLoad)); +} + +void IncrementalSplitsTensorLoad::_load_split(struct llama_model_loader & ml, uint16_t idx) { + // -1 because first split is expected to have been already loaded (not delayed and not present in delayed_files) + const struct ggml_context * ctx = delayed_files[idx - 1].load(ml); + _process_split(ctx, ml, idx); +} + +void IncrementalSplitsTensorLoad::_process_split(const struct ggml_context * ctx, struct llama_model_loader & ml, + uint16_t idx) { + SplitInfo & split = split_info[idx]; + + for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + std::string tensor_name = std::string(cur->name); + split.total_tensor_count++; + + // Add tensor info with initial loaded state as false + tensor_info[tensor_name] = TensorInfo{ idx, false }; + + auto it = ml.weights_map.find(tensor_name); + if (it == ml.weights_map.end()) { + throw std::runtime_error(format("tensor '%s' not found in weights_map", tensor_name.c_str())); + } + split.data_size += ggml_nbytes(it->second.tensor); + } +} + +uint16_t IncrementalSplitsTensorLoad::load_tensor_metadata(struct llama_model_loader & ml, const char * tensor_name, + ggml_tensor ** out_tensor_metadata) { + LLAMA_LOG_CMAKE_DEBUG("%s: loading tensor %s (tensor_meta=%p, delayed_loaded=%zu, delayed_files.size=%zu)\n", + __func__, tensor_name, (void *) *out_tensor_metadata, delayed_loaded, delayed_files.size()); + if (expected_tensors.find(tensor_name) == expected_tensors.end()) { + throw std::runtime_error(format("unknown tensor not expected in split files: %s", tensor_name)); + } + while (!(*out_tensor_metadata) && delayed_loaded < delayed_files.size()) { + // +1 because first split is expected to have been already loaded (not delayed) + _load_split(ml, delayed_loaded + 1); + *out_tensor_metadata = ml.get_tensor_meta(tensor_name); + delayed_loaded++; + if (*out_tensor_metadata) { + LLAMA_LOG_CMAKE_DEBUG("%s: tensor %s found in file %zu\n", __func__, tensor_name, delayed_loaded); + } + if (delayed_loaded == delayed_files.size() && ml.weights_map.size() != expected_n_tensors()) { + throw std::runtime_error( + format("finished incrementally loading all splits but expected %zu tensors, got %zu", + expected_n_tensors(), ml.weights_map.size())); + } + } + uint16_t split_idx = get_split_idx_for_tensor(tensor_name); + + // Mark tensor as loaded and increment split's loaded count + auto tensor_it 
= tensor_info.find(tensor_name); + if (!tensor_it->second.is_loaded) { + tensor_it->second.is_loaded = true; + split_info[split_idx].loaded_tensor_count++; + } + + return split_idx; +} + +uint16_t IncrementalSplitsTensorLoad::get_split_idx_for_tensor(const char * tensor_name) const { + return _get_tensor_info_iterator(tensor_name)->second.split_idx; +} + +std::size_t IncrementalSplitsTensorLoad::get_split_data_size(uint16_t split_idx) const { + return _get_split_info_iterator(split_idx)->second.data_size; +} + +void IncrementalSplitsTensorLoad::print_currently_known_tensors() const { + LLAMA_LOG_INFO("Current incremental loaded tensors:\n"); + for (const auto & it : tensor_info) { + LLAMA_LOG_INFO("Tensor '%s' in split %d (loaded: %s)\n", it.first.c_str(), it.second.split_idx, + it.second.is_loaded ? "yes" : "no"); + } +} + +bool IncrementalSplitsTensorLoad::all_tensors_are_loaded(uint16_t split_idx) const { + auto it = _get_split_info_iterator(split_idx); + const SplitInfo & split = it->second; + LLAMA_LOG_CMAKE_DEBUG("Loaded tensor count for split %d: %u/%u\n", split_idx, split.loaded_tensor_count, + split.total_tensor_count); + return split.all_tensors_loaded(); +} + +std::size_t IncrementalSplitsTensorLoad::expected_n_tensors() { + return expected_tensors.size(); +} + +void IncrementalSplitsTensorLoad::release_split(struct llama_model_loader & ml, uint16_t split_idx) { + // Let destructor of the smart pointer do the release of memory + ml.files[split_idx] = nullptr; +} + +std::map::const_iterator +IncrementalSplitsTensorLoad::_get_tensor_info_iterator(const char * tensor_name) const { + auto it = tensor_info.find(tensor_name); + if (it == tensor_info.end()) { + throw std::runtime_error(format("tensor '%s' not found in tensor_info map", tensor_name)); + } + return it; +} + +std::map::const_iterator +IncrementalSplitsTensorLoad::_get_split_info_iterator(uint16_t split_idx) const { + auto it = split_info.find(split_idx); + if (it == split_info.end()) { + throw std::runtime_error(format("split index %d not found in split_info map", split_idx)); + } + return it; +} + +bool IncrementalSplitsTensorLoad::SplitInfo::all_tensors_loaded() const { + return loaded_tensor_count >= total_tensor_count; +} + +bool IncrementalSplitsTensorLoad::tensor_ignored(const std::optional & splits_tensor_load, + const char * tensor_name) { + return !splits_tensor_load.has_value() || + (splits_tensor_load.has_value() && + splits_tensor_load->expected_tensors.find(tensor_name) == splits_tensor_load->expected_tensors.end()); +} diff --git a/src/llama-model-load.h b/src/llama-model-load.h new file mode 100644 index 0000000000000..1abc2053de4af --- /dev/null +++ b/src/llama-model-load.h @@ -0,0 +1,147 @@ +#pragma once + +#include +#include + +#include "ggml-cpp.h" +#include "llama-mmap.h" +#include "llama-model-load-input.h" + +struct llama_model_loader; + +/// @brief Immediately loads and stores relevant data in the struct fields. +struct gguf_file_load { + struct gguf_init_params params; + gguf_context_ptr meta; + std::unique_ptr file = nullptr; + + gguf_file_load(struct ggml_context ** ctx, load_input_t load_input); +}; + +/// @brief Stores relevant information to be able to loads a `.gguf` split file when load method is called. 
+struct SplitLoad { + load_input_t load_input; + load_input_variant::fname_load_input base_split; + uint16_t idx; + std::string kv_split_no; + bool loaded = false; + + SplitLoad(load_input_t & load_input, load_input_variant::fname_load_input base_split, uint16_t idx, + std::string kv_split_no); + + static gguf_file_load load_split_gguf(struct ggml_context ** ctx, const char * fname_split, + load_input_t & load_input, std::vector & splits); + + struct ggml_context * load(struct llama_model_loader & ml); +}; + +/// @brief Handles incremental loading of tensors and split files. +/// @note The first split file is expected to be already available at construction; the remaining split files are +/// incrementally loaded on demand by calling `load_tensor_metadata` +struct IncrementalSplitsTensorLoad { + IncrementalSplitsTensorLoad(struct ggml_context * ctx, struct llama_model_loader & ml, gguf_file_load & base_split, + std::set tensor_list); + + void add_split(SplitLoad splitLoad); + + /// @brief Incrementally loads file splits until the tensor metadata is found. + /// Also increments the loaded tensor count so that `all_tensors_are_loaded` returns true + /// when all tensors in a file split have been requested. + /// @returns Split idx where the tensor was found + /// @throw runtime_error if the tensor was not found + uint16_t load_tensor_metadata(struct llama_model_loader & ml, const char * tensor_name, + ggml_tensor ** out_tensor_metadata); + + /// @returns True if all tensors of a split have been loaded. + bool all_tensors_are_loaded(uint16_t split_idx) const; + + /// @returns Max number of tensors as described in the summary tensor-list file. + std::size_t expected_n_tensors(); + + /// @brief Release file memory for a split. + static void release_split(struct llama_model_loader & ml, uint16_t split_idx); + + void print_currently_known_tensors() const; + + uint16_t get_split_idx_for_tensor(const char * tensor_name) const; + + std::size_t get_split_data_size(uint16_t split_idx) const; + + static bool tensor_ignored(const std::optional & splits_tensor_load, + const char * tensor_name); + + /// @brief Lazily get/allocate a context with enough capacity for all tensors of + /// the same type in an individual split. The context can be used to instantiate the + /// final model tensors and attach backend buffers to them. + /// @tparam impl The model implementation type where the context will be stored.
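+ /// @return An existing context for the (buft, split) pair, or a freshly created no-alloc context sized for the split's tensor count and registered in `ctx_split_map`.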
+ template + ggml_context * get_model_ctx_for_split_buft(ggml_backend_buffer_type_t buft, uint16_t split, impl * model_impl) { + auto key = std::make_pair(buft, split); + auto it = ctx_split_map.find(key); + if (it == ctx_split_map.end()) { + LLAMA_LOG_CMAKE_DEBUG("%s: creating context for split %d (buft=%s, existing=%zu)\n", __func__, split, + ggml_backend_buft_name(buft), ctx_split_map.size()); + + const size_t max_n_tensors = _get_split_info_iterator(split)->second.total_tensor_count; + const size_t ctx_size = ggml_tensor_overhead() * max_n_tensors; + + ggml_init_params params = { + /*.mem_size =*/ctx_size, + /*.mem_buffer =*/NULL, + /*.no_alloc =*/true, + }; + + ggml_context * ctx = ggml_init(params); + if (!ctx) { + throw std::runtime_error("failed to create ggml context for split-file"); + } + + ctx_split_map[key] = ctx; + model_impl->ctxs.emplace_back(ctx); + + return ctx; + } + return it->second; + } + + // public so that it can be processed by the backend storage allocator + std::map, ggml_context *> ctx_split_map; + + private: + struct TensorInfo { + uint16_t split_idx = 0; + bool is_loaded = false; + }; + + struct SplitInfo { + uint32_t total_tensor_count = 0, loaded_tensor_count = 0; + + /// @brief Total ggml tensor data size of this split + std::size_t data_size = 0; + + bool all_tensors_loaded() const; + }; + + void _load_split(struct llama_model_loader & ml, uint16_t idx); + void _process_split(const struct ggml_context * ctx, struct llama_model_loader & ml, uint16_t idx); + + /// @brief Get tensor info iterator or throw if not found + /// @throw runtime_error if tensor not found + std::map::const_iterator _get_tensor_info_iterator(const char * tensor_name) const; + + /// @brief Get split info iterator or throw if not found + /// @throw runtime_error if split not found + std::map::const_iterator _get_split_info_iterator(uint16_t split_idx) const; + + std::map tensor_info; + std::map split_info; + + /// @brief Number of delayed files that have been loaded + std::size_t delayed_loaded = 0; + + /// @brief Vector of split files to be loaded on demand + std::vector delayed_files; + + /// @brief Set of expected tensor names loaded from tensor list file + std::set expected_tensors; +}; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index bd9e6da8832b7..29eed9f96f1c8 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -1,11 +1,15 @@ #include "llama-model-loader.h" #include "ggml.h" +#include "llama-mmap.h" +#include "llama-model-load.h" #include #include +#include #include #include +#include static const size_t kiB = 1024; static const size_t MiB = 1024*kiB; @@ -463,11 +467,35 @@ namespace GGUFMeta { // TODO: this is not very clever - figure out something better template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); - template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); + template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, + std::array & result, + uint32_t n, bool required); + + // Save tensors data offset of the main file. + // For subsidiary files, `meta` tensor data offset must not be used, + // so we build a unified tensors index for weights. 
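+ // Called once for the main file and once per split; takes ownership of the GGUF's llama_file and indexes its tensors into weights_map.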
+ void llama_model_loader::process_loaded_gguf(struct ggml_context * ctx, gguf_file_load & gguf_load, uint16_t idx) { + contexts.emplace_back(ctx); + files.emplace_back(std::move(gguf_load.file)); + llama_file * raw_file_ptr = files.back().get(); + + // Save tensors data offset info of the shard. + for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + std::string tensor_name = std::string(cur->name); + LLAMA_LOG_CMAKE_DEBUG("%s: loaded tensor %s at split %d\n", __func__, tensor_name.c_str(), idx); + // make sure there are no duplicated tensor names + if (weights_map.find(tensor_name) != weights_map.end()) { + throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); + } + n_elements += ggml_nelements(cur); + n_bytes += ggml_nbytes(cur); + weights_map.emplace(tensor_name, + llama_model_loader::llama_tensor_weight(raw_file_ptr, idx, gguf_load.meta.get(), cur)); + } + } llama_model_loader::llama_model_loader( - const std::string & fname, - std::vector & splits, + load_input_t load_input, bool use_mmap, bool check_tensors, const llama_model_kv_override * param_overrides_p, @@ -485,58 +513,46 @@ llama_model_loader::llama_model_loader( tensor_buft_overrides = param_tensor_buft_overrides_p; - // Load the main GGUF + std::optional> tensor_list = load_input_variant::parse_tensor_list_from_future(load_input); + struct ggml_context * ctx = NULL; - struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &ctx, - }; + gguf_file_load main_gguf(&ctx, load_input); - meta.reset(gguf_init_from_file(fname.c_str(), params)); - if (!meta) { - throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str())); + if (load_input_variant::variant_supports_split_load_from_memory(load_input)) { + incremental_splits_tensor_load.emplace(ctx, *this, main_gguf, std::move(*tensor_list)); + } else { + process_loaded_gguf(ctx, main_gguf, 0); } + meta = std::move(main_gguf.meta); + get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); - files.emplace_back(new llama_file(fname.c_str(), "rb")); - contexts.emplace_back(ctx); - - // Save tensors data offset of the main file. - // For subsidiary files, `meta` tensor data offset must not be used, - // so we build a unified tensors index for weights.
- for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { - std::string tensor_name = std::string(cur->name); - // make sure there is no duplicated tensor names - if (weights_map.find(tensor_name) != weights_map.end()) { - throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); - } - n_elements += ggml_nelements(cur); - n_bytes += ggml_nbytes(cur); - weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur)); - } uint16_t n_split = 0; get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false); // Load additional GGML contexts - if (n_split > 1) { + if (load_input_variant::variant_supports_split_load(load_input) && n_split > 1) { + + load_input_variant::fname_load_input base_split = load_input_variant::split_name_from_variant(load_input); + // make sure the main file is loaded first uint16_t idx = 0; const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO); get_key(kv_split_no, idx); if (idx != 0) { - throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str())); + throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, base_split.fname.c_str())); } // generate list of splits if needed - if (splits.empty()) { - splits = llama_get_list_splits(fname, idx, n_split); + if (base_split.splits.empty()) { + base_split.splits = llama_get_list_splits(base_split.fname, idx, n_split); } // in case user give a custom list of splits, check if it matches the expected number - if (n_split != (uint16_t)splits.size()) { - throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split)); + if (n_split != (uint16_t)base_split.splits.size()) { + throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", base_split.splits.size(), n_split)); } if (trace > 0) { @@ -545,49 +561,20 @@ llama_model_loader::llama_model_loader( // load other splits for (idx = 1; idx < n_split; idx++) { - const char * fname_split = splits[idx].c_str(); - - struct gguf_init_params split_params = { - /*.no_alloc = */ true, - /*.ctx = */ &ctx, - }; - gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) }; - if (!ctx_gguf) { - throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split)); - } + SplitLoad split_load(load_input, base_split, idx, kv_split_no); - // check idx - { - const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str()); - if (kid < 0) { - throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split)); - } - int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid); - if (idx_gguf != idx) { - throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx)); - } + if(incremental_splits_tensor_load.has_value()) { + incremental_splits_tensor_load->add_split(std::move(split_load)); } - - files.emplace_back(new llama_file(fname_split, "rb")); - contexts.emplace_back(ctx); - - // Save tensors data offset info of the shard. 
- for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { - std::string tensor_name = std::string(cur->name); - // make sure there is no duplicated tensor names - if (weights_map.find(tensor_name) != weights_map.end()) { - throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); - } - n_elements += ggml_nelements(cur); - n_bytes += ggml_nbytes(cur); - weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur)); + else { + split_load.load(*this); } } get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); - // sanity check - { + // sanity check (the incremental loader does the check after loading the last split) + if(!incremental_splits_tensor_load.has_value()) { const int n_tensors_loaded = (int) weights_map.size(); if (n_tensors != n_tensors_loaded) { throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded)); @@ -598,16 +585,22 @@ llama_model_loader::llama_model_loader( } n_kv = gguf_get_n_kv(meta.get()); - n_tensors = weights_map.size(); + if (incremental_splits_tensor_load.has_value()) { + n_tensors = incremental_splits_tensor_load->expected_n_tensors(); + LLAMA_LOG_CMAKE_DEBUG("%s: n_tensors (expected from summary list): %d\n", __func__, n_tensors); + } else { + n_tensors = weights_map.size(); + LLAMA_LOG_CMAKE_DEBUG("%s: exact n_tensors: %d\n", __func__, n_tensors); + } fver = (enum llama_fver) gguf_get_version(meta.get()); LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", - __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver)); + __func__, n_kv, n_tensors, load_input_variant::identifier(load_input), llama_file_version_name(fver)); // determine file type based on the number of tensors for each quantization and print meta data // TODO: make optional - { + if(!incremental_splits_tensor_load.has_value()) { std::map n_type; uint32_t n_type_max = 0; @@ -915,12 +908,9 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const { } } -bool llama_model_loader::load_all_data( - struct ggml_context * ctx, - llama_buf_map & bufs, - llama_mlocks * lmlocks, - llama_progress_callback progress_callback, - void * progress_callback_user_data) { +bool llama_model_loader::load_all_data(size_t size_data, struct ggml_context * ctx, llama_buf_map & bufs, + llama_mlocks * lmlocks, llama_progress_callback progress_callback, + void * progress_callback_user_data) { GGML_ASSERT(size_data != 0 && "call init_mappings() first"); std::vector> read_buf; @@ -1060,6 +1050,12 @@ bool llama_model_loader::load_all_data( } } else { const auto & file = files.at(weight->idx); + if (file == nullptr) { + throw std::runtime_error( + format("file not found for tensor '%s' at split-index %d", ggml_get_name(cur), weight->idx)); + } + LLAMA_LOG_CMAKE_DEBUG("%s: uploading tensor %s from file at split-index %d\n", __func__, ggml_get_name(cur), + weight->idx); if (ggml_backend_buffer_is_host(cur->buffer)) { file->seek(weight->offs, SEEK_SET); file->read_raw(cur->data, n_size); diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index 0f52b011b6986..605a9784bcd03 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -5,6 +5,7 @@ #include "llama-impl.h" #include "llama-arch.h" #include "llama-mmap.h" +#include "llama-model-load.h" #include "ggml-cpp.h" @@ -78,6 +79,9 @@ struct llama_model_loader { llama_mmaps mappings; std::map weights_map; + + 
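+ /// @brief Engaged only when split files are loaded incrementally from in-memory buffers.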
std::optional incremental_splits_tensor_load; + std::unordered_map kv_overrides; const llama_model_tensor_buft_override * tensor_buft_overrides; @@ -91,9 +95,10 @@ struct llama_model_loader { size_t size_data = 0; std::vector> mmaps_used; + void process_loaded_gguf(struct ggml_context * ctx, gguf_file_load & gguf_load, uint16_t idx); + llama_model_loader( - const std::string & fname, - std::vector & splits, // optional, only need if the split does not follow naming scheme + load_input_t load_input, bool use_mmap, bool check_tensors, const llama_model_kv_override * param_overrides_p, @@ -156,12 +161,8 @@ struct llama_model_loader { void load_data_for(struct ggml_tensor * cur) const; // Returns false if cancelled by progress_callback - bool load_all_data( - struct ggml_context * ctx, - llama_buf_map & bufs, - llama_mlocks * lmlocks, - llama_progress_callback progress_callback, - void * progress_callback_user_data); + bool load_all_data(size_t size_data, struct ggml_context * ctx, llama_buf_map & bufs, llama_mlocks * lmlocks, + llama_progress_callback progress_callback, void * progress_callback_user_data); std::string ftype_name() const; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 9b19da984081e..700dfc6567850 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -17,10 +17,10 @@ #include #include #include +#include #include #include #include -#include #include #include #include @@ -1643,9 +1643,19 @@ bool llama_model::load_tensors(llama_model_loader & ml) { ggml_backend_buffer_type_t first_moved_to_buft = nullptr; auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list & ne, int flags) -> ggml_tensor * { - ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str()); - + const std::string& tensor_name = tn.str(); + ggml_tensor * t_meta = ml.get_tensor_meta(tensor_name.c_str()); + std::optional split_idx; + if (!t_meta && (flags & TENSOR_NOT_REQUIRED) && + IncrementalSplitsTensorLoad::tensor_ignored(ml.incremental_splits_tensor_load, tensor_name.c_str())) { + return nullptr; + } + if (ml.incremental_splits_tensor_load.has_value()) { + split_idx = ml.incremental_splits_tensor_load->load_tensor_metadata(ml, tn.str().c_str(), &t_meta); + LLAMA_LOG_CMAKE_DEBUG("split idx for tensor %s: %d\n", tn.str().c_str(), *split_idx); + } if (!t_meta) { + LLAMA_LOG_ERROR("%s: missing tensor %s\n", __func__, tn.str().c_str()); if (flags & TENSOR_NOT_REQUIRED) { return nullptr; } @@ -1758,16 +1768,32 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } - ggml_context * ctx = ctx_for_buft(buft); + ggml_context * ctx = + split_idx.has_value() ? + ml.incremental_splits_tensor_load->get_model_ctx_for_split_buft(buft, *split_idx, pimpl.get()) : + ctx_for_buft(buft); // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one if (flags & TENSOR_DUPLICATED) { - ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str()); + auto tn_str = tn.str(); + ggml_tensor * t = ggml_get_tensor(ctx, tn_str.c_str()); if (t) { return t; } + LLAMA_LOG_WARN("%s: duplicated tensor %s not found on existing context\n", __func__, tn_str.c_str()); + } + struct ggml_tensor * tensor = ml.create_tensor(ctx, tn, ne, flags); + + if (split_idx.has_value() && ml.incremental_splits_tensor_load->all_tensors_are_loaded(*split_idx)) { + // Upload right now.
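+ // All tensors of this split have been created, so its backend buffers can be allocated, the data uploaded, and the split's file buffer released.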
+ if (!create_split_backend_buffers(*split_idx, ml.incremental_splits_tensor_load->ctx_split_map, ml, + use_mmap_buffer, use_mlock, n_gpu_layers)) { + throw std::runtime_error("Failed to create incremental backend buffers"); + } + IncrementalSplitsTensorLoad::release_split(ml, *split_idx); } - return ml.create_tensor(ctx, tn, ne, flags); + + return tensor; }; layers.resize(n_layer); @@ -4285,9 +4311,43 @@ bool llama_model::load_tensors(llama_model_loader & ml) { ml.done_getting_tensors(); + if (ml.incremental_splits_tensor_load.has_value()) { + // Already did incremental load. + print_backend_buffers_info(n_gpu_layers); + return true; + } + ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr); pimpl->mappings.reserve(ml.mappings.size()); + return create_backend_buffers(ml.size_data, ctx_map, ml, use_mmap_buffer, use_mlock, n_gpu_layers); +} + +bool llama_model::create_split_backend_buffers( + const uint16_t idx, std::map, ggml_context *> & ctx_split_map, + llama_model_loader & ml, const bool use_mmap_buffer, const bool use_mlock, const int32_t n_gpu_layers) { + // Extract contexts for the given split index from ctx_split_map into a new map + std::map ctx_map; + for (const auto & [buft_split_idx, ctx] : ctx_split_map) { + const auto & [buft, split_idx] = buft_split_idx; + if (split_idx == idx) { + ctx_map[buft] = ctx; + } + } + + const std::size_t split_data_size = ml.incremental_splits_tensor_load->get_split_data_size(idx); + LLAMA_LOG_CMAKE_DEBUG("%s: creating backend buffers for split %d with size %zu\n", __func__, idx, split_data_size); + constexpr bool do_print_backend_buffers_info = false; + const bool creation_success = create_backend_buffers(split_data_size, ctx_map, ml, use_mmap_buffer, use_mlock, + n_gpu_layers, do_print_backend_buffers_info); + + return creation_success; +} + +bool llama_model::create_backend_buffers(std::size_t size_data, + const std::map & ctx_map, + llama_model_loader & ml, const bool use_mmap_buffer, const bool use_mlock, + const int32_t n_gpu_layers, bool do_print_backend_buffers_info) { // create the backend buffers std::vector> ctx_bufs; ctx_bufs.reserve(ctx_map.size()); @@ -4296,7 +4356,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size(); pimpl->bufs.reserve(n_max_backend_buffer); - for (auto & it : ctx_map) { + for (const auto & it : ctx_map) { ggml_backend_buffer_type_t buft = it.first; ggml_context * ctx = it.second; @@ -4372,23 +4432,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { ctx_bufs.emplace_back(ctx, buf_map); } - if (llama_supports_gpu_offload()) { - const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); - - LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); - if (n_gpu_layers > (int) hparams.n_layer) { - LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__); - } - - const int max_backend_supported_layers = hparams.n_layer + 1; - const int max_offloadable_layers = hparams.n_layer + 1; - - LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); - } - - // print memory requirements per buffer type - for (auto & buf : pimpl->bufs) { - LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0); + if(do_print_backend_buffers_info) { + print_backend_buffers_info(n_gpu_layers); } // populate tensors_by_name 
@@ -4402,7 +4447,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { for (auto & it : ctx_bufs) { ggml_context * ctx = it.first; auto & bufs = it.second; - if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) { + if (!ml.load_all_data(size_data, ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) { return false; } } @@ -4416,6 +4461,29 @@ bool llama_model::load_tensors(llama_model_loader & ml) { return true; } +void llama_model::print_backend_buffers_info(const int32_t n_gpu_layers) { + if (llama_supports_gpu_offload()) { + const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); + + LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); + if (n_gpu_layers > (int) hparams.n_layer) { + LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__); + } + + const int max_backend_supported_layers = hparams.n_layer + 1; + const int max_offloadable_layers = hparams.n_layer + 1; + + LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), + max_backend_supported_layers); + } + + // print memory requirements per buffer type + for (auto & buf : pimpl->bufs) { + LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), + ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0); + } +} + std::string llama_model::arch_name() const { return llm_arch_name(arch); } diff --git a/src/llama-model.h b/src/llama-model.h index 06e6c687943cc..98ba0d29da2d3 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -7,10 +7,12 @@ #include "llama-memory.h" #include "llama-vocab.h" +#include #include #include #include #include +#include struct llama_cparams; struct llama_ubatch; @@ -373,6 +375,19 @@ struct llama_model { explicit llama_model(const struct llama_model_params & params); ~llama_model(); + /// @brief Create backend buffers for all tensors + bool create_backend_buffers(std::size_t size_data, + const std::map & ctx_map, + llama_model_loader & ml, bool use_mmap_buffer, bool use_mlock, int32_t n_gpu_layers, + bool do_print_backend_buffers_info = true); + + /// @brief Create backend buffers for the tensors of a split file identified by `idx`. Removes the split from the map.
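+ /// @note Only used by the incremental split loader; the buffer size is taken from the split's accumulated tensor data size.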
+ bool create_split_backend_buffers( + uint16_t idx, std::map, ggml_context *> & ctx_split_map, + llama_model_loader & ml, bool use_mmap_buffer, bool use_mlock, int32_t n_gpu_layers); + + void print_backend_buffers_info(int32_t n_gpu_layers); + void load_stats (llama_model_loader & ml); void load_arch (llama_model_loader & ml); void load_hparams(llama_model_loader & ml); diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 43229e1938597..0cb6ebe238ef6 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -583,7 +583,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } std::vector splits = {}; - llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr); + load_input_variant::fname_load_input inp{fname_inp, splits}; + llama_model_loader ml(inp, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr); ml.init_mappings(false); // no prefetching llama_model model(llama_model_default_params()); diff --git a/src/llama.cpp b/src/llama.cpp index 34906cdb62844..3fe4e8f7e5013 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9,6 +9,7 @@ #include "ggml.h" #include "ggml-backend.h" +#include "uint8-buff-stream.h" #include #include @@ -16,11 +17,17 @@ #include #include #include +#include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif +#ifdef __cplusplus +#include "llama-cpp.h" +#endif + // // interface implementation // @@ -84,7 +91,7 @@ int64_t llama_time_us(void) { } // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback -static int llama_model_load(const std::string & fname, std::vector & splits, llama_model & model, llama_model_params & params) { +static int llama_model_load(llama_model_loader & ml, llama_model & model, llama_model_params & params) { // loading time will be recalculated after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = 0; @@ -93,8 +100,6 @@ static int llama_model_load(const std::string & fname, std::vector model.t_start_us = tm.t_start_us; try { - llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides); - ml.print_info(); model.hparams.vocab_only = params.vocab_only; @@ -135,8 +140,7 @@ static int llama_model_load(const std::string & fname, std::vector } static struct llama_model * llama_model_load_from_file_impl( - const std::string & path_model, - std::vector & splits, + llama_model_loader& ml, struct llama_model_params params) { ggml_time_init(); @@ -218,7 +222,7 @@ static struct llama_model * llama_model_load_from_file_impl( LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024); } - const int status = llama_model_load(path_model, splits, *model, params); + const int status = llama_model_load(ml, *model, params); GGML_ASSERT(status <= 0); if (status < 0) { if (status == -1) { @@ -241,26 +245,80 @@ struct llama_model * llama_load_model_from_file( return llama_model_load_from_file(path_model, params); } -struct llama_model * llama_model_load_from_file( - const char * path_model, - struct llama_model_params params) { +static llama_model_loader create_disk_fileloader(const char * path_model, std::vector & splits, + struct llama_model_params params) { + load_input_variant::fname_load_input loader_input{ path_model, splits }; + return llama_model_loader(loader_input, params.use_mmap, 
params.check_tensors, params.kv_overrides, + params.tensor_buft_overrides); +} + +struct llama_model * llama_model_load_from_file(const char * path_model, struct llama_model_params params) { std::vector splits = {}; - return llama_model_load_from_file_impl(path_model, splits, params); + llama_model_loader ml = create_disk_fileloader(path_model, splits, params); + return llama_model_load_from_file_impl(ml, params); } -struct llama_model * llama_model_load_from_splits( - const char ** paths, - size_t n_paths, - struct llama_model_params params) { +namespace { +void override_and_disable_mmap(struct llama_model_params & params) { + if (params.use_mmap) { + LLAMA_LOG_WARN("Overriding and disabling memory mapping when loading from memory buffer\n"); + params.use_mmap = false; + } +} +} // namespace + +struct llama_model * llama_model_load_from_buffer(std::vector && data, struct llama_model_params params) { + std::unique_ptr> streambuf = std::make_unique(std::move(data)); + override_and_disable_mmap(params); + llama_model_loader ml(load_input_variant::buffer_load_input{ streambuf }, params.use_mmap, params.check_tensors, + params.kv_overrides, params.tensor_buft_overrides); + return llama_model_load_from_file_impl(ml, params); +} + +namespace { +std::vector splits_from_c_paths(const char ** paths, size_t n_paths) { std::vector splits; if (n_paths == 0) { LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__); - return nullptr; + return splits; } for (size_t i = 0; i < n_paths; ++i) { splits.push_back(paths[i]); } - return llama_model_load_from_file_impl(splits.front(), splits, params); + return splits; +} +} // namespace + +struct llama_model * llama_model_load_from_splits(const char ** paths, size_t n_paths, + struct llama_model_params params) { + std::vector splits = splits_from_c_paths(paths, n_paths); + if (splits.empty()) { + return nullptr; + } + llama_model_loader ml = create_disk_fileloader(splits.front().c_str(), splits, params); + return llama_model_load_from_file_impl(ml, params); +} + +struct llama_model * llama_model_load_from_split_futures(const char ** paths, size_t n_paths, const char * context, + const char * tensor_list_file, + struct llama_model_params params) { + std::vector splits = splits_from_c_paths(paths, n_paths); + if (splits.empty()) { + return nullptr; + } + std::string tensor_list_file_str(tensor_list_file); + + load_input_variant::buffer_future_load_input loader_input{ splits.front(), context, splits, tensor_list_file_str }; + override_and_disable_mmap(params); + llama_model_loader ml(loader_input, params.use_mmap, params.check_tensors, params.kv_overrides, + params.tensor_buft_overrides); + return llama_model_load_from_file_impl(ml, params); +} + +bool llama_model_load_fulfill_split_future(const char * path, const char * context, + std::unique_ptr> && streambuf) { + return llama_future_file_buffer_ro::fulfill_promise(path, context, + std::make_unique(std::move(streambuf))); } void llama_model_save_to_file(const struct llama_model * model, const char * path_model) { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index fc1557a2d4065..cb7ebae0a6bae 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -8,7 +8,7 @@ function(llama_build source) endif() add_executable(${TEST_TARGET} ${source}) - target_link_libraries(${TEST_TARGET} PRIVATE common) + target_link_libraries(${TEST_TARGET} PRIVATE common llama-common-test) install(TARGETS ${TEST_TARGET} RUNTIME) endfunction() @@ -97,7 +97,7 @@ function(llama_build_and_test source) 
add_executable(${TEST_TARGET} ${source} get-model.cpp) install(TARGETS ${TEST_TARGET} RUNTIME) - target_link_libraries(${TEST_TARGET} PRIVATE common) + target_link_libraries(${TEST_TARGET} PRIVATE common llama-common-test) add_test( NAME ${TEST_TARGET} @@ -197,6 +197,9 @@ llama_build_and_test(test-gguf.cpp) llama_build_and_test(test-backend-ops.cpp) llama_build_and_test(test-model-load-cancel.cpp LABEL "model") +llama_build_and_test(test-model-load-disk.cpp LABEL "model") +llama_build_and_test(test-model-load-memory.cpp LABEL "model") +llama_build_and_test(test-model-load-memory-split.cpp LABEL "model") llama_build_and_test(test-autorelease.cpp LABEL "model") if (NOT GGML_BACKEND_DL) diff --git a/tests/test-model-load-disk.cpp b/tests/test-model-load-disk.cpp new file mode 100644 index 0000000000000..3310681200c0f --- /dev/null +++ b/tests/test-model-load-disk.cpp @@ -0,0 +1,41 @@ +#include + +#include "get-model.h" +#include "llama.h" + +int main(int argc, char * argv[]) { + auto * model_path = get_model_or_exit(argc, argv); + auto * file = fopen(model_path, "r"); + if (file == nullptr) { + fprintf(stderr, "no model at '%s' found\n", model_path); + return EXIT_FAILURE; + } + + fprintf(stderr, "using '%s'\n", model_path); + fclose(file); + + llama_backend_init(); + auto params = llama_model_params{}; + params.use_mmap = false; + params.progress_callback = [](float progress, void * ctx) { + (void) ctx; + fprintf(stderr, "%.2f%% ", progress * 100.0f); + // true means: Don't cancel the load + return true; + }; + auto * model = llama_model_load_from_file(model_path, params); + + // Add newline after progress output + fprintf(stderr, "\n"); + + if (model == nullptr) { + fprintf(stderr, "Failed to load model\n"); + llama_backend_free(); + return EXIT_FAILURE; + } + + fprintf(stderr, "Model loaded successfully\n"); + llama_model_free(model); + llama_backend_free(); + return EXIT_SUCCESS; +} diff --git a/tests/test-model-load-memory-split.cpp b/tests/test-model-load-memory-split.cpp new file mode 100644 index 0000000000000..5b87bcc9c5dbb --- /dev/null +++ b/tests/test-model-load-memory-split.cpp @@ -0,0 +1,74 @@ +#include +#include +#include + +#include "get-model.h" +#include "llama-cpp.h" +#include "load_into_memory.h" + +int main(int argc, char * argv[]) { + auto * model_path = get_model_or_exit(argc, argv); + + if (!is_split_file(model_path)) { + printf("Skipping not-split model %s\n", model_path); + return EXIT_SUCCESS; + } + + // Manually load into a memory buffer first + file_entry tensor_list_file = load_tensor_list_file(model_path); + std::vector files = load_files_into_streambuf(model_path); + + llama_backend_init(); + auto params = llama_model_params{}; + params.use_mmap = false; + params.progress_callback = [](float progress, void * ctx) { + (void) ctx; + fprintf(stderr, "%.2f%% ", progress * 100.0f); + // true means: Don't cancel the load + return true; + }; + + printf("Loading model from %zu files\n", files.size()); + + std::vector file_paths; + for (size_t i = 0; i < files.size(); i++) { + printf("Found file %s \n", files[i].path.c_str()); + file_paths.push_back(files[i].path.c_str()); + } + + const char * async_load_context = "test-model-load"; + std::thread fulfill_thread([&files, &tensor_list_file, &async_load_context]() { + const bool success = llama_model_load_fulfill_split_future(tensor_list_file.path.c_str(), async_load_context, + std::move(tensor_list_file.streambuf)); + printf("Fulfilling tensor list file %s: %s\n", tensor_list_file.path.c_str(), success ? 
"success" : "failure"); + if (!success) { + exit(EXIT_FAILURE); + } + for (size_t i = 0; i < files.size(); i++) { + const bool success = llama_model_load_fulfill_split_future(files[i].path.c_str(), async_load_context, + std::move(files[i].streambuf)); + printf("Fulfilling file %s: %s\n", files[i].path.c_str(), success ? "success" : "failure"); + if (!success) { + exit(EXIT_FAILURE); + } + } + }); + fprintf(stderr, "Loading model from splits\n"); + auto * model = llama_model_load_from_split_futures(file_paths.data(), file_paths.size(), async_load_context, + tensor_list_file.path.c_str(), params); + fulfill_thread.join(); + + fprintf(stderr, "\n"); + + if (model == nullptr) { + fprintf(stderr, "Failed to load model\n"); + llama_backend_free(); + return EXIT_FAILURE; + } + + fprintf(stderr, "Model loaded successfully\n"); + llama_model_free(model); + llama_backend_free(); + + return EXIT_SUCCESS; +} diff --git a/tests/test-model-load-memory.cpp b/tests/test-model-load-memory.cpp new file mode 100644 index 0000000000000..255abb46e499f --- /dev/null +++ b/tests/test-model-load-memory.cpp @@ -0,0 +1,47 @@ +#include +#include +#include + +#include "get-model.h" +#include "llama-cpp.h" +#include "load_into_memory.h" + +int main(int argc, char * argv[]) { + auto * model_path = get_model_or_exit(argc, argv); + + if (is_split_file(model_path)) { + printf("Skipping split model %s\n", model_path); + return EXIT_SUCCESS; + } + + // Manually load into a memory buffer first + std::vector buffer = load_file_into_buffer(model_path); + + llama_backend_init(); + auto params = llama_model_params{}; + params.use_mmap = false; + params.progress_callback = [](float progress, void * ctx) { + (void) ctx; + fprintf(stderr, "%.2f%% ", progress * 100.0f); + // true means: Don't cancel the load + return true; + }; + + // Test that it can load directly from a buffer + printf("Loading model from buffer of size %zu bytes\n", buffer.size()); + auto * model = llama_model_load_from_buffer(std::move(buffer), params); + + // Add newline after progress output + fprintf(stderr, "\n"); + + if (model == nullptr) { + fprintf(stderr, "Failed to load model\n"); + llama_backend_free(); + return EXIT_FAILURE; + } + + fprintf(stderr, "Model loaded successfully\n"); + llama_model_free(model); + llama_backend_free(); + return EXIT_SUCCESS; +} diff --git a/tools/gguf-split/gguf-split.cpp b/tools/gguf-split/gguf-split.cpp index 30e771564e808..5c45940cff4fe 100644 --- a/tools/gguf-split/gguf-split.cpp +++ b/tools/gguf-split/gguf-split.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #if defined(_WIN32) #include @@ -43,6 +44,8 @@ struct split_params { std::string output; bool no_tensor_first_split = false; bool dry_run = false; + bool verbose = false; + std::set must_be_followed_layers; }; static void split_print_usage(const char * executable) { @@ -50,7 +53,8 @@ static void split_print_usage(const char * executable) { printf("\n"); printf("usage: %s [options] GGUF_IN GGUF_OUT\n", executable); printf("\n"); - printf("Apply a GGUF operation on IN to OUT."); + printf("Apply a GGUF operation on IN to OUT.\n"); + printf("When splitting, also creates GGUF_OUT.tensors.txt with all tensor names.\n"); printf("\n"); printf("options:\n"); printf(" -h, --help show this help message and exit\n"); @@ -60,7 +64,9 @@ static void split_print_usage(const char * executable) { printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors); printf(" --split-max-size N(M|G) max size per split\n"); printf(" 
--no-tensor-first-split do not add tensors to the first split (disabled by default)\n"); + printf(" --must-be-followed LAYER ensure LAYER is not the last tensor in a split, so the split holding it is not released during incremental loading before the next tensor is created (can be used multiple times)\n"); printf(" --dry-run only print out a split plan and exit, without writing any new files\n"); + printf(" --verbose show tensor names for each split\n"); printf("\n"); } @@ -106,6 +112,9 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p } else if (arg == "--dry-run") { arg_found = true; params.dry_run = true; + } else if (arg == "--verbose") { + arg_found = true; + params.verbose = true; } else if (arg == "--no-tensor-first-split") { arg_found = true; params.no_tensor_first_split = true; @@ -143,6 +152,13 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p } params.mode = MODE_SIZE; params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]); + } else if (arg == "--must-be-followed") { + if (++arg_idx >= argc) { + invalid_param = true; + break; + } + arg_found = true; + params.must_be_followed_layers.insert(argv[arg_idx]); } if (!arg_found) { @@ -275,7 +291,19 @@ struct split_strategy { } } + bool must_be_followed(int i_tensor) { + if (i_tensor > 0 && i_tensor < n_tensors) { + const char * tensor_name = gguf_get_tensor_name(ctx_gguf, i_tensor); + return params.must_be_followed_layers.find(tensor_name) != params.must_be_followed_layers.end(); + } + return false; + } + bool should_split(int i_tensor, size_t next_size) { + if (must_be_followed(i_tensor) || must_be_followed(i_tensor - 1)) { + return false; + } + if (params.mode == MODE_SIZE) { // split by max size per file return next_size > params.n_bytes_split; @@ -299,10 +327,41 @@ struct split_strategy { } total_size = total_size / 1000 / 1000; // convert to megabytes printf("split %05d: n_tensors = %" PRIi64 ", total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); + + if (params.verbose) { + for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) { + const char * t_name = gguf_get_tensor_name(ctx_out, i); + printf(" - %s\n", t_name); + } + } i_split++; } } + void write_tensor_list() { + // Create a .txt file with all tensor names from all splits + std::string tensor_list_path = params.output + ".tensors.txt"; + std::ofstream tensor_file(tensor_list_path); + if (!tensor_file.is_open()) { + fprintf(stderr, "warning: failed to create tensor list file %s\n", tensor_list_path.c_str()); + return; + } + + printf("Writing tensor list to %s ... ", tensor_list_path.c_str()); + fflush(stdout); + + // Write all tensor names from all splits + for (auto & ctx_out : ctx_outs) { + for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) { + const char * t_name = gguf_get_tensor_name(ctx_out, i); + tensor_file << t_name << "\n"; + } + } + + tensor_file.close(); + printf("done\n"); + } + void write() { int i_split = 0; int n_split = ctx_outs.size(); @@ -382,6 +441,9 @@ static void gguf_split(const split_params & split_params) { int n_split = strategy.ctx_outs.size(); strategy.print_info(); + // Write tensor list file + strategy.write_tensor_list(); + if (!split_params.dry_run) { // write all output splits strategy.write();