
Commit 3ee6016

[fbuffers] Expose async interface
Add functions to the llama.cpp public headers to asynchronously load model shards.
1 parent 69aeda8
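
The commit message is the only prose here, so a usage sketch helps; the one below is inferred purely from the signatures added in this diff, not from repository code. One thread begins the load and blocks on the per-shard futures while a second thread fulfills each shard as its bytes become available (for example, streamed over a network). make_shard_streambuf() is a hypothetical helper and "tensors.txt" an assumed file name; see the streambuf sketch under include/llama-cpp.h below.

// Hedged sketch of the intended call pattern (assumptions noted above).
#include <cstdint>
#include <memory>
#include <streambuf>
#include <string>
#include <thread>
#include <vector>
#include "llama-cpp.h"

// Hypothetical helper: fetch one shard and expose it as a uint8_t streambuf.
std::unique_ptr<std::basic_streambuf<uint8_t>> make_shard_streambuf(const std::string & path);

llama_model * load_model_from_streams(const std::vector<std::string> & shard_paths) {
    const char * ctx = "example-load"; // assumed: distinguishes concurrent loads

    // Fulfill each shard's future as its data arrives. Whether fulfillment
    // may precede the matching load call is not shown by this diff.
    std::thread fulfiller([&] {
        for (const auto & p : shard_paths) {
            llama_model_load_fulfill_split_future(p.c_str(), ctx, make_shard_streambuf(p));
        }
    });

    std::vector<const char *> c_paths;
    c_paths.reserve(shard_paths.size());
    for (const auto & p : shard_paths) {
        c_paths.push_back(p.c_str());
    }

    // Presumably blocks until every shard future above has been fulfilled.
    llama_model_params params = llama_model_default_params();
    llama_model * model = llama_model_load_from_split_futures(
            c_paths.data(), c_paths.size(), ctx, "tensors.txt", params);

    fulfiller.join();
    return model; // nullptr on failure, matching llama_model_load_from_splits
}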

File tree

3 files changed: +30 additions, −0 deletions

include/llama-cpp.h

Lines changed: 2 additions & 0 deletions

@@ -31,3 +31,5 @@ typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
 typedef std::unique_ptr<llama_adapter_lora, llama_adapter_lora_deleter> llama_adapter_lora_ptr;
 
 struct llama_model * llama_model_load_from_buffer(std::vector<uint8_t> && data, struct llama_model_params params);
+bool llama_model_load_fulfill_split_future(const char * path, const char * context,
+                                           std::unique_ptr<std::basic_streambuf<uint8_t>> && streambuf);
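
The new hook takes ownership of a std::basic_streambuf<uint8_t>. The standard library ships no concrete streambuf over uint8_t (its stock ones are char/wchar_t based), so a caller brings its own. Below is a minimal in-memory sketch; shard_membuf is an invented name, and, like the signature above, it assumes the toolchain accepts basic_streambuf instantiated for uint8_t.

#include <cstdint>
#include <streambuf>
#include <utility>
#include <vector>

// Illustrative only: an in-memory uint8_t streambuf that could back a
// fulfilled shard, e.g. std::make_unique<shard_membuf>(std::move(bytes)).
class shard_membuf : public std::basic_streambuf<uint8_t> {
public:
    explicit shard_membuf(std::vector<uint8_t> bytes) : bytes_(std::move(bytes)) {
        uint8_t * base = bytes_.data();
        setg(base, base, base + bytes_.size()); // whole buffer is the get area
    }
private:
    std::vector<uint8_t> bytes_;
};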

include/llama.h

Lines changed: 5 additions & 0 deletions

@@ -456,6 +456,11 @@ extern "C" {
                              size_t n_paths,
              struct llama_model_params params);
 
+    LLAMA_API struct llama_model * llama_model_load_from_split_futures(const char ** paths, size_t n_paths,
+                                                                       const char * context,
+                                                                       const char * tensor_list_file,
+                                                                       struct llama_model_params params);
+
     LLAMA_API void llama_model_save_to_file(
             const struct llama_model * model,
             const char * path_model);
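
Unlike llama_model_load_from_splits() just above it, this variant cannot read the shards from disk at call time, and the extra parameters appear to support that: context seemingly ties the load to the buffers later delivered through llama_model_load_fulfill_split_future (which takes the same path/context pair; see src/llama.cpp below), while tensor_list_file presumably enumerates the expected tensors up front since the shard bytes are not yet available. The diff alone does not confirm either role.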

src/llama.cpp

Lines changed: 23 additions & 0 deletions

@@ -18,6 +18,7 @@
 #include <cstring>
 #include <ctime>
 #include <stdexcept>
+#include <streambuf>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -298,6 +299,28 @@ struct llama_model * llama_model_load_from_splits(const char ** paths, size_t n_
     return llama_model_load_from_file_impl(ml, params);
 }
 
+struct llama_model * llama_model_load_from_split_futures(const char ** paths, size_t n_paths, const char * context,
+                                                         const char * tensor_list_file,
+                                                         struct llama_model_params params) {
+    std::vector<std::string> splits = splits_from_c_paths(paths, n_paths);
+    if (splits.empty()) {
+        return nullptr;
+    }
+    std::string tensor_list_file_str(tensor_list_file);
+
+    load_input_variant::buffer_future_load_input loader_input{ splits.front(), context, splits, tensor_list_file_str };
+    override_and_disable_mmap(params);
+    llama_model_loader ml(loader_input, params.use_mmap, params.check_tensors, params.kv_overrides,
+                          params.tensor_buft_overrides);
+    return llama_model_load_from_file_impl(ml, params);
+}
+
+bool llama_model_load_fulfill_split_future(const char * path, const char * context,
+                                           std::unique_ptr<std::basic_streambuf<uint8_t>> && streambuf) {
+    return llama_future_file_buffer_ro::fulfill_promise(path, context,
+                                                        std::make_unique<llama_file_buffer_ro>(std::move(streambuf)));
+}
+
 void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
     llama_model_saver ms(*model);
     ms.add_kv_from_model();