
QVAC-3697: Load GGUF File From Buffer #1

Open
wants to merge 24 commits into base: temp-load-from-buffer
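
For context, a minimal sketch of calling the single-buffer entry point this PR adds, llama_model_load_from_buffer, mirroring the helper code in common_test/load_into_memory.h further down; the model path is a placeholder, and llama_model_default_params / llama_model_free are assumed from the existing llama.h API.

#include <cstdint>
#include <fstream>
#include <utility>
#include <vector>

#include "llama.h"

int main() {
    // Read the whole GGUF file into memory (placeholder path, no error handling).
    std::ifstream in("model.gguf", std::ios::binary | std::ios::ate);
    std::vector<uint8_t> bytes(static_cast<size_t>(in.tellg()));
    in.seekg(0, std::ios::beg);
    in.read(reinterpret_cast<char *>(bytes.data()), bytes.size());

    // Hand ownership of the bytes to the buffer-loading entry point added by this PR.
    llama_model_params params = llama_model_default_params();
    llama_model * model = llama_model_load_from_buffer(std::move(bytes), params);
    if (model == nullptr) {
        return 1;
    }
    llama_model_free(model);
    return 0;
}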
Changes from all commits (24 commits)
f8942e7
[common] Pure interface for files
jesusmb1995 Jul 16, 2025
ca481ed
[common] Compile time debug logs
jesusmb1995 Jul 29, 2025
cd6f698
[aux] Test full load from disk
jesusmb1995 Jul 16, 2025
67e1868
[aux] GGUF split summary
jesusmb1995 Jul 30, 2025
eb2e355
[aux] gguf tensor must be followed
jesusmb1995 Aug 19, 2025
3ca5a71
[aux] verbose gguf split
jesusmb1995 Aug 19, 2025
1440377
[common] Stream buffer for uint8 data
jesusmb1995 Jul 29, 2025
b6f825d
[mbuffer] Llama file buffer implementation
jesusmb1995 Jul 16, 2025
86da48c
[refactor] C splits into C++
jesusmb1995 Jul 30, 2025
cba0254
[common] GGUF reader from memory
jesusmb1995 Jul 17, 2025
610d73e
[refactor][mbuffer] File load from variant
jesusmb1995 Jul 18, 2025
762c968
[refactor] Process file method
jesusmb1995 Jul 30, 2025
be62aaa
[mbuffer] Expose single-buffer loading to Llama interface
jesusmb1995 Jul 16, 2025
3a0855d
[fbuffers] Future file buffer implementation
jesusmb1995 Jul 30, 2025
85c4d3b
[fbuffers] Incremental loading of future files
jesusmb1995 Jul 30, 2025
bd60c89
[refactor] Create backend buffers
jesusmb1995 Jul 30, 2025
0561525
[refactor] Load all data
jesusmb1995 Jul 30, 2025
77cef5b
[fbuffers] Incremental model load
jesusmb1995 Jul 30, 2025
ff882fe
[fbuffers] Expose async interface
jesusmb1995 Jul 30, 2025
dab6554
[refactor] Increase common loading granularity
jesusmb1995 Jul 18, 2025
f5902e8
[aux] Common test
jesusmb1995 Jul 30, 2025
425e192
[aux] Memory example (embedding)
jesusmb1995 Jul 30, 2025
f0e7125
[aux] Memory example (simple)
jesusmb1995 Jul 30, 2025
cd1b485
[aux] Auto. memory loading tests
jesusmb1995 Jul 30, 2025
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -193,6 +193,10 @@ if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
endif()

if(LLAMA_BUILD_EXAMPLES OR LLAMA_BUILD_TESTS)
add_subdirectory(common_test)
endif()

if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest)
add_subdirectory(tests)
23 changes: 14 additions & 9 deletions common/common.cpp
@@ -899,15 +899,7 @@ std::string fs_get_cache_file(const std::string & filename) {
// Model utils
//

struct common_init_result common_init_from_params(common_params & params) {
common_init_result iparams;
auto mparams = common_model_params_to_llama(params);

llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
if (model == NULL) {
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
return iparams;
}
struct common_init_result common_init_from_model_and_params(llama_model* model, common_init_result iparams, common_params & params) {

const llama_vocab * vocab = llama_model_get_vocab(model);

@@ -1068,6 +1060,19 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}

struct common_init_result common_init_from_params(common_params & params) {
common_init_result iparams;
auto mparams = common_model_params_to_llama(params);

llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
if (model == NULL) {
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
return iparams;
}

return common_init_from_model_and_params(model, std::move(iparams), params);
}

std::string get_model_endpoint() {
const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
// We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
2 changes: 2 additions & 0 deletions common/common.h
@@ -551,6 +551,8 @@ struct common_init_result {
};

struct common_init_result common_init_from_params(common_params & params);
struct common_init_result common_init_from_model_and_params(llama_model * model, common_init_result iparams,
common_params & params);

struct llama_model_params common_model_params_to_llama ( common_params & params);
struct llama_context_params common_context_params_to_llama(const common_params & params);
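
The split above lets a caller that already holds a llama_model (for instance, one created from a memory buffer) reuse the shared context setup; a minimal sketch of that call shape, where the helper name is hypothetical (the embedding example further down does the same thing inline):

#include <utility>

#include "common.h"
#include "llama.h"

// Hypothetical helper: finish initialization for a model obtained by other means.
static common_init_result init_from_preloaded_model(llama_model * model, common_params & params) {
    common_init_result iparams; // starts empty, as in common_init_from_params
    if (model == nullptr) {
        return iparams;         // mirror the failure path of common_init_from_params
    }
    return common_init_from_model_and_params(model, std::move(iparams), params);
}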
15 changes: 15 additions & 0 deletions common_test/CMakeLists.txt
@@ -0,0 +1,15 @@
# common_test library for load_into_memory.h and uint8-buff-stream.h

set(TARGET llama-common-test)

add_library(${TARGET} INTERFACE)

target_include_directories(${TARGET} INTERFACE
${CMAKE_CURRENT_SOURCE_DIR}
)

target_compile_definitions(${TARGET} INTERFACE LLAMA_COMMON_TEST_HEADERS)

target_compile_features(${TARGET} INTERFACE cxx_std_17)

target_link_libraries(${TARGET} INTERFACE common)
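
Consumers link the llama-common-test interface target and guard their include on the LLAMA_COMMON_TEST_HEADERS definition it propagates; a short sketch of that pattern, which the embedding and simple examples below follow:

// Sketch of a consuming translation unit; LLAMA_COMMON_TEST_HEADERS is defined
// only when the llama-common-test target is linked.
#ifdef LLAMA_COMMON_TEST_HEADERS
#    include "load_into_memory.h"
#endif

int main() {
#ifdef LLAMA_COMMON_TEST_HEADERS
    if (memory_configuration_env_is_set()) {
        // The memory-based loading helpers are available here.
    }
#endif
    return 0;
}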
220 changes: 220 additions & 0 deletions common_test/load_into_memory.h
@@ -0,0 +1,220 @@
#pragma once

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <memory>
#include <sstream>
#include <streambuf>
#include <string>
#include <thread>
#include <vector>

// header-only utilities to showcase how to directly load a model from memory
#include "uint8-buff-stream-wrapper.h"

namespace {
bool is_split_file(const char * const model_path) {
if (!model_path) {
fprintf(stderr, "No model file provided\n");
exit(EXIT_FAILURE);
}

std::string path(model_path);
return path.find("-of-") != std::string::npos;
}

std::vector<uint8_t> load_file_into_buffer(const char * const model_path) {
std::ifstream file_stream(model_path, std::ios::binary | std::ios::ate);
if (!file_stream) {
fprintf(stderr, "Failed to open file %s for reading into streambuf\n", model_path);
exit(EXIT_FAILURE);
}

const size_t file_size = file_stream.tellg();
file_stream.seekg(0, std::ios::beg);

static_assert(sizeof(std::uint8_t) == sizeof(char), "uint8_t must be same size as char");
std::vector<std::uint8_t> buffer(file_size);
if (!file_stream.read((char *) buffer.data(), file_size)) {
fprintf(stderr, "Failed to read entire file into buffer\n");
exit(EXIT_FAILURE);
}

return buffer;
}

std::unique_ptr<std::basic_streambuf<uint8_t>> load_file_into_streambuf(const char * const model_path) {
return std::make_unique<Uint8BufferStreamBuf>(load_file_into_buffer(model_path));
}

struct file_entry {
std::string path;
std::unique_ptr<std::basic_streambuf<uint8_t>> streambuf;
};

std::vector<file_entry> load_files_into_streambuf(const char * const model_path) {
std::vector<file_entry> files;

// Extract pattern from first file path
std::string path(model_path);

// Split by '-'
std::vector<std::string> parts;
std::stringstream ss(path);
std::string item;
while (std::getline(ss, item, '-')) {
parts.push_back(item);
}

// Split the last part by '.'
std::string last_part = parts.back();
parts.pop_back();
size_t dot_pos = last_part.find('.');
if (dot_pos != std::string::npos) {
parts.push_back(last_part.substr(0, dot_pos));
parts.push_back(last_part.substr(dot_pos + 1)); // extension
} else {
parts.push_back(last_part);
}

// Check if we have enough parts
if (parts.size() < 4) {
fprintf(stderr, "Model path does not contain expected pattern\n");
exit(EXIT_FAILURE);
}

// Get total files from [-2] position (before the extension)
int total_files = std::stoi(parts[parts.size() - 2]);

// Get base path by joining all parts except -start-of-end.gguf
std::string base_path;
for (size_t i = 0; i < parts.size() - 4; i++) {
if (i > 0) {
base_path += "-";
}
base_path += parts[i];
}

for (int i = 1; i <= total_files; i++) {
char numbered_path[1024];
snprintf(numbered_path, sizeof(numbered_path), "%s-%05d-of-%05d.gguf", base_path.c_str(), i, total_files);

files.push_back({ numbered_path, load_file_into_streambuf(numbered_path) });
}

return files;
}

file_entry load_tensor_list_file(const char * const model_path) {
std::string path(model_path);

// Split by '-'
std::vector<std::string> parts;
std::stringstream ss(path);
std::string item;
while (std::getline(ss, item, '-')) {
parts.push_back(item);
}

// Split the last part by '.'
std::string last_part = parts.back();
parts.pop_back();
size_t dot_pos = last_part.find('.');
if (dot_pos != std::string::npos) {
parts.push_back(last_part.substr(0, dot_pos));
parts.push_back(last_part.substr(dot_pos + 1)); // extension
} else {
parts.push_back(last_part);
}

// Check if we have enough parts
if (parts.size() < 4) {
fprintf(stderr, "Model path does not contain expected pattern\n");
exit(EXIT_FAILURE);
}

// Get base path by joining all parts except -start-of-end.gguf
std::string base_path;
for (size_t i = 0; i < parts.size() - 4; i++) {
if (i > 0) {
base_path += "-";
}
base_path += parts[i];
}

// Construct tensor list file path
std::string tensor_list_path = base_path + ".tensors.txt";

printf("Loading tensor list file: %s\n", tensor_list_path.c_str());
return { tensor_list_path, load_file_into_streambuf(tensor_list_path.c_str()) };
}

llama_model * load_model_from_memory_configuration(const char * model_path, llama_model_params & model_params) {
llama_model * model;
std::chrono::steady_clock::time_point load_start_time;
if (getenv("LLAMA_EXAMPLE_MEMORY_BUFFER")) {
std::vector<uint8_t> buffer = load_file_into_buffer(model_path);
fprintf(stdout, "%s: loading model from memory buffer\n", __func__);
load_start_time = std::chrono::steady_clock::now();
model = llama_model_load_from_buffer(std::move(buffer), model_params);
} else if (getenv("LLAMA_EXAMPLE_MEMORY_BUFFER_SPLIT")) {
file_entry tensor_list_file = load_tensor_list_file(model_path);
std::vector<file_entry> files = load_files_into_streambuf(model_path);
fprintf(stdout, "%s: loading model from %zu file streambufs\n", __func__, files.size());

std::vector<const char *> file_paths;
for (const auto & file : files) {
printf("Found file %s with streambuf\n", file.path.c_str());
file_paths.push_back(file.path.c_str());
}

load_start_time = std::chrono::steady_clock::now();
const char * async_load_context = "test-model-load";
std::thread fulfill_thread([&files, &tensor_list_file, &async_load_context]() {
const bool success = llama_model_load_fulfill_split_future(
tensor_list_file.path.c_str(), async_load_context, std::move(tensor_list_file.streambuf));
printf("Fulfilling tensor list file %s: %s\n", tensor_list_file.path.c_str(),
success ? "success" : "failure");
if (!success) {
exit(EXIT_FAILURE);
}

for (auto & file : files) {
const bool success = llama_model_load_fulfill_split_future(file.path.c_str(), async_load_context,
std::move(file.streambuf));
printf("Fulfilling file %s with streambuf: %s\n", file.path.c_str(), success ? "success" : "failure");
if (!success) {
exit(EXIT_FAILURE);
}
}
});
fprintf(stderr, "Loading model from splits\n");
model = llama_model_load_from_split_futures(file_paths.data(), file_paths.size(), async_load_context,
tensor_list_file.path.c_str(), model_params);
fulfill_thread.join();
} else if (getenv("LLAMA_EXAMPLE_FROM_FILE")) {
load_start_time = std::chrono::steady_clock::now();
model = llama_model_load_from_file(model_path, model_params);
} else {
return nullptr;
}

if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
exit(1);
}
std::chrono::steady_clock::time_point load_end_time = std::chrono::steady_clock::now();
std::chrono::duration<double> load_duration = load_end_time - load_start_time;
fprintf(stdout, "%s: loading model took %f seconds\n", __func__, load_duration.count());
return model;
}

bool memory_configuration_env_is_set() {
return getenv("LLAMA_EXAMPLE_MEMORY_BUFFER") || getenv("LLAMA_EXAMPLE_MEMORY_BUFFER_SPLIT") ||
getenv("LLAMA_EXAMPLE_FROM_FILE");
}
} // namespace
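
A short usage sketch of the helpers above: the loading path is chosen by the LLAMA_EXAMPLE_MEMORY_BUFFER, LLAMA_EXAMPLE_MEMORY_BUFFER_SPLIT and LLAMA_EXAMPLE_FROM_FILE environment variables, falling back to the regular file loader when none is set; the function name and error handling here are placeholders.

#include "llama.h"
#include "load_into_memory.h"

// Hypothetical driver: use the memory-based path when one of the
// LLAMA_EXAMPLE_* variables is set, otherwise load from disk as usual.
static llama_model * load_example_model(const char * model_path) {
    llama_model_params mparams = llama_model_default_params();
    if (memory_configuration_env_is_set()) {
        return load_model_from_memory_configuration(model_path, mparams);
    }
    return llama_model_load_from_file(model_path, mparams);
}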
5 changes: 5 additions & 0 deletions common_test/uint8-buff-stream-wrapper.h
@@ -0,0 +1,5 @@
#pragma once

// Wrapper to include the specific header from src
#include "uint8-buff-stream.h"

2 changes: 1 addition & 1 deletion examples/embedding/CMakeLists.txt
@@ -1,5 +1,5 @@
set(TARGET llama-embedding)
add_executable(${TARGET} embedding.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common llama llama-common-test ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
33 changes: 28 additions & 5 deletions examples/embedding/embedding.cpp
@@ -1,15 +1,25 @@
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <thread>
#include <vector>

#include "arg.h"
#include "common.h"
#include "llama-cpp.h"
#include "log.h"
#include "llama.h"

#include <ctime>
#include <algorithm>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#ifdef LLAMA_COMMON_TEST_HEADERS
#include "load_into_memory.h"
#endif

static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") {
std::vector<std::string> lines;
size_t start = 0;
@@ -94,7 +104,20 @@ int main(int argc, char ** argv) {
llama_numa_init(params.numa);

// load the model
common_init_result llama_init = common_init_from_params(params);
common_init_result llama_init;

#ifdef LLAMA_COMMON_TEST_HEADERS
if (memory_configuration_env_is_set()) {
llama_model_params mparams = common_model_params_to_llama(params);
common_init_result iparams;
llama_model * model = load_model_from_memory_configuration(params.model.path.c_str(), mparams);
llama_init = common_init_from_model_and_params(model, std::move(iparams), params);
} else {
llama_init = common_init_from_params(params);
}
#else
llama_init = common_init_from_params(params);
#endif

llama_model * model = llama_init.model.get();
llama_context * ctx = llama_init.context.get();
2 changes: 1 addition & 1 deletion examples/simple/CMakeLists.txt
@@ -1,5 +1,5 @@
set(TARGET llama-simple)
add_executable(${TARGET} simple.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama llama-common-test ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)