146 changes: 146 additions & 0 deletions Pmll.cpp
@@ -0,0 +1,146 @@
// SPDX-License-Identifier: MIT
// Persistent Memory Logic Loop adapter for llama.cpp
//
// © 2025 Dr. Josef Kurk Edwards & John Trompeter
// Simplified BSD-style—see LICENSE-PMLL.

#include "llama.h"
#include <filesystem>
#include <fstream>
#include <mutex>
#include <stdexcept>
#include <string>
#include <vector>

namespace pmll {

namespace fs = std::filesystem; // namespace alias ('using fs = ...;' does not compile for a namespace)

struct LoopHook {
// Override this to inject your own logic each step.
// Return false to abort generation.
virtual bool operator()(const std::string& prompt,
const std::vector<llama_token>& last_out) = 0;
virtual ~LoopHook() = default;
};

class Loop {
public:
Loop(const std::string& model_path,
const std::string& state_dir,
uint32_t n_ctx = 4096,
LoopHook* user_hook = nullptr)
: model_path_(model_path),
state_dir_(state_dir),
user_hook_(user_hook) {

fs::create_directories(state_dir_);

llama_backend_init(); // init ggml backend
llama_model_params mp = llama_model_default_params();
model_ = llama_model_load_from_file(model_path_.c_str(), mp);
if (!model_) throw std::runtime_error("model load failed");

llama_context_params cp = llama_context_default_params();
cp.n_ctx = n_ctx;
ctx_ = llama_init_from_model(model_, cp);
if (!ctx_) throw std::runtime_error("context init failed");

mem_ = llama_get_memory(ctx_); // unified KV handle
}

~Loop() {
llama_free(ctx_);
llama_model_free(model_);
llama_backend_free();
}

/// Generate up to n_predict tokens, persisting state after each decode
std::string generate(const std::string& prompt,
int n_predict = 128,
llama_seq_id seq = 0) {
std::lock_guard<std::mutex> lock(mu_);
restore(seq); // 1⃣ try resume

// --- tokenize prompt --------------------------------------------------
std::vector<llama_token> tokens(prompt.size() + 8);
int n = llama_tokenize(model_, prompt.c_str(),
tokens.data(), tokens.size(), true, true);
tokens.resize(n);

llama_batch batch = llama_batch_init(n, 0, 1);
Copilot AI Jul 31, 2025

Batch allocation and deallocation happen multiple times in the generation loop. Consider reusing batch objects or allocating them once outside the loop to reduce memory allocation overhead during token generation.
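One way to act on this, sketched under the assumption that llama_batch_init/llama_batch_free keep the signatures used in this file: allocate a single one-token batch before the generation loop, rewrite its fields on every step (the field writes are sketched after the second seq_id comment below), and free it once afterwards.

// Hedged sketch: hoist the per-step batch out of the generation loop.
llama_batch step_batch = llama_batch_init(1, 0, 1);   // allocate once
for (int step = 0; step < n_predict; ++step) {
    // ... rewrite step_batch's fields, then llama_decode(ctx_, step_batch) ...
}
llama_batch_free(step_batch);                         // free once, after the loop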
for (int i = 0; i < n; ++i) {
batch.token[i] = tokens[i];
batch.pos[i] = i;
batch.seq_id[i] = &seq;
Copilot AI Jul 31, 2025

Taking the address of the seq parameter is incorrect. The seq_id field expects an array of sequence IDs, not a pointer to the seq variable. This should be batch.seq_id[i] = seq; and the seq_id array should be properly allocated.

Suggested change
batch.seq_id[i] = &seq;
batch.seq_id[i] = seq;
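In recent llama.cpp headers the seq_id field is declared as llama_seq_id **, so the literal suggestion above would not compile either; with llama_batch_init(n, 0, 1) a per-token array of one slot is already allocated, and the write goes one level deeper. A sketch of the prompt-batch fill under that assumption:

// Hedged sketch: assumes the current llama_batch layout (llama_seq_id ** seq_id,
// plus n_tokens and logits fields) rather than an older flat layout.
batch.n_tokens = n;                       // llama_batch_init() leaves this at 0
for (int i = 0; i < n; ++i) {
    batch.token[i]     = tokens[i];
    batch.pos[i]       = i;
    batch.n_seq_id[i]  = 1;
    batch.seq_id[i][0] = seq;             // write into the pre-allocated per-token array
    batch.logits[i]    = (i == n - 1);    // only the last prompt token needs logits
}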
batch.n_seq_id[i] = 1;
}
llama_decode(ctx_, batch); // prompt
llama_batch_free(batch);

std::vector<llama_token> out;
out.reserve(n_predict);

for (int step = 0; step < n_predict; ++step) {
llama_batch b1 = llama_batch_init(1, 0, 1);
b1.token[0] = sample_next(); // greedy / top-k
b1.pos[0] = tokens.size() + step;
b1.seq_id[0] = &seq;
Copilot AI Jul 31, 2025

Same issue as line 75 - taking the address of seq is incorrect. This should be b1.seq_id[0] = seq; and proper sequence ID array handling should be implemented.

Suggested change
b1.seq_id[0] = &seq;
b1.seq_id[0] = seq;
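The same pattern applies to the single-token batch inside the loop; a sketch of the per-step writes, again assuming the llama_seq_id ** layout:

// Hedged sketch for the per-step batch (pairs with the reusable-batch sketch above).
b1.n_tokens     = 1;
b1.token[0]     = sample_next();
b1.pos[0]       = (llama_pos) (tokens.size() + step);
b1.n_seq_id[0]  = 1;
b1.seq_id[0][0] = seq;        // write into the allocated slot, not the pointer itself
b1.logits[0]    = true;       // request logits so the next sample_next() has data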
b1.n_seq_id[0] = 1;
llama_decode(ctx_, b1);
out.push_back(b1.token[0]);
llama_batch_free(b1);

// 2⃣ optional user logic-loop
if (user_hook_ && !(*user_hook_)(prompt, out)) break;

// 3⃣ persist every step
persist(seq);
}
std::string txt = tokens_to_str(out);
return txt;
}

private:
llama_token sample_next() {
const float* logits = llama_get_logits(ctx_);
int n_vocab = llama_n_vocab(llama_model_get_vocab(model_));
Copilot AI Jul 31, 2025

The function llama_model_get_vocab() appears to be an incorrect API usage. Based on llama.cpp's API, this should likely be llama_n_vocab(model_) directly, as llama_n_vocab typically takes the model pointer, not a vocab object.
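For what it is worth, recent llama.cpp headers do expose llama_model_get_vocab(), with llama_n_vocab() taking the vocab pointer, so the original call may be valid depending on the revision targeted. An alternative that sidesteps the manual argmax is the sampler API; a minimal sketch, assuming a llama.cpp release that ships the llama_sampler chain:

// Hedged sketch: greedy sampling via the sampler-chain interface.
llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

// Sample from the logits of the most recently decoded token (index -1).
llama_token next = llama_sampler_sample(smpl, ctx_, -1);

llama_sampler_free(smpl);     // in practice, keep one sampler for the whole generation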
int best = 0;
for (int i = 1; i < n_vocab; ++i)
if (logits[i] > logits[best]) best = i;
return best; // naive greedy
}

void persist(llama_seq_id seq) {
std::string file = state_dir_ + "/seq-" + std::to_string(seq) + ".pmll";
llama_state_seq_save_file(ctx_, file.c_str(), seq, nullptr, 0);
}

void restore(llama_seq_id seq) {
std::string file = state_dir_ + "/seq-" + std::to_string(seq) + ".pmll";
if (fs::exists(file)) {
llama_state_seq_load_file(ctx_, file.c_str(), seq,
nullptr, 0, nullptr);
}
}

std::string tokens_to_str(const std::vector<llama_token>& t) {
std::string s;
for (auto tok : t) {
char buf[8];
Copilot AI Jul 31, 2025

The buffer size of 8 bytes is insufficient for token-to-string conversion. Some tokens can produce UTF-8 sequences longer than 8 bytes, which could lead to truncated output or buffer overflow. Consider using a larger buffer size (e.g., 32 or 64 bytes) or dynamically allocating based on the token.
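A conservative variant of the conversion, assuming the helper follows the usual llama.cpp convention of returning the negative required length when the buffer is too small:

// Hedged sketch: larger stack buffer plus a retry for oversized pieces.
char buf[64];
int n = llama_token_to_str(model_, tok, buf, sizeof(buf));
if (n < 0) {
    std::vector<char> big(-n);                                   // reported size
    n = llama_token_to_str(model_, tok, big.data(), (int) big.size());
    if (n > 0) s.append(big.data(), n);
} else if (n > 0) {
    s.append(buf, n);
}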
int n = llama_token_to_str(model_, tok, buf, sizeof(buf));
if (n > 0) s.append(buf, n);
}
return s;
}

std::mutex mu_;
std::string model_path_;
std::string state_dir_;
LoopHook* user_hook_;
llama_model* model_ = nullptr;
llama_context* ctx_ = nullptr;
llama_memory_t mem_ = nullptr;
Copilot AI Jul 31, 2025

The mem_ member variable is assigned in the constructor but never used elsewhere in the class. Consider removing it if it's not needed, or document its intended purpose if it will be used in future functionality.

Suggested change
llama_memory_t mem_ = nullptr;
// Removed unused mem_ member variable.
};

} // namespace pmll
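A minimal usage sketch of the adapter, assuming it builds against a compatible llama.cpp; the model path, state directory, and 64-token cut-off are hypothetical:

// Hedged usage sketch: drive pmll::Loop with a custom hook (paths are hypothetical).
#include <cstdio>

struct BudgetHook : pmll::LoopHook {
    bool operator()(const std::string& prompt,
                    const std::vector<llama_token>& last_out) override {
        (void) prompt;
        return last_out.size() < 64;          // stop the loop after 64 tokens
    }
};

int main() {
    BudgetHook hook;
    pmll::Loop loop("models/model.gguf",      // hypothetical model path
                    "./pmll-state",           // directory for persisted sequence state
                    4096, &hook);
    std::string reply = loop.generate("Hello, PMLL!", 128);
    std::printf("%s\n", reply.c_str());
    return 0;
}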