@@ -21,20 +21,20 @@ llama_batch makeInputBatch(std::span<const Token> tokens) {
 }
 }
 
-Session::Session(Instance& instance, InitParams params)
+Session::Session(Instance& instance, llama_context* ctx, InitParams params)
     : m_instance(instance)
+    , m_ctx(ctx)
     , m_params(std::move(params))
 {
-    auto lctx = m_instance.ctx();
     auto& sampler = m_instance.sampler();
 
-    llama_kv_cache_clear(lctx);
-    llama_synchronize(lctx);
-    llama_perf_context_reset(lctx);
+    llama_kv_cache_clear(m_ctx);
+    llama_synchronize(m_ctx);
+    llama_perf_context_reset(m_ctx);
     sampler.reset();
     sampler.perfReset();
 
-    const auto ctxLen = llama_n_ctx(lctx);
+    const auto ctxLen = llama_n_ctx(m_ctx);
     m_state.maxTokens = ctxLen - 4; // (#16)
 }
 
@@ -45,8 +45,7 @@ void Session::setInitialPrompt(std::span<const Token> initialPrompt) {
 
     Token initialToken; // used to reset the initial prompt to a single token
 
-    auto lctx = m_instance.ctx();
-    const auto ctxLen = llama_n_ctx(lctx);
+    const auto ctxLen = llama_n_ctx(m_ctx);
     const auto tokenBos = llama_token_bos(m_instance.model().lmodel());
     m_state.numKeep = std::min(uint32_t(initialPrompt.size()), m_state.maxTokens); // number of tokens to keep in the context in case we overflow
 
@@ -70,7 +69,7 @@ void Session::setInitialPrompt(std::span<const Token> initialPrompt) {
 
     if (m_instance.model().hasEncoder()) {
         auto batch = makeInputBatch(initialPrompt);
-        auto res = llama_encode(lctx, batch);
+        auto res = llama_encode(m_ctx, batch);
         if (res != 0) {
             throw_ex{} << "Failed to encode input";
         }
@@ -117,7 +116,7 @@ Token Session::getToken() {
     auto& sampler = m_instance.sampler();
     auto& vocab = m_instance.model().vocab();
 
-    m_state.m_currToken = sampler.sample(m_instance.ctx());
+    m_state.m_currToken = sampler.sample(m_ctx);
 
     if (vocab.isEog(m_state.m_currToken)) {
         // don't decode eog tokens in case the interaction is continued
@@ -132,9 +131,9 @@ std::vector<uint8_t> Session::getState() {
         throw_ex{} << "Session hasn't started yet";
     }
 
-    const auto size = llama_state_get_size(m_instance.ctx());
+    const auto size = llama_state_get_size(m_ctx);
     std::vector<uint8_t> state(size);
-    if (llama_state_get_data(m_instance.ctx(), state.data(), size) != size) {
+    if (llama_state_get_data(m_ctx, state.data(), size) != size) {
         throw_ex{} << "Failed to get state";
     }
     return state;
@@ -145,19 +144,13 @@ bool Session::setState(std::span<uint8_t> state) {
         throw_ex{} << "Session already started";
     }
 
-    if (llama_state_set_data(m_instance.ctx(), state.data(), state.size()) != state.size()) {
+    if (llama_state_set_data(m_ctx, state.data(), state.size()) != state.size()) {
         throw_ex{} << "Failed to set state";
     }
     return true;
 }
 
 void Session::doDecode(std::span<const Token> tokens, Source src) {
-    // first try to expand the context if needed
-    const auto gaFactor = m_params.gaFactor;
-    auto lctx = m_instance.ctx();
-    const auto ctxLen = llama_n_ctx(lctx);
-    auto& sampler = m_instance.sampler();
-
     // Ensure the input doesn't exceed the context size by truncating embd if necessary.
     if (tokens.size() > m_state.maxTokens) {
         const auto skipped = tokens.size() - m_state.maxTokens;
@@ -166,6 +159,10 @@ void Session::doDecode(std::span<const Token> tokens, Source src) {
     }
 
     bool haveFullContextMitigation = false;
+    const auto gaFactor = m_params.gaFactor;
+    const auto ctxLen = llama_n_ctx(m_ctx);
+    auto& sampler = m_instance.sampler();
+
     if (gaFactor == 1) {
         // infinite text generation via context shifting
         // if we run out of context:
@@ -183,8 +180,8 @@ void Session::doDecode(std::span<const Token> tokens, Source src) {
             LLAMA_LOG(Debug, "Context is full. Swapping: past = ", m_state.numPast, ", numLeft: ", numLeft,
                 ", ctxLen: ", ctxLen, ", numKeep: ", m_state.numKeep, ", numDiscard: ", numDiscard);
 
-            llama_kv_cache_seq_rm(lctx, 0, m_state.numKeep, m_state.numKeep + numDiscard);
-            llama_kv_cache_seq_add(lctx, 0, m_state.numKeep + numDiscard, m_state.numPast, -numDiscard);
+            llama_kv_cache_seq_rm(m_ctx, 0, m_state.numKeep, m_state.numKeep + numDiscard);
+            llama_kv_cache_seq_add(m_ctx, 0, m_state.numKeep + numDiscard, m_state.numPast, -numDiscard);
 
             m_state.numPast -= numDiscard;
             haveFullContextMitigation = true;
@@ -201,9 +198,9 @@ void Session::doDecode(std::span<const Token> tokens, Source src) {
 
             LLAMA_LOG(Debug, "Group attention shift: ib = ", ib, ", bd = ", bd, ", dd = ", dd);
 
-            llama_kv_cache_seq_add(lctx, 0, m_state.gaIndex, m_state.numPast, ib * bd);
-            llama_kv_cache_seq_div(lctx, 0, m_state.gaIndex + ib * bd, m_state.gaIndex + ib * bd + gaWidth, gaFactor);
-            llama_kv_cache_seq_add(lctx, 0, m_state.gaIndex + ib * bd + gaWidth, m_state.numPast + ib * bd, dd);
+            llama_kv_cache_seq_add(m_ctx, 0, m_state.gaIndex, m_state.numPast, ib * bd);
+            llama_kv_cache_seq_div(m_ctx, 0, m_state.gaIndex + ib * bd, m_state.gaIndex + ib * bd + gaWidth, gaFactor);
+            llama_kv_cache_seq_add(m_ctx, 0, m_state.gaIndex + ib * bd + gaWidth, m_state.numPast + ib * bd, dd);
 
             m_state.numPast -= bd;
 
@@ -223,14 +220,14 @@ void Session::doDecode(std::span<const Token> tokens, Source src) {
     }
 
     // decode
-    const auto batchSize = llama_n_batch(lctx);
+    const auto batchSize = llama_n_batch(m_ctx);
 
     // decode with batches of batchSize
     while (!tokens.empty()) {
         auto batchTokens = tokens.size() > batchSize ? tokens.first(batchSize) : tokens;
         tokens = tokens.subspan(batchTokens.size());
         auto batch = makeInputBatch(batchTokens);
-        if (llama_decode(lctx, batch) != 0) {
+        if (llama_decode(m_ctx, batch) != 0) {
             throw_ex{} << "Failed to decode tokens";
         }
         m_state.numPast += uint32_t(batchTokens.size());
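
For orientation, a minimal sketch of what a call site for the revised constructor could look like. Only `Session(instance, ctx, params)` itself comes from the diff above; the `runWithSession` helper, the standalone context creation, and the `Session::InitParams` spelling are assumptions for illustration.

```cpp
#include <llama.h>
#include <utility>

// Hypothetical helper, not part of this commit: the caller creates the
// llama_context and hands it to the Session explicitly, which caches it
// as m_ctx for every subsequent llama_* call.
void runWithSession(Instance& instance, llama_model* model, Session::InitParams params) {
    llama_context_params cparams = llama_context_default_params();
    llama_context* ctx = llama_new_context_with_model(model, cparams);

    Session session(instance, ctx, std::move(params));
    // ... feed the initial prompt, pull tokens, etc. ...

    llama_free(ctx); // in this sketch the context's lifetime stays with the caller
}
```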