@@ -116,230 +116,9 @@ void Instance::warmup() {
     llama_perf_context_reset(lctx);
 }
 
-Session Instance::newSession(const SessionParams params) {
+Session Instance::newSession(const Session::InitParams params) {
     // not a real await as we return suspend_always initially
-    auto op = co_await Session::Prompt{};
-
-    if (m_hasActiveSession) {
-        throw_ex{} << "Instance already has an active session";
-    }
-
-    if (op.type != Session::SessionOpData::OpType::Prompt && op.type != Session::SessionOpData::OpType::SetState) {
-        throw_ex{} << "Invalid initial session operation type";
-    }
-
-    m_hasActiveSession = true;
-    astl::sentry closeSessionSentry([this] { m_hasActiveSession = false; });
-
-    auto lctx = m_lctx.get();
-    auto& vocab = m_model.vocab();
-
-    llama_kv_cache_clear(lctx);
-    llama_synchronize(lctx);
-    llama_perf_context_reset(lctx);
-    m_sampler.reset();
-    m_sampler.perfReset();
-
-    std::vector<llama_token> sessionTokens;
-    const auto tokenBos = llama_token_bos(m_model.lmodel());
-    const auto ctxLen = llama_n_ctx(lctx);
-    const auto maxTokens = ctxLen - 4; // (#16)
-    auto numKeep = llama_get_kv_cache_token_count(lctx);
-
-    if (op.type == Session::SessionOpData::OpType::Prompt) {
-        Token initialToken; // used to reset the initial prompt to a single token
-        auto& initialPrompt = op.pendingPrompt;
-        numKeep = std::min(uint32_t(initialPrompt.size()), maxTokens); // number of tokens to keep in the context in case we overflow
-
-        if (initialPrompt.empty()) {
-            initialToken = tokenBos;
-            initialPrompt = {&initialToken, 1};
-        }
-
-        if (initialPrompt.empty()) {
-            throw_ex{} << "Empty initial prompt";
-        }
-
-        if (initialPrompt.size() > maxTokens) {
-            throw_ex{} << "Initial prompt too long. Got " << initialPrompt.size() << " tokens, max: " << ctxLen - 4;
-        }
-
-        if (params.gaFactor != 1) {
-            const uint32_t gaFactor = params.gaFactor;
-            const uint32_t gaWidth = params.gaWidth;
-            if (gaWidth % gaFactor != 0) {
-                throw_ex{} << "Group-attention width " << gaWidth << " must be a multiple of group-attention factor " << gaFactor;
-            }
-            LLAMA_LOG(Info, "self-extend: train = ", m_model.trainCtxLength(), ", gaFactor = ", gaFactor, ", gaWidth = ", gaWidth);
-        }
-
-        if (m_model.hasEncoder()) {
-            auto batch = makeInputBatch(initialPrompt);
-            auto res = llama_encode(lctx, batch);
-            if (res != 0) {
-                throw_ex{} << "Failed to encode input";
-            }
-            initialToken = vocab.decoderStartToken();
-            initialPrompt = {&initialToken, 1};
-        }
-    } else {
-        if (llama_state_set_data(lctx, op.state.data(), op.state.size()) != op.state.size()) {
-            throw_ex{} << "Failed to set state";
-        }
-    }
-
-    // group attention state
-    uint32_t gaIndex = 0; // number of grouped KV tokens (only used if params.gaFactor > 1)
-    uint32_t numPast = 0; // number of tokens in the context (that's prompts + generated)
-
-    enum class Source {
-        InitialPrompt,
-        InteractivePrompt,
-        Generated
-    };
-
-    auto doDecode = [&](std::span<const Token> tokens, Source src) {
-        // first try to expand the context if needed
-        const auto gaFactor = params.gaFactor;
-
-        // Ensure the input doesn't exceed the context size by truncating embd if necessary.
-        if (tokens.size() > maxTokens) {
-            const auto skipped = tokens.size() - maxTokens;
-            tokens = tokens.first(maxTokens);
-            LLAMA_LOG(Warning, "Input too long. Skipping ", skipped, " tokens");
-        }
-
-        bool haveFullContextMitigation = false;
-        if (gaFactor == 1) {
-            // infinite text generation via context shifting
-            // if we run out of context:
-            // - take the n_keep first tokens from the original prompt (via numPast)
-            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-            const auto num = numPast + tokens.size();
-            if (num >= ctxLen) {
-                if (!params.infiniteContext) {
-                    throw_ex{} << "context limit of " << ctxLen << " reached";
-                }
-
-                const auto numLeft = numPast - numKeep;
-                const int numDiscard = numLeft / 2; // somewhat arbitrary
-
-                LLAMA_LOG(Debug, "Context is full. Swapping: past = ", numPast, ", numLeft: ", numLeft,
-                    ", ctxLen: ", ctxLen, ", numKeep: ", numKeep, ", numDiscard: ", numDiscard);
-
-                llama_kv_cache_seq_rm(lctx, 0, numKeep, numKeep + numDiscard);
-                llama_kv_cache_seq_add(lctx, 0, numKeep + numDiscard, numPast, -numDiscard);
-
-                numPast -= numDiscard;
-                haveFullContextMitigation = true;
-            }
-        }
-        else {
-            const uint32_t gaWidth = params.gaWidth;
-
-            while (numPast >= gaIndex + gaWidth) {
-                // context extension via Self-Extend
-                const int ib = (gaFactor * gaIndex) / gaWidth;
-                const int bd = (gaWidth / gaFactor) * (gaFactor - 1);
-                const int dd = (gaWidth / gaFactor) - ib * bd - gaWidth;
-
-                LLAMA_LOG(Debug, "Group attention shift: ib = ", ib, ", bd = ", bd, ", dd = ", dd);
-
-                llama_kv_cache_seq_add(lctx, 0, gaIndex, numPast, ib * bd);
-                llama_kv_cache_seq_div(lctx, 0, gaIndex + ib * bd, gaIndex + ib * bd + gaWidth, gaFactor);
-                llama_kv_cache_seq_add(lctx, 0, gaIndex + ib * bd + gaWidth, numPast + ib * bd, dd);
-
-                numPast -= bd;
-
-                gaIndex += gaWidth / gaFactor;
-                haveFullContextMitigation = true;
-            }
-        }
-
-        if (haveFullContextMitigation) {
-            LLAMA_LOG(Info, "Context full mitigation performed: past = ", numPast, ", tokens = ", tokens.size());
-        }
-
-        // add to sampler
-        for (auto t : tokens) {
-            // only apply grammar for generated content
-            m_sampler.accept(t, src == Source::Generated);
-        }
-
-        // decode
-        const auto batchSize = llama_n_batch(lctx);
-
-        // decode with batches of batchSize
-        while (!tokens.empty()) {
-            auto batchTokens = tokens.size() > batchSize ? tokens.first(batchSize) : tokens;
-            tokens = tokens.subspan(batchTokens.size());
-            auto batch = makeInputBatch(batchTokens);
-            if (llama_decode(lctx, batch) != 0) {
-                throw_ex{} << "Failed to decode tokens";
-            }
-            numPast += uint32_t(batchTokens.size());
-        }
-    };
-
-    if (op.type == Session::SessionOpData::OpType::Prompt) {
-        doDecode(op.pendingPrompt, Source::InitialPrompt);
-
-        co_await Session::StartGeneration{}; // suspend pre generation
-    } else {
-        // set the state
-        co_yield true;
-    }
-
-    while (true) {
-        auto currOp = co_await Session::Prompt{};
-
-        if (currOp.type == Session::SessionOpData::OpType::GetState) {
-            // get the state
-            const auto size = llama_state_get_size(m_lctx.get());
-            std::vector<uint8_t> state(size);
-            if (llama_state_get_data(m_lctx.get(), state.data(), size) != size) {
-                throw_ex{} << "Failed to get state";
-            }
-            co_yield state;
-            continue;
-        } else if (currOp.type == Session::SessionOpData::OpType::SetState) {
-            auto& state = currOp.state;
-            if (llama_state_set_data(m_lctx.get(), state.data(), state.size()) != state.size()) {
-                throw_ex{} << "Failed to set state";
-            }
-            co_yield true;
-            continue;
-        } else if (currOp.type == Session::SessionOpData::OpType::Prompt) {
-            auto& prompt = currOp.pendingPrompt;
-            if (!prompt.empty()) {
-
-                // reset sampling and don't allow previous inputs to affect the generation
-                m_sampler.reset();
-
-                if (m_model.prefixInputsWithBos()) {
-                    // add bos token to the prompt
-                    doDecode({&tokenBos, 1}, Source::InteractivePrompt);
-                }
-
-                doDecode(prompt, Source::InteractivePrompt);
-            }
-
-            auto token = m_sampler.sample(lctx);
-            sessionTokens.push_back(token);
-            if (vocab.isEog(token)) {
-                co_yield Token_Invalid;
-                // don't decode eog tokens in case the interaction is continued
-            }
-            else {
-                // first yield, then decode, thus we don't decode if the session is aborted
-                co_yield token;
-                doDecode({&token, 1}, Source::Generated);
-            }
-        } else {
-            LLAMA_LOG(Error, "Unrecognized session operation type");
-        }
-
-    }
+    return Session(*this, params);
 }
 
 } // namespace ac::llama
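
For reference, the KV-cache "context shift" that the removed body performed when params.infiniteContext was set (and which this change presumably moves into the Session type) boils down to the arithmetic below. This is a minimal standalone C++ sketch with invented numbers, not code from this repository; the llama.cpp calls it refers to appear only in comments.

// Standalone illustration of the context-shift arithmetic from the removed code above.
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t ctxLen   = 4096; // assumed context size
    const uint32_t numKeep  = 128;  // tokens pinned from the initial prompt
    uint32_t numPast        = 4090; // tokens currently in the KV cache
    const uint32_t incoming = 32;   // tokens about to be decoded

    if (numPast + incoming >= ctxLen) {
        const uint32_t numLeft    = numPast - numKeep;
        const uint32_t numDiscard = numLeft / 2; // "somewhat arbitrary", as the original comment says

        // the removed code drops the window [numKeep, numKeep + numDiscard) via
        // llama_kv_cache_seq_rm and then slides the remaining tail back over the
        // gap via llama_kv_cache_seq_add with a delta of -numDiscard
        numPast -= numDiscard;

        std::printf("kept %u, discarded %u, numPast is now %u of %u\n",
                    (unsigned)numKeep, (unsigned)numDiscard, (unsigned)numPast, (unsigned)ctxLen);
    }
    return 0;
}

Keeping the first numKeep prompt tokens and discarding half of what remains preserves the original prompt while roughly halving how often the shift has to run.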