
Commit cb33cd2

refactor: remove unused extractTokenData method and improve getSampledTokenData logic
1 parent: a270b2d

File tree: 3 files changed (+43 −31 lines)

code/ac/llama/Sampler.cpp

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -172,23 +172,6 @@ Token Sampler::sample(llama_context* lctx, int idx, bool grammarFirst) {
172172
return cur.data[cur.selected].id;
173173
}
174174

175-
TokenDataVector Sampler::extractTokenData(llama_context* lctx) {
176-
auto chain = m_samplerChain.get();
177-
178-
auto cur = fillLogits(m_cur, lctx, -1);
179-
180-
llama_sampler_apply(chain, &cur);
181-
182-
TokenDataVector result(cur.size);
183-
184-
for (size_t i = 0; i < cur.size; i++)
185-
{
186-
result[i] = {cur.data[i].id, cur.data[i].logit, cur.data[i].p};
187-
}
188-
189-
return result;
190-
}
191-
192175
void Sampler::reset() {
193176
llama_sampler_reset(m_grammarSampler.get());
194177
llama_sampler_reset(m_samplerChain.get());

code/ac/llama/Sampler.hpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,6 @@ class AC_LLAMA_EXPORT Sampler {
101101
// idx is optional for sampling from the logits of the ith token
102102
Token sample(llama_context* lctx, int idx = -1, bool grammarFirst = false);
103103

104-
TokenDataVector extractTokenData(llama_context* lctx);
105-
106104
// accept token as sampled
107105
// if acceptGrammar is true, the token is accepted both by the sampling chain and the grammar
108106
void accept(Token id, bool acceptGrammar);

code/ac/llama/Session.cpp

Lines changed: 43 additions & 12 deletions

@@ -19,6 +19,37 @@ llama_batch makeInputBatch(std::span<const Token> tokens) {
     auto nonConstTokens = const_cast<Token*>(tokens.data());
     return llama_batch_get_one(nonConstTokens, int32_t(tokens.size()));
 }
+
+void fillLogits(TokenDataVector& out, llama_context* lctx) {
+    const auto* logits = llama_get_logits_ith(lctx, -1);
+
+    const auto* lmodel = llama_get_model(lctx);
+    const int vocabSize = llama_vocab_n_tokens(llama_model_get_vocab(lmodel));
+
+    out.resize(vocabSize);
+
+    for (llama_token id = 0; id < vocabSize; id++) {
+        out[id] = {id, logits[id], 0.0f};
+    }
+}
+
+static void applySoftMax(TokenDataVector& data) {
+    // Apply softmax to the logits
+    // The vector should be sorted in descending order
+
+    float max_l = data[0].logit;
+    float cum_sum = 0.0f;
+
+    for (size_t i = 0; i < data.size(); ++i) {
+        float p = expf(data[i].logit - max_l);
+        data[i].prob = p;
+        cum_sum += p;
+    }
+
+    for (size_t i = 0; i < data.size(); ++i) {
+        data[i].prob /= cum_sum;
+    }
+}
 }
 
 Session::Session(Instance& instance, llama_context* ctx, InitParams params)

@@ -167,22 +198,22 @@ Token Session::getToken() {
     return m_state.m_currToken;
 }
 
-TokenDataVector Session::getSampledTokenData(int32_t topK, float topP) {
+TokenDataVector Session::getSampledTokenData(int32_t topK, float /*topP*/) {
     flushPendingState();
 
-    Sampler::Params sParams = {
-        .topK = topK,
-        .topP = topP,
-        .samplerSequence = {
-            Sampler::SamplingType::Top_K,
-            Sampler::SamplingType::Top_P,
-        }
-    };
-    Sampler sampler(const_cast<Model&>(m_instance.model()), sParams);
+    TokenDataVector tempData;
+    fillLogits(tempData, m_ctx);
+
+    std::sort(tempData.begin(), tempData.end(), [](const TokenData & a, const TokenData & b) {
+        return a.logit > b.logit;
+    });
+
+    TokenDataVector result;
+    result.insert(result.end(), tempData.begin(), tempData.begin() + topK);
 
-    auto logits = sampler.extractTokenData(m_ctx);
+    applySoftMax(result);
 
-    return logits;
+    return result;
 }
 
 std::vector<uint8_t> Session::getState() {
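
For reference, here is a minimal, self-contained sketch (plain C++, no llama.cpp dependency) of the top-K-plus-softmax scheme the reworked getSampledTokenData follows: sort by logit in descending order, keep the K highest entries, then normalize them with a max-shifted softmax. The TokenData layout and the helper name below are illustrative assumptions for this sketch, not the actual types from the repository.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Illustrative stand-in for the repository's TokenData (assumption).
struct TokenData {
    int   token;
    float logit;
    float prob;
};

// Keep the K highest-logit entries and convert their logits to probabilities.
// Subtracting the max logit before exp() keeps the exponentials from overflowing,
// which is why applySoftMax in the diff reads data[0].logit after sorting.
std::vector<TokenData> sampledTokenData(std::vector<TokenData> all, size_t topK) {
    std::sort(all.begin(), all.end(),
              [](const TokenData& a, const TokenData& b) { return a.logit > b.logit; });

    if (topK > all.size()) topK = all.size();
    std::vector<TokenData> result(all.begin(), all.begin() + topK);

    const float maxLogit = result[0].logit;
    float cumSum = 0.0f;
    for (auto& td : result) {
        td.prob = std::exp(td.logit - maxLogit);
        cumSum += td.prob;
    }
    for (auto& td : result) {
        td.prob /= cumSum;
    }
    return result;
}

int main() {
    std::vector<TokenData> logits = {{0, 1.2f, 0}, {1, 3.4f, 0}, {2, -0.5f, 0}, {3, 2.1f, 0}};
    for (const auto& td : sampledTokenData(logits, 3)) {
        std::printf("token %d logit %.2f prob %.3f\n", td.token, td.logit, td.prob);
    }
}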
