@@ -39,6 +39,12 @@ class Model {
         m_instance.reset(new ac::llama::Instance(*m_model, {
             .ctxSize = 2048,
         }));
+        // start a single session up front; it now lives for the wrapper's whole lifetime
+        m_session = &m_instance->startSession({});
+        m_session->setInitialPrompt({}); // empty prompt
+    }
+
+    ~Model() {
+        m_instance->stopSession();
     }
 
     struct GenerationResult {
@@ -48,10 +54,27 @@ class Model {
     };
 
     GenerationResult generate(std::string prompt, uint32_t maxTokens) {
-        m_session = &m_instance->startSession({});
-
         auto promptTokens = m_model->vocab().tokenize(prompt, true, true);
-        m_session->setInitialPrompt(promptTokens);
+        return generate_impl(promptTokens, maxTokens);
+    }
+
+    GenerationResult generate(std::span<ac::llama::Token> prompt, uint32_t maxTokens) {
+        return generate_impl(prompt, maxTokens);
+    }
+
+    std::vector<ac::llama::Token> tokenize(std::string prompt) {
+        return m_model->vocab().tokenize(prompt, true, true);
+    }
+
+    // a token id is valid for this model if it falls inside the vocabulary range
+    bool tokenExists(ac::llama::Token token) {
+        return m_model->vocab().nTokens() > token;
+    }
+
+private:
+    GenerationResult generate_impl(std::span<ac::llama::Token> promptTokens, uint32_t maxTokens) {
+        // the session persists across calls, so an empty prompt means
+        // "continue from the current context"
+        if (!promptTokens.empty()) {
+            m_session->pushPrompt(promptTokens, {});
+        }
 
         constexpr int32_t topK = 10;
         auto data = m_session->getSampledTokenData(topK);
@@ -85,13 +108,15 @@ class Model {
             });
         }
 
-        m_instance->stopSession();
-        m_session = nullptr;
+        // reconstruct the prompt text from the tokens for the result report
+        std::string initialPrompt = "";
+        for (size_t i = 0; i < promptTokens.size(); i++) {
+            initialPrompt += m_model->vocab().tokenToString(promptTokens[i], false);
+        }
 
         return {
-            .initalPrompt = prompt,
-            .result = result,
-            .steps = genSteps
+            .initalPrompt = std::move(initialPrompt),
+            .result = std::move(result),
+            .steps = std::move(genSteps)
         };
     }
 
@@ -101,6 +126,35 @@ class Model {
     ac::llama::Session* m_session;
 };
 
+// -- Helper function to compute normalized entropy --
+// Returns a value in [0, 1]: ~0 for a sharply peaked (confident) distribution,
+// 1 for a uniform (maximally uncertain) one.
+float normalizedEntropy(const ac::llama::TokenDataVector& data) {
+    // fewer than two candidates carry no uncertainty; this also avoids
+    // dividing by log(1) == 0 below
+    if (data.size() < 2) {
+        return 0.0f;
+    }
+
+    std::vector<float> probs(data.size());
+    float sum = 0.0f;
+
+    // Shift by the max logit so the exponentials can't overflow;
+    // softmax is invariant under a constant shift.
+    float maxLogit = data[0].logit;
+    for (auto& val : data) {
+        if (val.logit > maxLogit) maxLogit = val.logit;
+    }
+
+    // Calculate softmax probabilities
+    for (auto& val : data) {
+        sum += std::exp(val.logit - maxLogit);
+    }
+    for (size_t i = 0; i < data.size(); ++i) {
+        probs[i] = std::exp(data[i].logit - maxLogit) / sum;
+    }
+
+    // Calculate entropy
+    float entropy = 0.0f;
+    for (float p : probs) {
+        if (p > 0.0f) {
+            entropy -= p * std::log(p);
+        }
+    }
+
+    // Normalize by the maximum possible entropy, log(number of classes)
+    float maxEntropy = std::log(float(probs.size()));
+    return entropy / maxEntropy;
+}
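+
+// Quick illustrative check of the scale (the comparison below samples topK = 10
+// candidates per step): ten equal logits give p = 0.1 each, so the entropy is
+// log(10) and the function returns log(10)/log(10) = 1.0, while a near-certain
+// token drives it toward 0.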
+
 int main() try {
     ac::jalog::Instance jl;
     jl.setup().add<ac::jalog::sinks::ColorSink>();
@@ -112,7 +166,10 @@ int main() try {
 
     // load model
     std::string tmpFolder = AC_TEST_DATA_LLAMA_DIR "/../../../tmp/";
-    std::string modelGguf = "Meta-Llama-3.1-70B-Instruct-Q5_K_S.gguf";
+    // std::string modelGguf = "Meta-Llama-3.1-70B-Instruct-Q5_K_S.gguf";
+    std::string modelGguf = "Meta-Llama-3.1-8B-Instruct-Q5_K_S.gguf";
+    // std::string modelGguf = "BgGPT-Gemma-2-2B-IT-v1.0.Q8_0.gguf";
     std::string modelGguf2 = "Meta-Llama-3.1-8B-Instruct-Q5_K_S.gguf";
 
     Model m1(tmpFolder + modelGguf, {});
@@ -128,21 +185,66 @@ int main() try {
 
     for (int i = 0; i < 1; ++i) {
 
-        auto res = m1.generate(prompt, 100);
+        auto res = m1.generate(prompt, 1000);
         std::cout << "Model 1 generated: " << res.result << "\n";
         std::string genPrompt = res.initalPrompt;
+
+        auto genPromptTokens = m2.tokenize(genPrompt);
+
+        float totalWeightedDist = 0.0f;
+        float totalWeight = 0.0f;
+
         for (size_t i = 0; i < res.steps.size(); i++) {
             auto& step = res.steps[i];
             if (i > 0) {
-                genPrompt += step.tokenStr;
+                if (m2.tokenExists(step.token)) {
+                    genPromptTokens.push_back(step.token);
+                }
+                else {
+                    // Instead of skipping, penalize fully
+                    float fakeDist = 1.0f; // maximum possible distance
+                    float weight = 1.0f;   // assume maximum confidence since we can't know the entropy
+                    totalWeightedDist += weight * fakeDist;
+                    totalWeight += weight;
+
+                    std::cout << "Token not found in model 2: " << step.tokenStr << "\n";
+                    continue;
+                }
             }
-            auto res2 = m2.generate(genPrompt, 0);
-            assert(res2.steps.size() == 1);
 
-            if (ac::llama::LogitComparer::compare(step.data, res2.steps[0].data)) {
-                std::cout << "Models are the same. Generated str by now:\n" << genPrompt << "\n\n";
+            Model::GenerationResult res2;
+            if (i == 0) {
+                // first comparison: prime model 2 with the full tokenized prompt
+                res2 = m2.generate(genPromptTokens, 0);
+            } else {
+                // the session is stateful, so only the newest token needs to be pushed
+                std::vector<ac::llama::Token> token{step.token};
+                res2 = m2.generate(token, 0);
             }
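+            // Note: pushing model 1's token ids straight into model 2 assumes the two
+            // models share a compatible tokenizer; tokenExists() only checks that the
+            // id is in range, not that it maps to the same text.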
+
+            assert(res2.steps.size() == 1);
+
+            // Step 1: Compare logits
+            float dist = ac::llama::LogitComparer::cosineDistance(step.data, res2.steps[0].data);
+
+            // Step 2: Calculate confidence weight
+            float entropy = normalizedEntropy(step.data);
+            float weight = 1.0f - entropy; // high confidence = high weight
+
+            // Step 3: Accumulate weighted distance
+            totalWeightedDist += weight * dist;
+            totalWeight += weight;
         }
+
+        // Final step: normalize to a confidence-weighted mean:
+        //   finalScore = sum(weight_i * dist_i) / sum(weight_i)
+
+        // Score range      | Interpretation
+        // 0.0              | Perfect match (identical predictions)
+        // 0.0001 - 0.001   | Practically indistinguishable
+        // 0.001 - 0.01     | Very close, slight variation
+        // 0.01 - 0.1       | Moderate variation, likely different versions/settings
+        // 0.1 - 1.0        | Large differences, likely different models
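+        // NB: the table assumes LogitComparer::cosineDistance returns values in [0, 1],
+        // consistent with the fakeDist = 1.0f "maximum possible distance" penalty above.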
+        float finalScore = (totalWeight > 0.0f) ? (totalWeightedDist / totalWeight) : 0.0f;
+        std::cout << "Final weighted distance score: " << finalScore << "\n";
+
     }
     std::cout << '\n';
 