refactor: return the JSD comparison, which seems to be more suitable

pminev · pminev · commit 67806665511c · 2025-05-08T16:32:16.000+03:00
-- however, for now this cannot tell which generation might be bad.
diff --git a/code/ac/llama/LogitComparer.cpp b/code/ac/llama/LogitComparer.cpp
@@ -14,34 +14,43 @@ namespace ac::llama {
 //  - If at least 80% of the tokens are the same, we consider them equal
 // 3. Compare the Jensen-Shannon divergence of the probabilities
 //  - If the divergence is less than the treshold, we consider them equal
-float LogitComparer::compare(const TokenDataVector& data1, const TokenDataVector& data2) {
-    // const auto minSize = std::min(data1.size(), data2.size());
-    // float distance1 = euclidean_distance_sq({data1.data(), minSize});
-    // float distance2 = euclidean_distance_sq({data2.data(), minSize});
-
-    // float relative_threshold = 0.02f; // 2% difference allowed
-    // float res = std::fabs(distance1 - distance2) / std::max(distance1, distance2);
-    // if (res > relative_threshold) {
-    //     return false;
-    // }
+bool LogitComparer::compare(const TokenDataVector& data1, const TokenDataVector& data2) {
+    const auto minSize = std::min(data1.size(), data2.size());
+    float distance1 = euclidean_distance_sq({data1.data(), minSize});
+    float distance2 = euclidean_distance_sq({data2.data(), minSize});
+
+    float relative_threshold = 0.02f; // 2% difference allowed
+    float res = std::fabs(distance1 - distance2) / std::max(distance1, distance2);
+    if (res > relative_threshold) {
+        return false;
+    }
 
     std::unordered_map<int32_t, float> prob_map, prob_map2;
 
     for (const auto& p : data1) prob_map[p.token] = p.prob;
     for (const auto& p : data2) prob_map2[p.token] = p.prob;
 
     // Check if at least 80% of the tokens are the same
-    // float matchingTokens = 0;
-    // for (const auto& p : data1) {
-    //     if (prob_map2.count(p.token)) {
-    //         matchingTokens++;
-    //     }
-    // }
-
-    // float matchingPercentage = matchingTokens / minSize;
-    // if (matchingPercentage < 0.8f) {
-    //     return false;
-    // }
+    float matchingTokens = 0;
+    for (const auto& p : data1) {
+        if (prob_map2.count(p.token)) {
+            matchingTokens++;
+        }
+    }
+
+    float matchingPercentage = matchingTokens / minSize;
+    if (matchingPercentage < 0.8f) {
+        return false;
+    }
+
+    return jsd(prob_map, prob_map2) < 0.01f; // 1% divergence allowed
+}
+
+float LogitComparer::JSD(const TokenDataVector& data1, const TokenDataVector& data2) {
+    std::unordered_map<int32_t, float> prob_map, prob_map2;
+
+    for (const auto& p : data1) prob_map[p.token] = p.prob;
+    for (const auto& p : data2) prob_map2[p.token] = p.prob;
 
     return jsd(prob_map, prob_map2);
 }
diff --git a/code/ac/llama/LogitComparer.hpp b/code/ac/llama/LogitComparer.hpp
@@ -10,7 +10,9 @@ namespace ac::llama {
 
 class LogitComparer {
 public:
-    static float compare(const TokenDataVector& data1, const TokenDataVector& data2);
+    static bool compare(const TokenDataVector& data1, const TokenDataVector& data2);
+
+    static float JSD(const TokenDataVector& data1, const TokenDataVector& data2);
 
     static float cosineDistance(const TokenDataVector& data1, const TokenDataVector& data2);
 
diff --git a/example/e-verify.cpp b/example/e-verify.cpp
@@ -170,7 +170,8 @@ int main() try {
     std::string modelGguf = "Meta-Llama-3.1-8B-Instruct-Q5_K_S.gguf";
     // std::string modelGguf = "BgGPT-Gemma-2-2B-IT-v1.0.Q8_0.gguf";
     // std::string modelGguf = "Meta-Llama-3.1-8B-Instruct-Q5_K_S.gguf";
-    std::string modelGguf2 = "Meta-Llama-3.1-8B-Instruct-Q5_K_S.gguf";
+    std::string modelGguf2 = "Meta-Llama-3.1-70B-Instruct-Q5_K_S.gguf";
+    // std::string modelGguf2 = "Meta-Llama-3.1-8B-Instruct-Q5_K_S.gguf";
 
     Model m1(tmpFolder + modelGguf, {});
     Model m2(tmpFolder + modelGguf2, {});
@@ -183,9 +184,10 @@ int main() try {
     std::cout << "Models to compare:\n" << modelGguf << "\n" << modelGguf2 << "\n";
     std::cout << "Comparing...\n";
 
+    std::vector<float> jsdResults;
     for (int i = 0; i < 1; ++i) {
 
-        auto res = m1.generate(prompt, 1000);
+        auto res = m1.generate(prompt, 100);
         std::cout << "Model 1 generated: " << res.result << "\n";
         std::string genPrompt = res.initalPrompt;
 
@@ -207,6 +209,8 @@ int main() try {
                     totalWeightedDist += weight * fakeDist;
                     totalWeight += weight;
 
+                    jsdResults.push_back(1);
+
                     std::cout << "Token not found in model 2: " << step.tokenStr << "\n";
                     continue;
                 }
@@ -222,16 +226,24 @@ int main() try {
 
             assert(res2.steps.size() == 1);
 
-            // Step 1: Compare logits
-            float dist = ac::llama::LogitComparer::cosineDistance(step.data, res2.steps[0].data);
+            {
+                // Step 1: Compare logits
+                float dist = ac::llama::LogitComparer::cosineDistance(step.data, res2.steps[0].data);
+
+                // Step 2: Calculate confidence weight
+                float entropy = normalizedEntropy(step.data);
+                float weight = 1.0f - entropy; // high confidence = high weight
 
-            // Step 2: Calculate confidence weight
-            float entropy = normalizedEntropy(step.data);
-            float weight = 1.0f - entropy; // high confidence = high weight
+                // Step 3: Accumulate weighted distance
+                totalWeightedDist += weight * dist;
+                totalWeight += weight;
+            }
+
+            {
+                float jsd = ac::llama::LogitComparer::JSD(step.data, res2.steps[0].data);
+                jsdResults.push_back(jsd);
+            }
 
-            // Step 3: Accumulate weighted distance
-            totalWeightedDist += weight * dist;
-            totalWeight += weight;
         }
 
         // Final step: Normalize
@@ -245,6 +257,19 @@ int main() try {
         float finalScore = (totalWeight > 0.0f) ? (totalWeightedDist / totalWeight) : 0.0f;
         std::cout << "Final weighted distance score: " << finalScore << "\n";
 
+        // Final score interpretation
+        // average JSD score
+        // 0.0 | Perfect match (identical predictions)
+        // 0.0001 - 0.001 | Practically indistinguishable
+        // 0.001 - 0.01 | Moderate variation, likely different versions/settings
+        // 0.01 - 0.1 | Large differences, likely different models
+        float jsdSum = 0.0f;
+        for (const auto& jsd : jsdResults) {
+            jsdSum += jsd;
+        }
+        float jsdAvg = jsdSum / jsdResults.size();
+        std::cout << "Average JSD score: " << jsdAvg << "\n";
+
     }
     std::cout << '\n';