Fix sampling errors caused by FP32 rounding on large-vocab models

turboderp · turboderp · commit b8a5bcf1947b · 2024-07-13T05:47:54.000+02:00
diff --git a/exllamav2/exllamav2_ext/cpp/sampling.cpp b/exllamav2/exllamav2_ext/cpp/sampling.cpp
@@ -833,7 +833,12 @@ int multinomial_cpu
     while (true)
     {
         if (accum >= random) break;
-        if (idx == num_candidates - 1) break;
+        if (idx == num_candidates - 1)
+        {
+            // Roll back in case the sampled probability is exactly zero
+            while (idx > 0 && temp_probs[idx] == 0.0f) idx--;
+            break;
+        }
         idx++;
         accum += temp_probs[idx];
     }
diff --git a/exllamav2/exllamav2_ext/cpp/util.h b/exllamav2/exllamav2_ext/cpp/util.h
@@ -14,6 +14,7 @@
 #define DBGF(__x) printf("%s: %f\n", #__x, __x)
 #define DBGF2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __x, __y)
 #define DBGF3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __x, __y, __z)
+#define DBGF4(__x, __y, __z, __w) printf("%s, %s, %s, %s: %f, %f, %f, %f\n", #__x, #__y, #__z, #__w, __x, __y, __z, __w)
 #define DBGIF(__x, __y) printf("%s, %s: %i, %f\n", #__x, #__y, __x, __y)
 
 #define TIME_START \
diff --git a/exllamav2/exllamav2_ext/ext_sampling.cpp b/exllamav2/exllamav2_ext/ext_sampling.cpp
@@ -230,7 +230,24 @@ std::vector<float> sample_basic
             random_s = powf(random, expf(-skew));
         }
 
-        multinomial_cpu(num_candidates, temp_probs, temp_indices, random_s);
+//        {
+//            float sum = 0.0f;
+//            float pmin = temp_probs[0];
+//            float pmax = pmin;
+//            for (int i = 0; i < num_candidates; ++i)
+//            {
+//                if (temp_probs[i] < pmin) pmin = temp_probs[i];
+//                if (temp_probs[i] > pmax) pmax = temp_probs[i];
+//                sum += temp_probs[i];
+//            }
+//            DBGF4(pmin, pmax, sum, random_s);
+//        }
+
+        // Scale the random sampling point down slightly to account for FP32 rounding errors during softmax.
+        // Probs can sum to slightly less than 1 for large-vocab models
+        float random_s_adj = random_s * 0.9998;
+
+        multinomial_cpu(num_candidates, temp_probs, temp_indices, random_s_adj);
 
         output_tokens[i][0] = temp_indices[0];
         output_probs[i][0] = temp_probs[0];

Original file line number	Diff line number	Diff line change
`@@ -833,7 +833,12 @@ int multinomial_cpu`
`833`	`833`	`while (true)`
`834`	`834`	`{`
`835`	`835`	`if (accum >= random) break;`
`836`		`- if (idx == num_candidates - 1) break;`
	`836`	`+ if (idx == num_candidates - 1)`
	`837`	`+ {`
	`838`	`+ // Roll back in case the sampled probability is exactly zero`
	`839`	`+ while (idx > 0 && temp_probs[idx] == 0.0f) idx--;`
	`840`	`+ break;`
	`841`	`+ }`
`837`	`842`	`idx++;`
`838`	`843`	`accum += temp_probs[idx];`
`839`	`844`	`}`