Skip to content

Commit 0978ba5

Browse files
committed
Optimize logit filtering in sampler
1 parent f1d79c9 commit 0978ba5

File tree

5 files changed

+73
-40
lines changed

5 files changed

+73
-40
lines changed

exllamav2/exllamav2_ext/cpp/sampling.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ int softmax_cpu_nonavx2
129129

130130
for (int i = 0; i < vocab_size; i++)
131131
{
132-
if (!logits_filter[i]) continue;
132+
if (logits_filter && !logits_filter[i]) continue;
133133
if (logits[i] > maxl)
134134
{
135135
maxl = logits[i];
@@ -139,7 +139,7 @@ int softmax_cpu_nonavx2
139139

140140
for (int i = 0; i < vocab_size; i++)
141141
{
142-
if (!logits_filter[i]) continue;
142+
if (logits_filter && !logits_filter[i]) continue;
143143
float l = logits[i] - maxl;
144144
if (exponent == 2.0f)
145145
l *= -l;
@@ -154,7 +154,7 @@ int softmax_cpu_nonavx2
154154

155155
for (int i = 0; i < vocab_size; i++)
156156
{
157-
if (logits_filter[i]) output[i] *= isum;
157+
if (!logits_filter || logits_filter[i]) output[i] *= isum;
158158
else output[i] = 0.0f;
159159
}
160160

exllamav2/exllamav2_ext/cpp/sampling_avx2.cpp

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,10 @@ int softmax_cpu_avx2
3434

3535
// Apply logit filter and find max logit
3636

37-
int i = 0;
38-
for (; i < vocab_size; ++i)
37+
for (int i = 0; i < vocab_size; ++i)
3938
{
4039
float l = logits[i];
41-
bool f = logits_filter[i];
40+
bool f = !logits_filter || logits_filter[i];
4241
l = f ? l : minf;
4342
if (l > maxl)
4443
{
@@ -47,7 +46,8 @@ int softmax_cpu_avx2
4746
}
4847
output[i] = l;
4948
}
50-
for (; i < vocab_size_aligned; i++)
49+
50+
for (int i = vocab_size; i < vocab_size_aligned; i++)
5151
output[i] = minf;
5252

5353
// SIMD values
@@ -61,8 +61,7 @@ int softmax_cpu_avx2
6161
if (exponent == 2.0f)
6262
{
6363
__m256 sign_mask = _mm256_set1_ps(-0.0f);
64-
i = 0;
65-
for (; i < vocab_size_aligned; i += 8)
64+
for (int i = 0; i < vocab_size_aligned; i += 8)
6665
{
6766
__m256 x = _mm256_load_ps(&output[i]);
6867
x = _mm256_sub_ps(x, maxl8);
@@ -87,10 +86,9 @@ int softmax_cpu_avx2
8786
}
8887
else
8988
{
90-
i = 0;
9189
if (itemp == 1.0f)
9290
{
93-
for (; i < vocab_size_aligned; i += 8)
91+
for (int i = 0; i < vocab_size_aligned; i += 8)
9492
{
9593
__m256 x = _mm256_load_ps(&output[i]);
9694
x = _mm256_sub_ps(x, maxl8);
@@ -101,7 +99,7 @@ int softmax_cpu_avx2
10199
}
102100
else
103101
{
104-
for (; i < vocab_size_aligned; i += 8)
102+
for (int i = 0; i < vocab_size_aligned; i += 8)
105103
{
106104
__m256 x = _mm256_load_ps(&output[i]);
107105
x = _mm256_sub_ps(x, maxl8);
@@ -121,8 +119,7 @@ int softmax_cpu_avx2
121119
float isum = 1.0f / esum;
122120
__m256 isum8 = _mm256_set1_ps(isum);
123121

124-
i = 0;
125-
for (; i < vocab_size_aligned; i += 8)
122+
for (int i = 0; i < vocab_size_aligned; i += 8)
126123
{
127124
__m256 x = _mm256_load_ps(&output[i]);
128125
x = _mm256_mul_ps(x, isum8);

exllamav2/exllamav2_ext/ext_sampling.cpp

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ void apply_rep_penalty
6666

6767
std::vector<float> sample_basic
6868
(
69-
torch::Tensor logits, // shape [bsz, vocab_size]
69+
torch::Tensor logits, // shape [bsz, 1, vocab_size]
7070
float temperature,
7171
int top_k,
7272
float top_p,
@@ -96,10 +96,10 @@ std::vector<float> sample_basic
9696
TORCH_CHECK_DTYPE(output_tokens, kLong);
9797
TORCH_CHECK_DTYPE(output_probs, kFloat);
9898
TORCH_CHECK_DTYPE(logits, kFloat);
99-
TORCH_CHECK_DTYPE(logit_filter, kBool);
99+
TORCH_CHECK_DTYPE_OPT(logit_filter, kBool);
100100

101-
TORCH_CHECK_SHAPES(logit_filter, 0, logits, 0, 1);
102-
TORCH_CHECK_SHAPES(logit_filter, 1, logits, 1, 1);
101+
TORCH_CHECK_SHAPES_OPT(logit_filter, 0, logits, 0, 1);
102+
TORCH_CHECK_SHAPES_OPT(logit_filter, 1, logits, -1, 1);
103103

104104
int vocab_size = logits.size(-1);
105105
int bsz = logits.size(0);
@@ -112,7 +112,7 @@ std::vector<float> sample_basic
112112
if (!output_kprobs.device().is_meta())
113113
num_probs = output_kprobs.size(2);
114114

115-
bool* logits_filter_ptr = (bool*) logit_filter.data_ptr();
115+
bool* logits_filter_ptr = logit_filter.device().is_meta() ? NULL : (bool*) logit_filter.data_ptr();
116116

117117
Py_BEGIN_ALLOW_THREADS
118118

@@ -136,7 +136,7 @@ std::vector<float> sample_basic
136136
vocab_size,
137137
temperature,
138138
logits_ptr + i * vocab_size,
139-
logits_filter_ptr + i * vocab_size,
139+
logits_filter_ptr ? logits_filter_ptr + i * vocab_size : NULL,
140140
exponent,
141141
temp_probs
142142
);
@@ -282,7 +282,7 @@ std::vector<float> sample_basic
282282
void logit_filter_exclusive
283283
(
284284
torch::Tensor filter, // shape [bsz, vocab_size]
285-
const std::vector<std::vector<int>> &exclusive_lists
285+
const py::list& exclusive_lists
286286
)
287287
{
288288
TORCH_CHECK_DTYPE(filter, kBool);
@@ -291,13 +291,15 @@ void logit_filter_exclusive
291291
bool* filter_ptr = (bool*) filter.data_ptr();
292292
unsigned int vocab_size = filter.size(1);
293293

294-
Py_BEGIN_ALLOW_THREADS
294+
// Py_BEGIN_ALLOW_THREADS
295295

296-
for(const auto& list : exclusive_lists)
296+
for(const auto& list_ : exclusive_lists)
297297
{
298+
auto list = list_.cast<py::list>();
299+
298300
unsigned int id = 0;
299301
unsigned int next_id_idx = 0;
300-
unsigned int next_id = list[next_id_idx];
302+
unsigned int next_id = list[next_id_idx].cast<unsigned int>();
301303

302304
while (id < vocab_size)
303305
{
@@ -309,13 +311,13 @@ void logit_filter_exclusive
309311
id++;
310312
next_id_idx++;
311313
if (next_id_idx >= list.size()) next_id = vocab_size;
312-
else next_id = list[next_id_idx];
314+
else next_id = list[next_id_idx].cast<unsigned int>();
313315
}
314316

315317
filter_ptr += vocab_size;
316318
}
317319

318-
Py_END_ALLOW_THREADS
320+
// Py_END_ALLOW_THREADS
319321
}
320322

321323
void fast_fill_cpu_ones_bool(torch::Tensor tensor)

exllamav2/exllamav2_ext/ext_sampling.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ std::vector<float> sample_basic
4343
void logit_filter_exclusive
4444
(
4545
torch::Tensor filter, // shape [bsz, vocab_size]
46-
const std::vector<std::vector<int>> &exclusive_lists
46+
const py::list& exclusive_lists
4747
);
4848

4949
void fast_fill_cpu_ones_bool(torch::Tensor tensor);

exllamav2/generator/sampler.py

Lines changed: 47 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,36 @@
77
from exllamav2.generator.hooks import ExLlamaV2PostSamplingHook
88
from exllamav2.ext import exllamav2_ext as ext_c, none_tensor
99
from copy import copy
10+
import threading
1011
# import line_profiler
1112

13+
_tl_tensors = threading.local()
14+
15+
def _get_logit_filter(shape, dtype):
16+
global _tl_tensors
17+
if not hasattr(_tl_tensors, 'logit_filter') \
18+
or _tl_tensors.logit_filter.shape != shape \
19+
or _tl_tensors.logit_filter.dtype != dtype:
20+
_tl_tensors.logit_filter = torch.empty(shape, dtype = dtype)
21+
return _tl_tensors.logit_filter
22+
23+
def _get_output_tokens(shape, dtype):
24+
global _tl_tensors
25+
if not hasattr(_tl_tensors, 'output_tokens') \
26+
or _tl_tensors.output_tokens.shape != shape \
27+
or _tl_tensors.output_tokens.dtype != dtype:
28+
_tl_tensors.output_tokens = torch.empty(shape, dtype = dtype)
29+
return _tl_tensors.output_tokens
30+
31+
def _get_output_probs(shape, dtype):
32+
global _tl_tensors
33+
if not hasattr(_tl_tensors, 'output_probs') \
34+
or _tl_tensors.output_probs.shape != shape \
35+
or _tl_tensors.output_probs.dtype != dtype:
36+
_tl_tensors.output_probs = torch.empty(shape, dtype = dtype)
37+
return _tl_tensors.output_probs
38+
39+
1240
class ExLlamaV2Sampler:
1341

1442
@dataclass
@@ -186,7 +214,7 @@ def sample(
186214
else:
187215
assert batch_size == 1 or len(filters) == 0, "Filters not implemented for batch size > 1"
188216

189-
logits = logits.squeeze(1)
217+
# logits = logits.view(batch_size, vocab_size)
190218

191219
# Sync
192220

@@ -203,8 +231,13 @@ def sample(
203231

204232
# Prepare filter
205233

206-
logit_filter = torch.empty((batch_size, vocab_size), dtype = torch.bool)
207-
ext_c.fast_fill_cpu_ones_bool(logit_filter)
234+
logit_filter = None
235+
def prep_logit_filter(lf):
236+
if lf is not None:
237+
return lf
238+
lf = _get_logit_filter((batch_size, vocab_size), torch.bool)
239+
ext_c.fast_fill_cpu_ones_bool(lf)
240+
return lf
208241

209242
# Repetition penalty
210243

@@ -223,7 +256,7 @@ def sample(
223256
# Temporarily ban individual tokens
224257

225258
if blocked_tokens:
226-
logits[:, blocked_tokens] = -1e30
259+
logits[:, :, blocked_tokens] = -1e30
227260

228261
# Token bias
229262

@@ -247,7 +280,7 @@ def sample(
247280
assert pass_tokens, "Filter excluded all tokens"
248281
if filter_prefer_eos and tokenizer.eos_token_id in pass_tokens:
249282
pass_tokens = { tokenizer.eos_token_id }
250-
# TODO: pass pass_tokens as a numpy array or Python set
283+
logit_filter = prep_logit_filter(logit_filter)
251284
ext_c.logit_filter_exclusive(logit_filter, [sorted(list(pass_tokens))])
252285

253286
# Healing
@@ -260,6 +293,7 @@ def sample(
260293
for i in range(batch_size):
261294
valid_token_lists.append(prefix_id_to_ids[prefix_token[i, 0].item()])
262295

296+
logit_filter = prep_logit_filter(logit_filter)
263297
ext_c.logit_filter_exclusive(logit_filter, valid_token_lists)
264298

265299
# Begin Mirostat
@@ -272,20 +306,20 @@ def sample(
272306

273307
vs = tokenizer.get_vocab_size()
274308
if vs < logits.shape[-1]:
275-
logits[:, vs:] = float("-inf")
309+
logits[:, :, vs:] = float("-inf")
276310

277311
# Sampling
278312

279-
batch_size = logits.shape[0]
280-
281-
output_tokens = torch.empty((batch_size, 1), device = "cpu", dtype = torch.long)
282-
output_probs = torch.empty((batch_size, 1), device = "cpu", dtype = torch.float)
313+
output_tokens = torch.empty((batch_size, 1), dtype = torch.long)
314+
# output_tokens = _get_output_tokens((batch_size, 1), torch.long)
315+
output_probs = torch.empty((batch_size, 1), dtype = torch.float)
316+
# output_probs = _get_output_probs((batch_size, 1), torch.float)
283317
if return_top_tokens == 0:
284318
output_ktokens = none_tensor
285319
output_kprobs = none_tensor
286320
else:
287-
output_ktokens = torch.empty((batch_size, 1, return_top_tokens), device = "cpu", dtype = torch.long)
288-
output_kprobs = torch.empty((batch_size, 1, return_top_tokens), device = "cpu", dtype = torch.float)
321+
output_ktokens = torch.empty((batch_size, 1, return_top_tokens), dtype = torch.long)
322+
output_kprobs = torch.empty((batch_size, 1, return_top_tokens), dtype = torch.float)
289323

290324
m = ext_c.sample_basic(
291325
logits,
@@ -301,7 +335,7 @@ def sample(
301335
output_probs,
302336
output_kprobs,
303337
output_ktokens,
304-
logit_filter,
338+
logit_filter if logit_filter is not None else none_tensor,
305339
settings.mirostat,
306340
settings.mirostat_mu if settings.mirostat else [],
307341
settings.mirostat_tau,

0 commit comments

Comments
 (0)