
Commit 9e5e09d

sampling : remove backend-dist option (wip)
This commit removes the `--backend-dist` option and instead uses the configured
`--samplers` chain to determine which samplers run on the backend.

Backend sampling is still enabled with `--backend_sampling`, and the sampler
chain, either explicitly specified using `--samplers` or the default, is
automatically analyzed to determine which samplers can run on the backend. The
system finds the longest contiguous chain of backend-supported samplers from
the start of the sampler sequence. For example:

* If the chain is `top-k -> temperature -> top-p`, and both `top-k` and
  `temperature` are backend-supported but `top-p` is not, then `top-k` and
  `temperature` will run on the backend, while `top-p` and subsequent samplers
  run on the CPU.
* If all configured samplers are supported, the final distribution sampling
  will also happen on the backend, transferring only the sampled token IDs back
  to the host.
* If the sampler chain starts with an unsupported sampler (e.g., `penalties`),
  all sampling runs on the CPU. Note that this is currently the case with the
  default sampler chain, so to use backend sampling a sampler chain must be
  specified explicitly. See below for an example.

The following shows how llama-cli can be run with backend sampling:
```console
$ llama-cli -m models/Qwen2.5-VL-3B-Instruct-Q8_0.gguf \
    --prompt 'What is the capital of Sweden?' \
    -n 20 \
    -no-cnv \
    --verbose-prompt \
    -ngl 40 \
    --backend-sampling \
    --samplers 'top_k;temperature'
```
In this case all sampling will happen on the backend since both `top_k` and
`temperature` are supported backend samplers.

To enable partial backend sampling (hybrid sampling), for example running
`top_k` and `temperature` on the backend and `top_p` on the CPU, the following
sampler chain could be specified:
```console
$ llama-cli -m models/Qwen2.5-VL-3B-Instruct-Q8_0.gguf \
    --prompt 'What is the capital of Sweden?' \
    -n 20 \
    -no-cnv \
    --verbose-prompt \
    -ngl 40 \
    --backend-sampling \
    --samplers 'top_k;temperature;top_p'
```

If this looks good then I'll follow up with updates to the llama-cli and
llama-server documentation to reflect these changes.
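For reference, the prefix-splitting rule described above can be sketched as a tiny standalone program. This is an illustration, not part of the patch: `backend_supported` and `backend_prefix_length` are hypothetical stand-ins for the `sampler_backend_supported()` helper and the counting loop that the patch adds to `common/sampling.cpp`.

```cpp
#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

// Illustrative stand-in for sampler_backend_supported(): at the time of
// writing only top_k and temperature have backend implementations.
static bool backend_supported(const std::string & name) {
    return name == "top_k" || name == "temperature";
}

// Longest contiguous run of backend-supported samplers from the start of the
// chain. Samplers [0, count) run on the backend; the rest run on the CPU.
static size_t backend_prefix_length(const std::vector<std::string> & samplers) {
    size_t count = 0;
    for (const auto & name : samplers) {
        if (!backend_supported(name)) {
            break; // the first unsupported sampler ends the backend prefix
        }
        count++;
    }
    return count;
}

int main() {
    assert(backend_prefix_length({"top_k", "temperature", "top_p"}) == 2); // hybrid: top_p stays on the CPU
    assert(backend_prefix_length({"penalties", "top_k"})            == 0); // default-style chain: all sampling on the CPU
    assert(backend_prefix_length({"top_k", "temperature"})          == 2); // full chain: dist can run on the backend too
    return 0;
}
```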
1 parent 53dca56 commit 9e5e09d

15 files changed: +214 additions, -96 deletions

common/arg.cpp

Lines changed: 0 additions & 8 deletions

```diff
@@ -1520,14 +1520,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.backend_sampling = true;
         }
     ).set_sparam());
-    add_opt(common_arg(
-        {"--backend-dist"},
-        "perform final (distribution) sampling on backend (default: disabled)",
-        [](common_params & params) {
-            params.sampling.backend_dist = true;
-            params.sampling.backend_sampling = true;
-        }
-    ).set_sparam());
     add_opt(common_arg(
         {"--pooling"}, "{none,mean,cls,last,rank}",
         "pooling type for embeddings, use model default if unspecified",
```

common/common.cpp

Lines changed: 10 additions & 5 deletions

```diff
@@ -1021,12 +1021,17 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     // backend sampling initialization
     if (params.sampling.backend_sampling) {
-        iparams.samplers_seq_config.resize(cparams.n_seq_max);
-        for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
-            iparams.samplers_seq_config[i] = { i, common_sampler_backend_init(model, params.sampling) };
+        llama_sampler * backend_chain = common_sampler_backend_init(model, params.sampling);
+        if (backend_chain != nullptr) {
+            iparams.samplers_seq_config.resize(cparams.n_seq_max);
+            for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
+                iparams.samplers_seq_config[i] = { i, llama_sampler_clone(backend_chain) };
+            }
+            cparams.samplers = iparams.samplers_seq_config.data();
+            cparams.n_samplers = cparams.n_seq_max;
+
+            llama_sampler_free(backend_chain);
         }
-        cparams.samplers = iparams.samplers_seq_config.data();
-        cparams.n_samplers = cparams.n_seq_max;
     }
 
     llama_context * lctx = llama_init_from_model(model, cparams);
```

common/common.h

Lines changed: 0 additions & 2 deletions

```diff
@@ -213,9 +213,7 @@ struct common_params_sampling {
     std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
     std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
 
-    // Backend sampling flags
     bool backend_sampling = false; // enable backend sampling
-    bool backend_dist     = false; // backend performs final sampling (dist)
 
     // print the parameters into a string
     std::string print() const;
```

common/sampling.cpp

Lines changed: 163 additions & 33 deletions

```diff
@@ -105,7 +105,8 @@ struct common_sampler {
     common_params_sampling params;
 
     struct llama_sampler * grmr;
-    struct llama_sampler * chain;
+    struct llama_sampler * chain;         // CPU sampling chain
+    struct llama_sampler * backend_chain; // Backend sampling chain
 
     ring_buffer<llama_token> prev;
 
@@ -118,6 +119,9 @@ struct common_sampler {
 
         llama_sampler_reset(grmr);
         llama_sampler_reset(chain);
+        if (backend_chain) {
+            llama_sampler_reset(backend_chain);
+        }
     }
 
     void set_logits(struct llama_context * ctx, int idx) {
@@ -165,6 +169,20 @@ static bool sampler_enabled(const struct common_params_sampling & params, enum c
     return std::find(params.samplers.begin(), params.samplers.end(), type) != params.samplers.end();
 }
 
+static bool sampler_backend_supported(enum common_sampler_type type) {
+    switch (type) {
+        case COMMON_SAMPLER_TYPE_TOP_K:
+        case COMMON_SAMPLER_TYPE_TEMPERATURE:
+            return true;
+        default:
+            return false;
+    }
+}
+
+static bool has_logit_bias(const struct common_params_sampling & params) {
+    return !params.logit_bias.empty();
+}
+
 std::string common_params_sampling::print() const {
     char result[1024];
 
@@ -249,22 +267,86 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
     }
 
     auto * result = new common_sampler {
-        /* .params = */ params,
-        /* .grmr   = */ grmr,
-        /* .chain  = */ llama_sampler_chain_init(lparams),
-        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
-        /* .cur    = */ {},
-        /* .cur_p  = */ {},
+        /* .params        = */ params,
+        /* .grmr          = */ grmr,
+        /* .chain         = */ llama_sampler_chain_init(lparams),
+        /* .backend_chain = */ nullptr,
+        /* .prev          = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
+        /* .cur           = */ {},
+        /* .cur_p         = */ {},
     };
 
-    llama_sampler_chain_add(result->chain,
-            llama_sampler_init_logit_bias(
-                llama_vocab_n_tokens(vocab),
-                params.logit_bias.size(),
-                params.logit_bias.data()));
+    size_t backend_sampler_count = 0;
+    if (params.backend_sampling && params.mirostat == 0) {
+        if (has_logit_bias(params)) {
+            backend_sampler_count++;
+        }
+
+        // Find the longest contiguous chain of backend-supported samplers from the start
+        for (const auto & sampler_type : params.samplers) {
+            if (sampler_backend_supported(sampler_type)) {
+                backend_sampler_count++;
+            } else {
+                break;
+            }
+        }
+    }
+
+    // If the samplers combination is supported then we can build the backend chain.
+    if (backend_sampler_count > 0 || (params.backend_sampling && has_logit_bias(params))) {
+        llama_sampler_chain_params backend_params = llama_sampler_chain_default_params();
+        backend_params.no_perf = params.no_perf;
+        result->backend_chain = llama_sampler_chain_init(backend_params);
+
+        if (has_logit_bias(params)) {
+            llama_sampler_chain_add(result->backend_chain,
+                    llama_sampler_backend_init_logit_bias(
+                        llama_vocab_n_tokens(vocab),
+                        params.logit_bias.size(),
+                        params.logit_bias.data()));
+        }
+
+        size_t backend_idx = 0;
+        for (const auto & sampler_type : params.samplers) {
+            if (backend_idx >= backend_sampler_count - has_logit_bias(params)) {
+                break;
+            }
+
+            switch (sampler_type) {
+                case COMMON_SAMPLER_TYPE_TOP_K:
+                    if (params.top_k > 0) {
+                        llama_sampler_chain_add(result->backend_chain, llama_sampler_backend_init_top_k(params.top_k));
+                    }
+                    backend_idx++;
+                    break;
+                case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                    if (params.temp > 0.0f) {
+                        llama_sampler_chain_add(result->backend_chain, llama_sampler_backend_init_temp(params.temp));
+                    }
+                    backend_idx++;
+                    break;
+                default:
+                    GGML_ASSERT(false && "unsupported backend sampler");
+            }
+        }
+    }
+
+    size_t cpu_start_idx = backend_sampler_count - has_logit_bias(params);
+    bool cpu_has_samplers = cpu_start_idx < params.samplers.size();
+
+    // Build CPU chain
+    if (!params.backend_sampling || !has_logit_bias(params)) {
+        llama_sampler_chain_add(result->chain,
+                llama_sampler_init_logit_bias(
+                    llama_vocab_n_tokens(vocab),
+                    params.logit_bias.size(),
+                    params.logit_bias.data()));
+    }
 
     if (params.mirostat == 0) {
-        for (const auto & cnstr : params.samplers) {
+        // Add remaining CPU samplers
+        for (size_t i = cpu_start_idx; i < params.samplers.size(); i++) {
+            const auto & cnstr = params.samplers[i];
             switch (cnstr) {
                 case COMMON_SAMPLER_TYPE_DRY:
                     {
@@ -308,7 +390,13 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                     GGML_ASSERT(false && "unknown sampler type");
             }
         }
-        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
+
+        // If all samplers are on backend, add dist to backend; otherwise add to CPU
+        if (result->backend_chain && !cpu_has_samplers) {
+            llama_sampler_chain_add(result->backend_chain, llama_sampler_backend_init_dist(params.seed));
+        } else {
+            llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
+        }
     } else if (params.mirostat == 1) {
         llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
         llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
@@ -323,36 +411,74 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 }
 
 struct llama_sampler * common_sampler_backend_init(const struct llama_model * model, const struct common_params_sampling & params) {
-    if (!params.backend_sampling) {
+    if (!params.backend_sampling || params.mirostat != 0) {
         return nullptr;
     }
+
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
+    // Determine the split point for backend sampling using the same logic as common_sampler_init
+    size_t backend_sampler_count = 0;
+    if (has_logit_bias(params)) {
+        backend_sampler_count++;
+    }
+
+    // Find the longest contiguous chain of backend-supported samplers from the start
+    for (const auto & sampler_type : params.samplers) {
+        if (sampler_backend_supported(sampler_type)) {
+            backend_sampler_count++;
+        } else {
+            break;
+        }
+    }
+
+    if (backend_sampler_count == 0 && !has_logit_bias(params)) {
+        return nullptr;
+    }
+
     llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
     chain_params.no_perf = params.no_perf;
 
     struct llama_sampler * chain = llama_sampler_chain_init(chain_params);
 
-    const bool enable_temp  = params.temp > 0.0f && sampler_enabled(params, COMMON_SAMPLER_TYPE_TEMPERATURE);
-    const bool enable_top_k = params.top_k > 0   && sampler_enabled(params, COMMON_SAMPLER_TYPE_TOP_K);
-    const bool enable_dist  = params.backend_dist;
-
-    if (!params.logit_bias.empty()) {
+    // Add logit_bias to backend chain if present
+    if (has_logit_bias(params)) {
         llama_sampler_chain_add(chain, llama_sampler_backend_init_logit_bias(
             llama_vocab_n_tokens(vocab),
             params.logit_bias.size(),
            params.logit_bias.data()));
     }
 
-    if (enable_temp) {
-        llama_sampler_chain_add(chain, llama_sampler_backend_init_temp(params.temp));
-    }
+    size_t backend_idx = 0;
+    for (const auto & sampler_type : params.samplers) {
+        if (backend_idx >= backend_sampler_count - has_logit_bias(params)) {
+            break;
+        }
 
-    if (enable_top_k) {
-        llama_sampler_chain_add(chain, llama_sampler_backend_init_top_k(params.top_k));
+        switch (sampler_type) {
+            case COMMON_SAMPLER_TYPE_TOP_K:
+                if (params.top_k > 0) {
+                    llama_sampler_chain_add(chain, llama_sampler_backend_init_top_k(params.top_k));
+                }
+                backend_idx++;
+                break;
+            case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                if (params.temp > 0.0f) {
+                    llama_sampler_chain_add(chain, llama_sampler_backend_init_temp(params.temp));
+                }
+                backend_idx++;
+                break;
+            default:
+                GGML_ASSERT(false && "unsupported backend sampler");
+        }
     }
 
-    if (enable_dist) {
+    // Determine if we should add dist sampler to backend chain
+    // Only add it if all samplers from params.samplers are on the backend
+    size_t cpu_start_idx = backend_sampler_count - has_logit_bias(params);
+    bool cpu_has_samplers = cpu_start_idx < params.samplers.size();
+
+    if (!cpu_has_samplers) {
         llama_sampler_chain_add(chain, llama_sampler_backend_init_dist(params.seed));
     }
 
@@ -362,9 +488,12 @@ struct llama_sampler * common_sampler_backend_init(const struct llama_model * mo
 void common_sampler_free(struct common_sampler * gsmpl) {
     if (gsmpl) {
         llama_sampler_free(gsmpl->grmr);
-
         llama_sampler_free(gsmpl->chain);
 
+        if (gsmpl->backend_chain) {
+            llama_sampler_free(gsmpl->backend_chain);
+        }
+
         delete gsmpl;
     }
 }
@@ -387,12 +516,13 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
 
 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
     return new common_sampler {
-        /* .params = */ gsmpl->params,
-        /* .grmr   = */ llama_sampler_clone(gsmpl->grmr),
-        /* .chain  = */ llama_sampler_clone(gsmpl->chain),
-        /* .prev   = */ gsmpl->prev,
-        /* .cur    = */ gsmpl->cur,
-        /* .cur_p  = */ gsmpl->cur_p,
+        /* .params        = */ gsmpl->params,
+        /* .grmr          = */ llama_sampler_clone(gsmpl->grmr),
+        /* .chain         = */ llama_sampler_clone(gsmpl->chain),
+        /* .backend_chain = */ gsmpl->backend_chain ? llama_sampler_clone(gsmpl->backend_chain) : nullptr,
+        /* .prev          = */ gsmpl->prev,
+        /* .cur           = */ gsmpl->cur,
+        /* .cur_p         = */ gsmpl->cur_p,
     };
 }
 
```
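As a reading aid, here is a small worked example of the split computed by the code above. The concrete chain (`--samplers 'top_k;temperature;top_p'`, no logit bias, `mirostat == 0`) is an assumption for illustration; the values follow directly from the loops in the diff and are not additional code in the patch.

```cpp
// Hypothetical walk-through of common_sampler_init() for
// params.samplers = { TOP_K, TEMPERATURE, TOP_P }, no logit bias, mirostat == 0:
//
//   backend_sampler_count = 2     // top_k and temperature are backend-supported; top_p breaks the loop
//   cpu_start_idx         = 2 - 0 // has_logit_bias(params) is false
//   cpu_has_samplers      = true  // index 2 (top_p) < params.samplers.size() == 3
//
// Result: backend chain = [ top_k, temperature ], CPU chain = [ top_p, dist ].
// With --samplers 'top_k;temperature' instead, cpu_has_samplers would be false and
// llama_sampler_backend_init_dist() would be appended to the backend chain.
```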

examples/batched/README.md

Lines changed: 30 additions & 17 deletions

````diff
@@ -45,25 +45,38 @@ llama_print_timings: total time = 4156.04 ms
 
 ### Using backend samplers
 It is possible to run this example using backend samplers so that sampling is
-performed on the backend device, like a GPU.
+performed on a backend device, like a GPU.
 ```bash
 ./llama-batched \
     -m models/Qwen2.5-VL-3B-Instruct-Q8_0.gguf -p "Hello my name is" \
-    -np 4 -kvu \
-    --backend_sampling --top-k 80 --backend_dist
+    -np 4 \
+    -kvu \
+    --backend_sampling \
+    --samplers 'top_k;temperature' \
+    --top-k 80
 ```
-The `--verbose` flag can be added to see more detailed output and also show
-that the backend samplers are being used. The above example will perform distribution
-sampling on the backend device and only transfer the sampled token ids back to the host.
+The samplers specified with `--samplers` must be supported by the backend and
+this is why we are explicitly specifying only `top_k` and `temperature` here as
+at the time of writing these are supported.
 
-It is also possible to perform partial sampling on the backend, and then allow CPU samplers
-to process those results further. This is sometimes referred to as hybrid sampling.
-For an example of this we can remove `--backend_dist` from the above command:
-```bash
-./llama-batched \
-    -m models/Qwen2.5-VL-3B-Instruct-Q8_0.gguf -p "Hello my name is" \
-    -np 4 -kvu \
-    --backend_sampling --top-k 80 -v
-```
-This will perform the top-k filtering on the backend device, and then transfer the filtered logits
-back to the host for sampling.
+The `--verbose` flag can be added to see more detailed output and also show
+that the backend samplers are being used.
+
+With `--backend_sampling` enabled, the sampler chain is automatically analyzed
+to determine which samplers can run on the backend. The system finds the longest
+contiguous chain of backend-supported samplers from the start of the sampler
+sequence. For example:
+* If the chain is `top-k -> temperature -> top-p`, and both `top-k` and
+  `temperature` are backend-supported but `top-p` is not, then `top-k` and
+  `temperature` will run on the backend, while `top-p` and subsequent samplers
+  run on the CPU.
+* If all configured samplers are supported, the final distribution sampling will
+  also happen on the backend, transferring only the sampled token IDs back to the
+  host.
+* If the sampler chain starts with an unsupported sampler (e.g., `penalties`),
+  all sampling runs on the CPU.
+
+**Note:** The default sampler chain includes `penalties` as the first sampler,
+which is not backend-supported yet. To use backend sampling, you must explicitly
+configure a sampler chain that starts with backend-supported samplers using
+`--samplers` like shown above.
````
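For the hybrid case the README describes (some samplers on the backend, the rest on the CPU), a command along the following lines should work. This is an untested sketch that simply extends the chain from the README example with `top_p`, which is not backend-supported at the time of writing:

```bash
./llama-batched \
    -m models/Qwen2.5-VL-3B-Instruct-Q8_0.gguf -p "Hello my name is" \
    -np 4 \
    -kvu \
    --backend_sampling \
    --samplers 'top_k;temperature;top_p' \
    --top-k 80
```

Here `top_k` and `temperature` would run on the backend, while `top_p` and the final distribution sampling run on the CPU.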

tools/server/public/index.html.gz

3.12 KB
Binary file not shown.
