2 files changed: +7 −0 lines changed

@@ -366,6 +366,8 @@ extern "C" {
366 366     bool no_perf;    // measure performance timings
367 367     bool op_offload; // offload host tensor operations to device
368 368     bool swa_full;   // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
    369 +                    // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
    370 +                    // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
369 371 };
370 372
371 373 // model quantization parameters
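The comment added to the swa_full field above records the caveat this change is about: leaving the full-size SWA cache disabled while serving more than one sequence can hit a slow path in some cases. Below is a minimal caller-side sketch of the recommended configuration; it assumes the public C API names of this period (llama_model_load_from_file, llama_init_from_model, and related calls) and uses a placeholder model path.

#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init();

    // Load a model (the path is a placeholder).
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.n_seq_max = 4;    // several parallel sequences ...
    cparams.swa_full  = true; // ... so keep the full-size SWA cache, per the NOTE above

    llama_context * ctx = llama_init_from_model(model, cparams);
    if (ctx == nullptr) {
        llama_model_free(model);
        return 1;
    }

    // ... usual batch/decode work ...

    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();
    return 0;
}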
@@ -123,6 +123,11 @@ llama_context::llama_context(
123 123             __func__, n_ctx_per_seq, hparams.n_ctx_train);
124 124     }
125 125
    126 +     if (!params.swa_full && cparams.n_seq_max > 1) {
    127 +         LLAMA_LOG_WARN("%s: requested n_seq_max (%u) > 1, but swa_full is not enabled -- performance may be degraded: %s\n",
    128 +                 __func__, cparams.n_seq_max, "https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573");
    129 +     }
    130 +
126 131     if (!hparams.vocab_only) {
127 132         // GPU backends
128 133         for (auto * dev : model.devices) {
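With a configuration like the sketch above but swa_full left at false, this new guard fires once at context creation. Filling in the format string's placeholders (and noting that with GCC/Clang, __func__ inside a constructor resolves to the class name), the log line would look roughly like:

llama_context: requested n_seq_max (4) > 1, but swa_full is not enabled -- performance may be degraded: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573

Note this is a warning rather than a hard error: the combination remains valid, it just risks the degraded-performance cases discussed in the linked comment.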