Commit 52d537b

llama : adjust default context size + print warnings
ggml-ci
1 parent a6744e4

File tree

2 files changed (+21, -7 lines)

common/common.h

Lines changed: 1 addition & 1 deletion
@@ -155,7 +155,7 @@ struct common_sampler_params {
 
 struct common_params {
     int32_t n_predict = -1; // new tokens to predict
-    int32_t n_ctx = 0; // context size
+    int32_t n_ctx = 4096; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
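
The net effect of this hunk: common_params.n_ctx now defaults to 4096 tokens instead of 0, where 0 previously meant "take the context size from the model" (llama.h documents n_ctx as "text context, 0 = from model"). A minimal caller-side sketch of how an application could opt back into the model's full training context under the new default; pick_n_ctx is a hypothetical helper, while llama_n_ctx_train is the llama.h getter at this point in the tree:

// Hypothetical helper: restore the old "use the model's training context"
// behavior now that the default is a fixed 4096 tokens.
#include <cstdint>
#include "llama.h"

static uint32_t pick_n_ctx(const struct llama_model * model, int32_t n_ctx_param) {
    if (n_ctx_param <= 0) {
        // the pre-change default (0) fell back to the model's training context
        return (uint32_t) llama_n_ctx_train(model);
    }
    return (uint32_t) n_ctx_param;
}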

src/llama.cpp

Lines changed: 20 additions & 6 deletions
@@ -19440,12 +19440,26 @@ struct llama_context * llama_new_context_with_model(
         cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
     }
 
-    LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
-    LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
-    LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
-    LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
-    LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
-    LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
+    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
+
+    LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
+    LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
+    LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq);
+    LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
+    LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
+    LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
+    LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
+    LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
+
+    if (n_ctx_per_seq < hparams.n_ctx_train) {
+        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
+                __func__, n_ctx_per_seq, hparams.n_ctx_train);
+    }
+
+    if (n_ctx_per_seq > hparams.n_ctx_train) {
+        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+                __func__, n_ctx_per_seq, hparams.n_ctx_train);
+    }
 
     ctx->abort_callback = params.abort_callback;
     ctx->abort_callback_data = params.abort_callback_data;
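
This hunk derives a per-sequence context, n_ctx_per_seq = n_ctx / n_seq_max, and warns when it differs from the model's training context n_ctx_train in either direction. A standalone sketch of the same check with purely illustrative numbers (the new 4096-token default split across 4 parallel sequences, against a hypothetical 8192-token training context):

// Standalone sketch of the new warning logic; all values are examples.
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_ctx       = 4096; // new default from common/common.h
    const uint32_t n_seq_max   = 4;    // hypothetical number of parallel sequences
    const uint32_t n_ctx_train = 8192; // hypothetical training context of the model

    const uint32_t n_ctx_per_seq = n_ctx / n_seq_max; // 4096 / 4 = 1024

    if (n_ctx_per_seq < n_ctx_train) {
        printf("warn: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
               n_ctx_per_seq, n_ctx_train);
    }
    if (n_ctx_per_seq > n_ctx_train) {
        printf("warn: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
               n_ctx_per_seq, n_ctx_train);
    }
    return 0; // with these numbers the first warning fires: 1024 < 8192
}

Note that the division is integer division, so an n_ctx that is not a multiple of n_seq_max rounds the per-sequence capacity down.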

0 commit comments
