Skip to content

Commit d2fcd91

Browse files
authored
server : disable context shift by default (ggml-org#15416)
* server : disable context shift by default ggml-ci * server : make scope of test parameters local
1 parent a6d3cfe commit d2fcd91

16 files changed

+27
-20
lines changed

common/arg.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1530,6 +1530,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
15301530
params.ctx_shift = false;
15311531
}
15321532
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
1533+
add_opt(common_arg(
1534+
{"--context-shift"},
1535+
string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
1536+
[](common_params & params) {
1537+
params.ctx_shift = true;
1538+
}
1539+
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
15331540
add_opt(common_arg(
15341541
{"--chunks"}, "N",
15351542
string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),

common/common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -375,7 +375,7 @@ struct common_params {
375375
bool cont_batching = true; // insert new sequences for decoding on-the-fly
376376
bool flash_attn = false; // flash attention
377377
bool no_perf = false; // disable performance metrics
378-
bool ctx_shift = true; // context shift on infinite text generation
378+
bool ctx_shift = false; // context shift on infinite text generation
379379
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
380380
bool kv_unified = false; // enable unified KV cache
381381

tools/server/tests/unit/test_basic.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
server = ServerPreset.tinyllama2()
66

77

8-
@pytest.fixture(scope="module", autouse=True)
8+
@pytest.fixture(autouse=True)
99
def create_server():
1010
global server
1111
server = ServerPreset.tinyllama2()

tools/server/tests/unit/test_completion.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
server = ServerPreset.tinyllama2()
88

99

10-
@pytest.fixture(scope="module", autouse=True)
10+
@pytest.fixture(autouse=True)
1111
def create_server():
1212
global server
1313
server = ServerPreset.tinyllama2()
@@ -229,7 +229,7 @@ def test_nocache_long_input_prompt():
229229
"temperature": 1.0,
230230
"cache_prompt": False,
231231
})
232-
assert res.status_code == 200
232+
assert res.status_code == 400
233233

234234

235235
def test_completion_with_tokens_input():

tools/server/tests/unit/test_ctx_shift.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
1212
""".strip()
1313

14-
@pytest.fixture(scope="module", autouse=True)
14+
@pytest.fixture(autouse=True)
1515
def create_server():
1616
global server
1717
server = ServerPreset.tinyllama2()
@@ -25,6 +25,7 @@ def test_ctx_shift_enabled():
2525
# the prompt is truncated to keep the last 109 tokens
2626
# 64 tokens are generated thanks to shifting the context when it gets full
2727
global server
28+
server.enable_ctx_shift = True
2829
server.start()
2930
res = server.make_request("POST", "/completion", data={
3031
"n_predict": 64,
@@ -42,7 +43,6 @@ def test_ctx_shift_enabled():
4243
])
4344
def test_ctx_shift_disabled_short_prompt(n_predict: int, n_token_output: int, truncated: bool):
4445
global server
45-
server.disable_ctx_shift = True
4646
server.n_predict = -1
4747
server.start()
4848
res = server.make_request("POST", "/completion", data={
@@ -56,7 +56,6 @@ def test_ctx_shift_disabled_short_prompt(n_predict: int, n_token_output: int, tr
5656

5757
def test_ctx_shift_disabled_long_prompt():
5858
global server
59-
server.disable_ctx_shift = True
6059
server.start()
6160
res = server.make_request("POST", "/completion", data={
6261
"n_predict": 64,
@@ -68,7 +67,6 @@ def test_ctx_shift_disabled_long_prompt():
6867

6968
def test_ctx_shift_disabled_stream():
7069
global server
71-
server.disable_ctx_shift = True
7270
server.start()
7371
res = server.make_stream_request("POST", "/v1/completions", data={
7472
"n_predict": 256,

tools/server/tests/unit/test_embedding.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
EPSILON = 1e-3
1010

11-
@pytest.fixture(scope="module", autouse=True)
11+
@pytest.fixture(autouse=True)
1212
def create_server():
1313
global server
1414
server = ServerPreset.bert_bge_small()

tools/server/tests/unit/test_infill.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
server = ServerPreset.tinyllama_infill()
55

6-
@pytest.fixture(scope="module", autouse=True)
6+
@pytest.fixture(autouse=True)
77
def create_server():
88
global server
99
server = ServerPreset.tinyllama_infill()

tools/server/tests/unit/test_lora.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf"
77

8-
@pytest.fixture(scope="module", autouse=True)
8+
@pytest.fixture(autouse=True)
99
def create_server():
1010
global server
1111
server = ServerPreset.stories15m_moe()

tools/server/tests/unit/test_rerank.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
server = ServerPreset.jina_reranker_tiny()
55

66

7-
@pytest.fixture(scope="module", autouse=True)
7+
@pytest.fixture(autouse=True)
88
def create_server():
99
global server
1010
server = ServerPreset.jina_reranker_tiny()

tools/server/tests/unit/test_security.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
TEST_API_KEY = "sk-this-is-the-secret-key"
88

9-
@pytest.fixture(scope="module", autouse=True)
9+
@pytest.fixture(autouse=True)
1010
def create_server():
1111
global server
1212
server = ServerPreset.tinyllama2()

0 commit comments

Comments (0)