
Commit 35d08e7

[BodhiApp] changes for bodhiapp.
[21-dec-24]
- using prompt if passed in chat completions, and not using messages
- added python integration tests for changes in server
- having add_special as request param to pass to upstream, allows pre-formatted chat messages to not be formatted again
- modified workflow to download and cache the llama2-7b model used for integration testing

[17-jan-25]
- updated to latest llama.cpp; server.cpp had changes where ctx_server.vocab was used instead of the earlier ctx_server.ctx
1 parent 3edfa7d commit 35d08e7

File tree

5 files changed, +156 -1 lines changed

.github/workflows/server.yml

Lines changed: 39 additions & 0 deletions
@@ -71,6 +71,26 @@ jobs:
       with:
         python-version: '3.11'
 
+      - name: envs
+        shell: bash
+        run: |
+          echo "USER_HOME=$HOME" >> $GITHUB_ENV
+
+      - name: Cache HuggingFace models
+        uses: actions/cache@v4
+        id: cache-hf
+        with:
+          path: ${{ env.USER_HOME }}/.cache/huggingface
+          key: hf-cache-llama2-7b-chat
+          enableCrossOsArchive: true
+
+      - name: Check and Download Llama model
+        if: steps.cache-hf.outputs.cache-hit != 'true'
+        run: |
+          python -m pip install -U pip
+          python -m pip install -U "huggingface_hub[cli]"
+          huggingface-cli download --revision 191239b3e26b2882fb562ffccdd1cf0f65402adb TheBloke/Llama-2-7B-Chat-GGUF llama-2-7b-chat.Q4_K_M.gguf
+
       - name: Tests dependencies
         id: test_dependencies
         run: |

@@ -180,6 +200,25 @@ jobs:
         run: |
           cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
 
+      - name: Set environment variables
+        shell: pwsh
+        run: |
+          echo "USER_HOME=${HOME}" >> $env:GITHUB_ENV
+
+      - name: Cache HuggingFace models
+        uses: actions/cache@v4
+        id: cache-hf
+        with:
+          path: ${{ env.USER_HOME }}\.cache\huggingface
+          key: hf-cache-Windows-llama2-7b-chat
+
+      - name: Check and Download Llama model
+        if: steps.cache-hf.outputs.cache-hit != 'true'
+        run: |
+          python -m pip install -U pip
+          python -m pip install -U "huggingface_hub[cli]"
+          huggingface-cli download --revision 191239b3e26b2882fb562ffccdd1cf0f65402adb TheBloke/Llama-2-7B-Chat-GGUF llama-2-7b-chat.Q4_K_M.gguf
+
       - name: Tests
         id: server_integration_tests
         if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
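
The workflow pins the model to a specific revision so the cache key stays stable across runs. To fetch the same snapshot locally before running the integration tests, a rough Python equivalent of the huggingface-cli step above — a minimal sketch, assuming the huggingface_hub package is installed — is:

# Sketch: download the same pinned Llama-2 GGUF snapshot that the CI workflow caches.
# Assumes huggingface_hub is installed; the file lands in the default HF cache under
# ~/.cache/huggingface/hub, which is the location the test preset expects.
from huggingface_hub import hf_hub_download

model_path = hf_hub_download(
    repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
    filename="llama-2-7b-chat.Q4_K_M.gguf",
    revision="191239b3e26b2882fb562ffccdd1cf0f65402adb",
)
print(model_path)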

examples/server/server.cpp

Lines changed: 4 additions & 1 deletion
@@ -3646,7 +3646,10 @@ int main(int argc, char ** argv) {
         std::vector<server_task> tasks;
 
         try {
-            std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, data.at("prompt"), true, true);
+            const bool add_special = json_value(data, "add_special", true);
+            const bool with_pieces = json_value(data, "with_pieces", true);
+
+            std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, data.at("prompt"), add_special, with_pieces);
             tasks.reserve(tokenized_prompts.size());
             for (size_t i = 0; i < tokenized_prompts.size(); i++) {
                 server_task task = server_task(type);
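
The two boolean arguments to tokenize_input_prompts were previously hard-coded to true; they are now read from the request body via json_value with the same defaults, so a client sending a pre-formatted prompt can set add_special to false and avoid having special tokens (such as BOS) added a second time. A minimal sketch of such a request — assuming a server listening on localhost:8080 and the requests package installed — is:

# Sketch: send a pre-formatted prompt to the server's /completion endpoint.
# Assumes a llama.cpp server on http://localhost:8080 and the `requests` package.
import requests

payload = {
    # The prompt already starts with "<s>" and carries the Llama-2 chat markers,
    # so ask the server not to add special tokens again during tokenization.
    "prompt": "<s>[INST] What day comes after Monday? [/INST]",
    "add_special": False,
    "n_predict": 16,
}
res = requests.post("http://localhost:8080/completion", json=payload)
print(res.json()["content"])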
Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
+import pytest
+from utils import *
+
+server = ServerPreset.llama2()
+
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.llama2()
+
+
+@pytest.mark.parametrize(
+    "model,data,max_tokens,re_content,n_prompt,n_predicted,finish_reason, prompt",
+    [
+        (
+            "llama2",
+            {
+                "messages": [
+                    {"role": "system", "content": "You are a helpful assistant."},
+                    {"role": "user", "content": "What day comes after Monday?"},
+                ]
+            },
+            16,
+            "(Tuesday)+",
+            56,
+            8,
+            "stop",
+            """<s> <|im_start|>system
+You are a helpful assistant.<|im_end|>
+<|im_start|>user
+What day comes after Monday?<|im_end|>
+<|im_start|>assistant
+""",
+        ),
+        (
+            "llama2",
+            {
+                "prompt": """<s>[INST] <<SYS>>
+You are a helpful assistant.
+<</SYS>>
+
+What day comes after Monday? [/INST]""",
+                "add_special": False,
+            },
+            1024,
+            "(Tuesday)+",
+            33,
+            25,
+            "stop",
+            """<s> [INST] <<SYS>>
+You are a helpful assistant.
+<</SYS>>
+
+What day comes after Monday? [/INST]""",
+        ),
+    ],
+)
+def test_chat_completion_without_preformatted_prompt(
+    model, data, max_tokens, re_content, n_prompt, n_predicted, finish_reason, prompt
+):
+    global server
+    server.start()
+    res = server.make_request(
+        "POST",
+        "/chat/completions",
+        data={
+            "model": model,
+            "max_tokens": max_tokens,
+            **data,
+        },
+    )
+    assert res.status_code == 200
+    assert (
+        "cmpl" in res.body["id"]
+    )  # make sure the completion id has the expected format
+    assert res.body["model"] == model
+    # assert res.body["usage"]["prompt_tokens"] == n_prompt
+    # assert res.body["usage"]["completion_tokens"] == n_predicted
+    choice = res.body["choices"][0]
+    assert "assistant" == choice["message"]["role"]
+    assert match_regex(re_content, choice["message"]["content"])
+    assert choice["finish_reason"] == finish_reason
+    assert res.body["__verbose"]["prompt"] == prompt

examples/server/tests/utils.py

Lines changed: 23 additions & 0 deletions
@@ -333,6 +333,29 @@ def jina_reranker_tiny() -> ServerProcess:
         server.server_reranking = True
         return server
 
+    @staticmethod
+    def llama2() -> ServerProcess:
+        server = ServerProcess()
+        server.model_file = os.path.join(
+            os.path.expanduser("~"),
+            ".cache",
+            "huggingface",
+            "hub",
+            "models--TheBloke--Llama-2-7B-Chat-GGUF",
+            "snapshots",
+            "191239b3e26b2882fb562ffccdd1cf0f65402adb",
+            "llama-2-7b-chat.Q4_K_M.gguf",
+        )
+        server.debug = True
+        server.model_hf_repo = None
+        server.model_hf_file = None
+        server.model_alias = "llama2"
+        server.n_ctx = 2048
+        server.n_batch = 32
+        server.n_slots = 2
+        server.n_predict = 2048
+        server.seed = 42
+        return server
 
 def parallel_function_calls(function_list: List[Tuple[Callable[..., Any], Tuple[Any, ...]]]) -> List[Any]:
     """

examples/server/utils.hpp

Lines changed: 6 additions & 0 deletions
@@ -582,8 +582,14 @@ static json oaicompat_chat_completion_params_parse(
     const std::string & chat_template) {
     json llama_params;
 
+    std::string prompt = json_value(body, "prompt", std::string(""));
+    if (prompt != "") {
+        LOG_WRN("Using prompt from body '%s'", prompt.c_str());
+        llama_params["prompt"] = prompt;
+    } else {
     // Apply chat template to the list of messages
     llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
+    }
 
     // Handle "stop" field
     if (body.contains("stop") && body.at("stop").is_string()) {
