
Commit 35d08e7

[BodhiApp] changes for bodhiapp.
[21-dec-24]
- using prompt if passed in chat completions, and not using messages
- added python integration tests for changes in server
- having add_special as request param to pass to upstream, allows pre-formatted chat messages to not be formatted again
- modified workflow to download and cache the llama2-7b model used for integration testing

[17-jan-25]
- updated to latest llama.cpp; server.cpp had changes where ctx_server.vocab was used instead of the earlier ctx_server.ctx
1 parent 3edfa7d commit 35d08e7

File tree

5 files changed, +156 -1 lines changed

.github/workflows/server.yml

Lines changed: 39 additions & 0 deletions
@@ -71,6 +71,26 @@ jobs:
       with:
         python-version: '3.11'
 
+      - name: envs
+        shell: bash
+        run: |
+          echo "USER_HOME=$HOME" >> $GITHUB_ENV
+
+      - name: Cache HuggingFace models
+        uses: actions/cache@v4
+        id: cache-hf
+        with:
+          path: ${{ env.USER_HOME }}/.cache/huggingface
+          key: hf-cache-llama2-7b-chat
+          enableCrossOsArchive: true
+
+      - name: Check and Download Llama model
+        if: steps.cache-hf.outputs.cache-hit != 'true'
+        run: |
+          python -m pip install -U pip
+          python -m pip install -U "huggingface_hub[cli]"
+          huggingface-cli download --revision 191239b3e26b2882fb562ffccdd1cf0f65402adb TheBloke/Llama-2-7B-Chat-GGUF llama-2-7b-chat.Q4_K_M.gguf
+
       - name: Tests dependencies
         id: test_dependencies
         run: |

@@ -180,6 +200,25 @@ jobs:
         run: |
           cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
 
+      - name: Set environment variables
+        shell: pwsh
+        run: |
+          echo "USER_HOME=${HOME}" >> $env:GITHUB_ENV
+
+      - name: Cache HuggingFace models
+        uses: actions/cache@v4
+        id: cache-hf
+        with:
+          path: ${{ env.USER_HOME }}\.cache\huggingface
+          key: hf-cache-Windows-llama2-7b-chat
+
+      - name: Check and Download Llama model
+        if: steps.cache-hf.outputs.cache-hit != 'true'
+        run: |
+          python -m pip install -U pip
+          python -m pip install -U "huggingface_hub[cli]"
+          huggingface-cli download --revision 191239b3e26b2882fb562ffccdd1cf0f65402adb TheBloke/Llama-2-7B-Chat-GGUF llama-2-7b-chat.Q4_K_M.gguf
+
       - name: Tests
         id: server_integration_tests
         if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
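
The workflow pins the model to a specific revision so the cache key stays stable across runs. To fetch the same snapshot locally before running the integration tests, a rough Python equivalent of the huggingface-cli step above — a minimal sketch, assuming the huggingface_hub package is installed — is:

# Sketch: download the same pinned Llama-2 GGUF snapshot that the CI workflow caches.
# Assumes huggingface_hub is installed; the file lands in the default HF cache under
# ~/.cache/huggingface/hub, which is the location the test preset expects.
from huggingface_hub import hf_hub_download

model_path = hf_hub_download(
    repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
    filename="llama-2-7b-chat.Q4_K_M.gguf",
    revision="191239b3e26b2882fb562ffccdd1cf0f65402adb",
)
print(model_path)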

examples/server/server.cpp

Lines changed: 4 additions & 1 deletion
@@ -3646,7 +3646,10 @@ int main(int argc, char ** argv) {
         std::vector<server_task> tasks;
 
         try {
-            std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, data.at("prompt"), true, true);
+            const bool add_special = json_value(data, "add_special", true);
+            const bool with_pieces = json_value(data, "with_pieces", true);
+
+            std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, data.at("prompt"), add_special, with_pieces);
             tasks.reserve(tokenized_prompts.size());
             for (size_t i = 0; i < tokenized_prompts.size(); i++) {
                 server_task task = server_task(type);
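
The two boolean arguments to tokenize_input_prompts were previously hard-coded to true; they are now read from the request body via json_value with the same defaults, so a client sending a pre-formatted prompt can set add_special to false and avoid having special tokens (such as BOS) added a second time. A minimal sketch of such a request — assuming a server listening on localhost:8080 and the requests package installed — is:

# Sketch: send a pre-formatted prompt to the server's /completion endpoint.
# Assumes a llama.cpp server on http://localhost:8080 and the `requests` package.
import requests

payload = {
    # The prompt already starts with "<s>" and carries the Llama-2 chat markers,
    # so ask the server not to add special tokens again during tokenization.
    "prompt": "<s>[INST] What day comes after Monday? [/INST]",
    "add_special": False,
    "n_predict": 16,
}
res = requests.post("http://localhost:8080/completion", json=payload)
print(res.json()["content"])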
Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
+import pytest
+from utils import *
+
+server = ServerPreset.llama2()
+
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.llama2()
+
+
+@pytest.mark.parametrize(
+    "model,data,max_tokens,re_content,n_prompt,n_predicted,finish_reason, prompt",
+    [
+        (
+            "llama2",
+            {
+                "messages": [
+                    {"role": "system", "content": "You are a helpful assistant."},
+                    {"role": "user", "content": "What day comes after Monday?"},
+                ]
+            },
+            16,
+            "(Tuesday)+",
+            56,
+            8,
+            "stop",
+            """<s> <|im_start|>system
+You are a helpful assistant.<|im_end|>
+<|im_start|>user
+What day comes after Monday?<|im_end|>
+<|im_start|>assistant
+""",
+        ),
+        (
+            "llama2",
+            {
+                "prompt": """<s>[INST] <<SYS>>
+You are a helpful assistant.
+<</SYS>>
+
+What day comes after Monday? [/INST]""",
+                "add_special": False,
+            },
+            1024,
+            "(Tuesday)+",
+            33,
+            25,
+            "stop",
+            """<s> [INST] <<SYS>>
+You are a helpful assistant.
+<</SYS>>
+
+What day comes after Monday? [/INST]""",
+        ),
+    ],
+)
+def test_chat_completion_without_preformatted_prompt(
+    model, data, max_tokens, re_content, n_prompt, n_predicted, finish_reason, prompt
+):
+    global server
+    server.start()
+    res = server.make_request(
+        "POST",
+        "/chat/completions",
+        data={
+            "model": model,
+            "max_tokens": max_tokens,
+            **data,
+        },
+    )
+    assert res.status_code == 200
+    assert (
+        "cmpl" in res.body["id"]
+    )  # make sure the completion id has the expected format
+    assert res.body["model"] == model
+    # assert res.body["usage"]["prompt_tokens"] == n_prompt
+    # assert res.body["usage"]["completion_tokens"] == n_predicted
+    choice = res.body["choices"][0]
+    assert "assistant" == choice["message"]["role"]
+    assert match_regex(re_content, choice["message"]["content"])
+    assert choice["finish_reason"] == finish_reason
+    assert res.body["__verbose"]["prompt"] == prompt

examples/server/tests/utils.py

Lines changed: 23 additions & 0 deletions
@@ -333,6 +333,29 @@ def jina_reranker_tiny() -> ServerProcess:
         server.server_reranking = True
         return server
 
+    @staticmethod
+    def llama2() -> ServerProcess:
+        server = ServerProcess()
+        server.model_file = os.path.join(
+            os.path.expanduser("~"),
+            ".cache",
+            "huggingface",
+            "hub",
+            "models--TheBloke--Llama-2-7B-Chat-GGUF",
+            "snapshots",
+            "191239b3e26b2882fb562ffccdd1cf0f65402adb",
+            "llama-2-7b-chat.Q4_K_M.gguf",
+        )
+        server.debug = True
+        server.model_hf_repo = None
+        server.model_hf_file = None
+        server.model_alias = "llama2"
+        server.n_ctx = 2048
+        server.n_batch = 32
+        server.n_slots = 2
+        server.n_predict = 2048
+        server.seed = 42
+        return server
 
 def parallel_function_calls(function_list: List[Tuple[Callable[..., Any], Tuple[Any, ...]]]) -> List[Any]:
     """

examples/server/utils.hpp

Lines changed: 6 additions & 0 deletions
@@ -582,8 +582,14 @@ static json oaicompat_chat_completion_params_parse(
     const std::string & chat_template) {
     json llama_params;
 
+    std::string prompt = json_value(body, "prompt", std::string(""));
+    if (prompt != "") {
+        LOG_WRN("Using prompt from body '%s'", prompt.c_str());
+        llama_params["prompt"] = prompt;
+    } else {
     // Apply chat template to the list of messages
     llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
+    }
 
     // Handle "stop" field
     if (body.contains("stop") && body.at("stop").is_string()) {
