diff --git a/serving/docs/lmi/user_guides/vllm_user_guide.md b/serving/docs/lmi/user_guides/vllm_user_guide.md index 81bb76020..501fef5d3 100644 --- a/serving/docs/lmi/user_guides/vllm_user_guide.md +++ b/serving/docs/lmi/user_guides/vllm_user_guide.md @@ -182,7 +182,7 @@ These are supported in LMI. For example, if you want to enable the `speculative_config`, you can do: * `option.speculative_config={"model": "meta-llama/Llama3.2-1B-Instruct", "num_speculative_tokens": 5}` -* `OPTION_SPECULATIVE_CONFIG={"model": "meta-llama/Llama3.2-1B-Instruct", "num_speculative_tokens": 5}` +* `OPTION_SPECULATIVE_CONFIG='{"model": "meta-llama/Llama3.2-1B-Instruct", "num_speculative_tokens": 5}'` ## Custom Handlers diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py index 0d7b0caf2..e02703743 100644 --- a/tests/integration/llm/client.py +++ b/tests/integration/llm/client.py @@ -284,10 +284,10 @@ def get_model_name(): "seq_length": [256], "tokenizer": "JackFram/llama-68m" }, - "llama-68m-speculative-eagle": { + "llama3-1-8b-speculative-eagle": { "batch_size": [1, 4], "seq_length": [256], - "tokenizer": "JackFram/llama-68m" + "tokenizer": "unsloth/Meta-Llama-3.1-8B" }, "llama-7b-unmerged-lora": { "batch_size": [3], diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py index 9904822fd..33cde8ae6 100644 --- a/tests/integration/llm/prepare.py +++ b/tests/integration/llm/prepare.py @@ -366,19 +366,17 @@ "llama-68m-speculative-medusa": { "option.model_id": "s3://djl-llm/llama-68m/", "option.task": "text-generation", - "option.speculative_model": "abhigoyal/vllm-medusa-llama-68m-random", - "option.num_speculative_tokens": 4, - "option.use_v2_block_manager": True, + "option.speculative_config": + '{"method":"medusa","model":"abhigoyal/vllm-medusa-llama-68m-random","num_speculative_tokens":4}', "option.tensor_parallel_degree": 1, "option.max_rolling_batch_size": 4, }, - "llama-68m-speculative-eagle": { - "option.model_id": "s3://djl-llm/llama-68m/", + "llama3-1-8b-speculative-eagle": { + "option.model_id": "s3://djl-llm/llama-3.1-8b-hf/", "option.task": "text-generation", - "option.speculative_model": "abhigoyal/vllm-eagle-llama-68m-random", - "option.num_speculative_tokens": 4, - "option.use_v2_block_manager": True, - "option.tensor_parallel_degree": 1, + "option.speculative_config": + '{"method":"eagle","model":"yuhuili/EAGLE-LLaMA3.1-Instruct-8B","num_speculative_tokens":4}', + "option.tensor_parallel_degree": 4, "option.max_rolling_batch_size": 4, }, "llama-7b-unmerged-lora": { diff --git a/tests/integration/tests.py b/tests/integration/tests.py index 91ed6c9db..fe387ffbe 100644 --- a/tests/integration/tests.py +++ b/tests/integration/tests.py @@ -409,11 +409,11 @@ def test_llama_68m_speculative_medusa(self): @pytest.mark.gpu_4 class TestVllm2: - def test_llama_68m_speculative_eagle(self): - with Runner('lmi', 'llama-68m-speculative-eagle') as r: - prepare.build_vllm_async_model("llama-68m-speculative-eagle") + def test_llama3_1_8b_speculative_eagle(self): + with Runner('lmi', 'llama3-1-8b-speculative-eagle') as r: + prepare.build_vllm_async_model("llama3-1-8b-speculative-eagle") r.launch() - client.run("vllm llama-68m-speculative-eagle".split()) + client.run("vllm llama3-1-8b-speculative-eagle".split()) def test_llama3_1_8b_instruct_tool(self): with Runner('lmi', 'llama3-1-8b-instruct-tool') as r: