|
8 | 8 | import math |
9 | 9 | import json |
10 | 10 | import shutil |
| 11 | +import time |
11 | 12 | from random import randrange |
12 | 13 | import numpy as np |
13 | 14 | from datetime import datetime |
@@ -239,6 +240,21 @@ def get_model_name(): |
239 | 240 | "seq_length": [256], |
240 | 241 | "tokenizer": "MiniMaxAI/MiniMax-M2", |
241 | 242 | }, |
| 243 | + "llama3-8b-lmcache-cpu": { |
| 244 | + "batch_size": [1, 4], |
| 245 | + "seq_length": [256], |
| 246 | + "tokenizer": "TheBloke/Llama-3-8B-fp16" |
| 247 | + }, |
| 248 | + "llama3-8b-lmcache-local-storage": { |
| 249 | + "batch_size": [1, 4], |
| 250 | + "seq_length": [256], |
| 251 | + "tokenizer": "TheBloke/Llama-3-8B-fp16" |
| 252 | + }, |
| 253 | + "llama3-8b-no-lmcache": { |
| 254 | + "batch_size": [1, 4], |
| 255 | + "seq_length": [256], |
| 256 | + "tokenizer": "TheBloke/Llama-3-8B-fp16" |
| 257 | + }, |
242 | 258 | } |
243 | 259 |
|
244 | 260 | vllm_neo_model_spec = { |
@@ -1786,6 +1802,91 @@ def test_text_embedding_model(model, model_spec): |
1786 | 1802 | awscurl_run(req, spec.get("tokenizer"), batch_size) |
1787 | 1803 |
|
1788 | 1804 |
|
def test_handler_lmcache(model, model_spec, is_baseline):
    """Smoke-test LMCache KV-cache reuse for requests sharing a long prefix.

    A warmup request populates the cache with a long shared prefix; a second
    request reusing the same prefix should then be served faster.  Both
    latencies and their ratio are logged for inspection (no hard assertion,
    since absolute timings vary by host).

    :param model: key into ``model_spec`` naming the model under test.
    :param model_spec: mapping of model name -> test configuration.
    :param is_baseline: accepted for caller compatibility; the request flow
        is identical with or without LMCache, so it is currently unused.
    """
    modelspec_checker(model, model_spec)
    spec = model_spec[args.model]
    if "worker" in spec:
        check_worker_number(spec["worker"])

    paragraph = """In the realm of artificial intelligence and machine learning, large language models have revolutionized
    the way we process and understand natural language. These sophisticated neural networks, trained on vast amounts of text data,
    can generate coherent responses, answer questions, and even engage in creative tasks. The architecture underlying these models
    typically involves transformer networks with attention mechanisms that allow the model to focus on relevant parts of the input.
    As these models scale to billions of parameters, they demonstrate emergent capabilities that weren't explicitly programmed.
    However, this scaling comes with significant computational costs, particularly in terms of memory and processing power required
    for inference. Key-value caching mechanisms have become essential for optimizing the performance of these models during generation,
    storing intermediate computations to avoid redundant calculations. This is especially important in scenarios where multiple requests
    share common prefixes or context, as the cached states can be reused across different queries."""
    # Repeat the paragraph so the prompt is long enough for prefix caching
    # to have a measurable effect.
    shared_prefix = " ".join([paragraph] * 40)
    # Greedy decoding (temperature 0) with a fixed seed keeps outputs stable.
    params = {"max_new_tokens": 50, "temperature": 0, "seed": 42}

    warmup_req = {"inputs": shared_prefix + " Warmup?", "parameters": params}
    LOGGER.info("Warmup: Populating cache with shared prefix")
    # perf_counter() is a monotonic clock, so the measured interval cannot be
    # skewed by wall-clock adjustments the way time.time() can.
    start = time.perf_counter()
    send_json(warmup_req)
    warmup_time = time.perf_counter() - start
    # Brief pause so the backend can finish committing the cache entry.
    time.sleep(1)

    test_req = {"inputs": shared_prefix + " Test query?", "parameters": params}
    LOGGER.info("Test: Sending request with shared prefix")
    start = time.perf_counter()
    send_json(test_req)
    test_time = time.perf_counter() - start

    # Guard against division by zero on a pathologically fast response.
    speedup = warmup_time / test_time if test_time > 0 else 0
    LOGGER.info(
        f"Warmup time: {warmup_time:.2f}s, Test time: {test_time:.2f}s, Speedup: {speedup:.2f}x"
    )
| 1840 | + |
| 1841 | + |
def test_handler_lmcache_performance(model, model_spec, *, concurrency=100):
    """Measure LMCache throughput under concurrent shared-prefix load.

    Populates the cache with one warmup request, then fires ``concurrency``
    simultaneous requests that all share the same long prefix, and logs the
    TTFT percentiles and TPS reported by awscurl in ``benchmark.json``.

    :param model: key into ``model_spec`` naming the model under test.
    :param model_spec: mapping of model name -> test configuration.
    :param concurrency: number of concurrent requests to issue
        (keyword-only; defaults to 100 to preserve the original behavior).
    """
    modelspec_checker(model, model_spec)
    spec = model_spec[args.model]
    if "worker" in spec:
        check_worker_number(spec["worker"])

    paragraph = """In the realm of artificial intelligence and machine learning, large language models have revolutionized
    the way we process and understand natural language. These sophisticated neural networks, trained on vast amounts of text data,
    can generate coherent responses, answer questions, and even engage in creative tasks. The architecture underlying these models
    typically involves transformer networks with attention mechanisms that allow the model to focus on relevant parts of the input.
    As these models scale to billions of parameters, they demonstrate emergent capabilities that weren't explicitly programmed.
    However, this scaling comes with significant computational costs, particularly in terms of memory and processing power required
    for inference. Key-value caching mechanisms have become essential for optimizing the performance of these models during generation,
    storing intermediate computations to avoid redundant calculations. This is especially important in scenarios where multiple requests
    share common prefixes or context, as the cached states can be reused across different queries."""
    # Repeat the paragraph so the prompt is long enough for prefix caching
    # to have a measurable effect.
    shared_prefix = " ".join([paragraph] * 40)
    # Greedy decoding (temperature 0) with a fixed seed keeps outputs stable.
    params = {"max_new_tokens": 50, "temperature": 0, "seed": 42}
    warmup_req = {"inputs": shared_prefix + " Warmup?", "parameters": params}
    LOGGER.info("Warmup: Populating LMCache with shared prefix")
    send_json(warmup_req)
    # Brief pause so the backend can finish committing the cache entry.
    time.sleep(2)

    # Each request shares the prefix but ends with a distinct query so the
    # backend cannot collapse them into identical requests.
    shared_reqs = [{
        "inputs": shared_prefix + f" Query {i}?",
        "parameters": params
    } for i in range(concurrency)]

    LOGGER.info(
        f"Performance test: {concurrency} concurrent requests with shared prefix"
    )
    awscurl_run(shared_reqs,
                spec.get("tokenizer"),
                concurrency=concurrency,
                num_run=1,
                json_results=True,
                dataset=True)

    # awscurl writes aggregate latency/throughput metrics to benchmark.json.
    with open("benchmark.json") as f:
        metrics = json.load(f)
    p50_ttft = metrics["p50TimeToFirstByte"]
    p90_ttft = metrics["p90TimeToFirstByte"]
    tps = metrics["tps"]
    LOGGER.info(
        f"Results: P50 TTFT={p50_ttft:.2f}ms, P90 TTFT={p90_ttft:.2f}ms, TPS={tps:.2f}"
    )
| 1888 | + |
| 1889 | + |
1789 | 1890 | def test_handler_stateful(model, model_spec): |
1790 | 1891 | if model not in model_spec: |
1791 | 1892 | raise ValueError( |
@@ -1899,6 +2000,10 @@ def run(raw_args): |
1899 | 2000 | test_handler_rolling_batch_chat(args.model, vllm_chat_model_spec) |
1900 | 2001 | elif args.handler == "vllm_tool": |
1901 | 2002 | test_handler_rolling_batch_tool(args.model, vllm_tool_model_spec) |
| 2003 | + elif args.handler == "vllm_lmcache": |
| 2004 | + test_handler_lmcache(args.model, vllm_model_spec, False) |
| 2005 | + elif args.handler == "vllm_lmcache_performance": |
| 2006 | + test_handler_lmcache_performance(args.model, vllm_model_spec) |
1902 | 2007 | elif args.handler == "vllm_neo": |
1903 | 2008 | test_handler_rolling_batch(args.model, vllm_neo_model_spec) |
1904 | 2009 | elif args.handler == "handler_performance": |
|
0 commit comments