Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .cd/benchmark/benchmark_defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ model_text:
- Qwen/Qwen2.5-72B-Instruct
- Qwen/Qwen2.5-7B-Instruct
- Qwen/Qwen3-0.6B
- Qwen/Qwen3-30B-A3B-Instruct-2507
- ibm-granite/granite-8b-code-instruct-4k
- ibm-granite/granite-20b-code-instruct-8k
- speakleash/Bielik-4.5B-v3.0-Instruct
Expand Down
5 changes: 4 additions & 1 deletion .cd/benchmark/benchmark_scenarios_text.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,12 @@ qwen25_72b_instruct:
qwen25_7b_instruct:
MODEL: Qwen/Qwen2.5-7B-Instruct

Qwen/Qwen3-0.6B:
qwen3-0.6B:
MODEL: Qwen/Qwen3-0.6B

qwen3-30B-A3B-Instruct-2507:
MODEL: Qwen/Qwen3-30B-A3B-Instruct-2507

granite_8b_code_instruct_4k:
MODEL: ibm-granite/granite-8b-code-instruct-4k

Expand Down
5 changes: 2 additions & 3 deletions .cd/server/server_output.env
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ VLLM_PROMPT_BS_BUCKET_STEP
VLLM_PROMPT_BS_BUCKET_MAX
VLLM_DECODE_BS_BUCKET_MIN
VLLM_DECODE_BS_BUCKET_STEP
VLLM_PROMPT_SEQ_BUCKET_MIN
VLLM_PROMPT_SEQ_BUCKET_STEP
VLLM_PROMPT_QUERY_BUCKET_MIN
VLLM_PROMPT_QUERY_BUCKET_STEP
VLLM_PROMPT_CTX_BUCKET_STEP
VLLM_DECODE_BLOCK_BUCKET_MIN
VLLM_DECODE_BLOCK_BUCKET_STEP
Expand All @@ -19,7 +19,6 @@ VLLM_EXPONENTIAL_BUCKETING
MAX_NUM_BATCHED_TOKENS
PT_HPU_ENABLE_LAZY_COLLECTIVES
GPU_MEM_UTILIZATION
VLLM_GRAPH_PROMPT_RATIO
VLLM_GRAPH_RESERVED_MEM
MAX_NUM_SEQS
VLLM_CONTIGUOUS_PA
Expand Down
2 changes: 1 addition & 1 deletion .cd/server/server_user.env
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ VLLM_DECODE_BLOCK_BUCKET_STEP
VLLM_DECODE_BS_BUCKET_STEP
VLLM_PROMPT_BS_BUCKET_STEP
VLLM_PROMPT_BS_BUCKET_MAX
VLLM_PROMPT_SEQ_BUCKET_STEP
VLLM_PROMPT_QUERY_BUCKET_STEP
VLLM_PROMPT_CTX_BUCKET_STEP
VLLM_SKIP_WARMUP
MAX_MODEL_LEN
Expand Down
41 changes: 21 additions & 20 deletions .cd/server/settings_vllm.csv
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
MODEL,TENSOR_PARALLEL_SIZE,MAX_MODEL_LEN,TOTAL_GPU_MEM,UNAVAILABLE_MEM_ABS,MODEL_MEM_FROM_CONFIG,MODEL_DTYPE,QUANT_DTYPE,MODEL_MEM,PROFILER_MEM_OVERHEAD,APPROX_MEM_PER_GRAPH_MB,GPU_FREE_MEM_TARGET,BLOCK_SIZE,VLLM_PROMPT_BS_BUCKET_MIN,VLLM_PROMPT_BS_BUCKET_STEP,VLLM_DECODE_BS_BUCKET_MIN,VLLM_DECODE_BS_BUCKET_STEP,VLLM_PROMPT_SEQ_BUCKET_MIN,VLLM_PROMPT_SEQ_BUCKET_STEP,VLLM_PROMPT_CTX_BUCKET_STEP,VLLM_DECODE_BLOCK_BUCKET_MIN,VLLM_DECODE_BLOCK_BUCKET_STEP,VLLM_PROMPT_BS_BUCKET_MAX,NUM_HIDDEN_LAYERS,HIDDEN_SIZE,NUM_KEY_VALUE_HEADS,NUM_ATTENTION_HEADS,CACHE_DTYPE_BYTES,LIMIT_MODEL_LEN,PT_HPU_LAZY_MODE,VLLM_SKIP_WARMUP,VLLM_EXPONENTIAL_BUCKETING,MAX_NUM_BATCHED_TOKENS,ENABLE_PREFIX_CACHING,VLLM_CONTIGUOUS_PA,VLLM_DEFRAG,ASYNC_SCHEDULING,VLLM_WEIGHT_LOAD_FORCE_SYNC
meta-llama/Llama-3.1-8B-Instruct,1,4352,128,2,16060522496,2,2,14.95752716,0,10,9,128,1,32,1,32,128,256,1,128,256,1,32,4096,8,32,2,131072,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
meta-llama/Llama-3.1-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,0,20,5,128,1,32,1,32,128,256,1,128,256,1,80,8192,8,64,2,131072,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
meta-llama/Llama-3.3-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,0,20,5,128,1,32,1,32,128,256,1,128,256,1,80,8192,8,64,2,131072,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
meta-llama/Llama-3.2-1B-Instruct,1,4352,128,2,2471645608,2,2,2.301899351,0,5,3,128,1,32,1,32,128,256,1,128,256,1,16,2048,8,32,2,131072,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
meta-llama/Llama-3.2-3B-Instruct,1,4352,128,2,6425499648,2,2,5.984212875,0,10,5,128,1,32,1,32,128,256,1,128,256,1,28,3072,8,24,2,131072,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
mistralai/Mixtral-8x7B-Instruct-v0.1,2,4352,256,2,93405585408,2,2,86.99073029,0,10,5,128,1,32,1,32,128,256,1,128,256,1,32,4096,8,32,2,32768,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
mistralai/Mixtral-8x22B-Instruct-v0.1,4,4352,512,2,2.8126E+11,2,2,261.9439201,0,12,8,128,1,32,1,32,128,256,1,128,256,1,56,6144,8,48,2,65536,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,1
mistralai/Mistral-7B-Instruct-v0.2,1,4352,128,2,14483464192,2,2,13.48877716,0,10,9,128,1,32,1,32,128,256,1,128,256,1,32,4096,8,32,2,32768,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
meta-llama/Llama-3.1-405B-Instruct,8,4352,1024,2,8.11707E+11,2,2,755.9608459,0,20,5,128,1,32,1,32,128,256,1,128,256,1,126,16384,8,128,2,131072,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,1
Qwen/Qwen2.5-14B-Instruct,1,4352,128,2,29540067328,2,2,27.51133156,0,10,12,128,1,32,1,32,128,256,1,128,256,1,48,5120,8,40,2,32768,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,4,4352,512,2,1.41107E+11,2,2,131.4165192,0,20,5,128,1,32,1,32,128,256,1,128,256,1,80,8192,8,64,2,131072,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,0,12,16,128,1,32,1,32,128,256,1,128,256,1,64,5120,8,40,2,32768,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,1
Qwen/Qwen2.5-72B-Instruct,4,4352,512,2,1.45412E+11,2,2,135.4258575,0,10,6,128,1,32,1,32,128,256,1,128,256,1,80,8192,8,64,2,32768,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,1
Qwen/Qwen2.5-7B-Instruct,1,4352,128,2,15231233024,2,2,14.18519115,0,10,5,128,1,32,1,32,128,256,1,128,256,1,28,3584,4,28,2,32768,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
ibm-granite/granite-8b-code-instruct-4k,1,4096,128,2,21474836480,2,2,20,0,10,8,128,1,32,1,32,128,256,1,128,256,1,36,4096,8,32,2,32768,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
ibm-granite/granite-20b-code-instruct-8k,1,4352,128,2,40133986304,2,2,37.37,0,10,4,128,1,32,1,32,128,256,1,128,256,1,52,6144,1,48,2,65536,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
Qwen/Qwen2.5-VL-7B-Instruct,1,8448,128,2,15231233024,2,2,14.18519115,0,12,4,128,1,32,1,32,128,256,1,128,256,1,28,3584,4,28,2,32768,1,FALSE,FALSE,2048,FALSE,FALSE,FALSE,1,0
Qwen/Qwen3-0.6B,1,4352,128,2,1.61E+09,2,2,1.5,0,10,5,128,1,32,1,32,128,256,1,128,256,1,28,1024,8,16,2,32768,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
speakleash/Bielik-4.5B-v3.0-Instruct,1,4352,128,2,14483464192,2,2,13.48877716,0,10,9,128,1,32,1,32,128,256,1,128,256,1,32,4096,8,32,2,32768,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
MODEL,TENSOR_PARALLEL_SIZE,MAX_MODEL_LEN,TOTAL_GPU_MEM,UNAVAILABLE_MEM_ABS,MODEL_MEM_FROM_CONFIG,MODEL_DTYPE,QUANT_DTYPE,MODEL_MEM,PROFILER_MEM_OVERHEAD,APPROX_MEM_PER_GRAPH_MB,GPU_FREE_MEM_TARGET,BLOCK_SIZE,VLLM_PROMPT_BS_BUCKET_MIN,VLLM_PROMPT_BS_BUCKET_STEP,VLLM_DECODE_BS_BUCKET_MIN,VLLM_DECODE_BS_BUCKET_STEP,VLLM_PROMPT_QUERY_BUCKET_MIN,VLLM_PROMPT_QUERY_BUCKET_STEP,VLLM_PROMPT_CTX_BUCKET_STEP,VLLM_DECODE_BLOCK_BUCKET_MIN,VLLM_DECODE_BLOCK_BUCKET_STEP,VLLM_PROMPT_BS_BUCKET_MAX,NUM_HIDDEN_LAYERS,HIDDEN_SIZE,NUM_KEY_VALUE_HEADS,NUM_ATTENTION_HEADS,HEAD_DIM,CACHE_DTYPE_BYTES,LIMIT_MODEL_LEN,PT_HPU_LAZY_MODE,VLLM_SKIP_WARMUP,VLLM_EXPONENTIAL_BUCKETING,MAX_NUM_BATCHED_TOKENS,ENABLE_PREFIX_CACHING,VLLM_CONTIGUOUS_PA,VLLM_DEFRAG,ASYNC_SCHEDULING,VLLM_WEIGHT_LOAD_FORCE_SYNC
meta-llama/Llama-3.1-8B-Instruct,1,4352,128,2,16060522496,2,2,14.95752716,0,10,9,128,1,32,1,32,128,256,4,128,256,1,32,4096,8,32,,2,131072,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
meta-llama/Llama-3.1-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,0,20,5,128,1,32,1,32,128,256,4,128,256,1,80,8192,8,64,,2,131072,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
meta-llama/Llama-3.3-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,0,20,5,128,1,32,1,32,128,256,4,128,256,1,80,8192,8,64,,2,131072,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
meta-llama/Llama-3.2-1B-Instruct,1,4352,128,2,2471645608,2,2,2.301899351,0,5,3,128,1,32,1,32,128,256,4,128,256,1,16,2048,8,32,,2,131072,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
meta-llama/Llama-3.2-3B-Instruct,1,4352,128,2,6425499648,2,2,5.984212875,0,10,5,128,1,32,1,32,128,256,4,128,256,1,28,3072,8,24,,2,131072,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
mistralai/Mixtral-8x7B-Instruct-v0.1,2,4352,256,2,93405585408,2,2,86.99073029,0,10,5,128,1,32,1,32,128,256,4,128,256,1,32,4096,8,32,,2,32768,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
mistralai/Mixtral-8x22B-Instruct-v0.1,4,4352,512,2,2.8126E+11,2,2,261.9439201,0,12,8,128,1,32,1,32,128,256,4,128,256,1,56,6144,8,48,,2,65536,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,1
mistralai/Mistral-7B-Instruct-v0.2,1,4352,128,2,14483464192,2,2,13.48877716,0,10,9,128,1,32,1,32,128,256,4,128,256,1,32,4096,8,32,,2,32768,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
meta-llama/Llama-3.1-405B-Instruct,8,4352,1024,2,8.11707E+11,2,2,755.9608459,0,20,5,128,1,32,1,32,128,256,4,128,256,1,126,16384,8,128,,2,131072,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,1
Qwen/Qwen2.5-14B-Instruct,1,4352,128,2,29540067328,2,2,27.51133156,0,10,12,128,1,32,1,32,128,256,4,128,256,1,48,5120,8,40,,2,32768,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,4,4352,512,2,1.41107E+11,2,2,131.4165192,0,20,5,128,1,32,1,32,128,256,4,128,256,1,80,8192,8,64,,2,131072,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,0,12,16,128,1,32,1,32,128,256,4,128,256,1,64,5120,8,40,,2,32768,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,1
Qwen/Qwen2.5-72B-Instruct,4,4352,512,2,1.45412E+11,2,2,135.4258575,0,10,6,128,1,32,1,32,128,256,4,128,256,1,80,8192,8,64,,2,32768,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,1
Qwen/Qwen2.5-7B-Instruct,1,4352,128,2,15231233024,2,2,14.18519115,0,10,5,128,1,32,1,32,128,256,4,128,256,1,28,3584,4,28,,2,32768,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
ibm-granite/granite-8b-code-instruct-4k,1,4096,128,2,21474836480,2,2,20,0,10,8,128,1,32,1,32,128,256,4,128,256,1,36,4096,8,32,,2,32768,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
ibm-granite/granite-20b-code-instruct-8k,1,4352,128,2,40133986304,2,2,37.37,0,10,4,128,1,32,1,32,128,256,4,128,256,1,52,6144,1,48,,2,65536,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
Qwen/Qwen2.5-VL-7B-Instruct,1,8448,128,2,15231233024,2,2,14.18519115,0,12,4,128,1,32,1,32,128,256,4,128,256,1,28,3584,4,28,,2,32768,1,FALSE,FALSE,2048,FALSE,FALSE,FALSE,1,0
Qwen/Qwen3-0.6B,1,4352,128,2,1.61E+09,2,2,1.5,0,10,5,128,1,32,1,32,128,256,4,128,256,1,28,1024,8,16,128,2,32768,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
speakleash/Bielik-4.5B-v3.0-Instruct,1,4352,128,2,9514520576,2,2,8.86,0,10,5,128,1,32,1,32,128,256,4,128,256,1,60,2048,2,16,,2,32768,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
Qwen/Qwen3-30B-A3B-Instruct-2507,1,4352,128,2,61064245248,2,2,56.87,0,10,5,128,1,32,1,32,128,256,4,128,256,1,48,2048,4,32,128,2,32768,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
53 changes: 33 additions & 20 deletions .cd/server/vllm_autocalc_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,16 @@ def calc_GPU_MEM_UTILIZATION(ctx):
return math.floor(ctx['GPU_MEMORY_UTIL_TEMP'] * 100) / 100


def calc_HEAD_DIM(ctx):
    """Return the per-attention-head dimension for the model.

    Uses the explicit HEAD_DIM value from the settings table when one is
    present; otherwise derives it as HIDDEN_SIZE / NUM_ATTENTION_HEADS
    (the conventional relationship for most transformer configs).
    """
    head_dim = ctx['HEAD_DIM']
    # An explicit, non-NaN value wins; empty CSV cells arrive as NaN/falsy.
    if head_dim and not math.isnan(head_dim):
        return head_dim
    return ctx['HIDDEN_SIZE'] / ctx['NUM_ATTENTION_HEADS']


def calc_KV_CACHE_PER_SEQ(ctx):
return ((2 * ctx['MAX_MODEL_LEN'] * ctx['NUM_HIDDEN_LAYERS'] * ctx['HIDDEN_SIZE'] * ctx['NUM_KEY_VALUE_HEADS'] *
ctx['CACHE_DTYPE_BYTES']) / ctx['NUM_ATTENTION_HEADS']) / (1024 * 1024 * 1024)
return (2 * ctx['MAX_MODEL_LEN'] * ctx['NUM_HIDDEN_LAYERS'] * calc_HEAD_DIM(ctx) * ctx['NUM_KEY_VALUE_HEADS'] *
ctx['CACHE_DTYPE_BYTES']) / (1024 * 1024 * 1024)


def calc_EST_MAX_NUM_SEQS(ctx):
Expand Down Expand Up @@ -145,15 +152,16 @@ def calc_PROMPT_SEQ_RAMP_GRAPHS(ctx):
if ctx['VLLM_EXPONENTIAL_BUCKETING']:
return 1 + math.ceil(math.log(ctx['MAX_NUM_BATCHED_TOKENS'], 2))
else:
return 1 + int(math.log(ctx['VLLM_PROMPT_SEQ_BUCKET_STEP'] / ctx['VLLM_PROMPT_SEQ_BUCKET_MIN'], 2))
return 1 + int(math.log(ctx['VLLM_PROMPT_QUERY_BUCKET_STEP'] / ctx['VLLM_PROMPT_QUERY_BUCKET_MIN'], 2))


def calc_PROMPT_SEQ_STEP_GRAPHS(ctx):
if ctx['VLLM_EXPONENTIAL_BUCKETING']:
return 0
else:
return int(1 + (min(ctx['MAX_NUM_BATCHED_TOKENS'], ctx['MAX_MODEL_LEN']) - ctx['VLLM_PROMPT_SEQ_BUCKET_STEP']) /
ctx['VLLM_PROMPT_SEQ_BUCKET_STEP'])
return int(1 +
(min(ctx['MAX_NUM_BATCHED_TOKENS'], ctx['MAX_MODEL_LEN']) - ctx['VLLM_PROMPT_QUERY_BUCKET_STEP']) /
ctx['VLLM_PROMPT_QUERY_BUCKET_STEP'])


def calc_EST_NUM_PROMPT_GRAPHS(ctx):
Expand All @@ -162,7 +170,7 @@ def calc_EST_NUM_PROMPT_GRAPHS(ctx):
graphs_2d = prompt_bs_graphs * prompt_seq_graphs
if prompt_bs_graphs > 1:
graphs_2d = graphs_2d / 2
ctx_blocks_max = max(1, (ctx['MAX_MODEL_LEN'] - ctx['VLLM_PROMPT_SEQ_BUCKET_MIN']) / ctx['BLOCK_SIZE'])
ctx_blocks_max = max(1, (ctx['MAX_MODEL_LEN'] - ctx['VLLM_PROMPT_QUERY_BUCKET_MIN']) / ctx['BLOCK_SIZE'])
ctx_blocks_min = max(1, (ctx['MAX_MODEL_LEN'] - ctx['MAX_NUM_BATCHED_TOKENS']) / ctx['BLOCK_SIZE'])
if ctx['VLLM_EXPONENTIAL_BUCKETING']:
ctx_block_graphs_max = max(1, math.ceil(math.log(ctx_blocks_max, 2)))
Expand All @@ -174,24 +182,35 @@ def calc_EST_NUM_PROMPT_GRAPHS(ctx):
return graphs_3d


def calc_EST_GRAPH_PROMPT_RATIO(ctx):
est_prompt_graph_mem = ctx['EST_NUM_PROMPT_GRAPHS'] * ctx['APPROX_MEM_PER_GRAPH_MB']
def calc_EST_PROMPT_GRAPH_MEM(ctx):
    """Estimate total memory (MB) consumed by compiled prompt graphs.

    With exponential bucketing the estimate is simply graph-count times the
    per-graph approximation.  With linear bucketing, prompt graph memory
    grows with context length, so the base estimate is scaled by an
    empirical power law against a 4352-token baseline.
    """
    base_mem = ctx['EST_NUM_PROMPT_GRAPHS'] * ctx['APPROX_MEM_PER_GRAPH_MB']
    if ctx['VLLM_EXPONENTIAL_BUCKETING']:
        return base_mem
    # Graph mem is a function of context size for prompt; 0.8552 exponent
    # and the 4352 baseline are empirical fitting constants.
    context_scale = pow(max(1, ctx['MAX_MODEL_LEN'] / 4352), 0.8552)
    return base_mem * context_scale


def calc_EST_DECODE_GRAPH_MEM(ctx):
est_decode_graph_mem = ctx['NUM_DECODE_GRAPHS'] * ctx['APPROX_MEM_PER_GRAPH_MB']
est_graph_prompt_ratio = est_prompt_graph_mem / (est_prompt_graph_mem + est_decode_graph_mem)
return est_graph_prompt_ratio
return est_decode_graph_mem


def calc_VLLM_GRAPH_PROMPT_RATIO(ctx):
return math.ceil(min(max(ctx['EST_GRAPH_PROMPT_RATIO'], 0.01), 0.99) * 100) / 100
def calc_EST_GRAPH_PROMPT_RATIO(ctx):
    """Return the estimated fraction of total graph memory used by
    prompt graphs (prompt / (prompt + decode)).

    Delegates the two memory estimates to calc_EST_PROMPT_GRAPH_MEM and
    calc_EST_DECODE_GRAPH_MEM so the split logic lives in one place.
    """
    prompt_mem = calc_EST_PROMPT_GRAPH_MEM(ctx)
    decode_mem = calc_EST_DECODE_GRAPH_MEM(ctx)
    return prompt_mem / (prompt_mem + decode_mem)


def calc_DECODE_GRAPH_TARGET_GB(ctx):
return math.ceil(ctx['NUM_DECODE_GRAPHS'] * ctx['APPROX_MEM_PER_GRAPH_MB'] / 1024 * 100) / 100
return math.ceil(calc_EST_DECODE_GRAPH_MEM(ctx) / 1024 * 100) / 100


def calc_EST_GRAPH_RESERVE_MEM(ctx):
return math.ceil(ctx['DECODE_GRAPH_TARGET_GB'] / (ctx['USABLE_MEM'] * ctx['GPU_MEM_UTILIZATION'] *
(1 - ctx['VLLM_GRAPH_PROMPT_RATIO'])) * 100) / 100
(1 - ctx['EST_GRAPH_PROMPT_RATIO'])) * 100) / 100


def calc_VLLM_GRAPH_RESERVED_MEM(ctx):
Expand Down Expand Up @@ -229,10 +248,6 @@ def calc_VLLM_DECODE_BLOCK_BUCKET_MAX(ctx):
return max(128, math.ceil((ctx['MAX_NUM_SEQS'] * ctx['MAX_MODEL_LEN']) / 128))


def calc_VLLM_PROMPT_SEQ_BUCKET_MAX(ctx):
return ctx['MAX_MODEL_LEN']


# Map parameter names to calculation functions
PARAM_CALC_FUNCS = {
"VLLM_PROMPT_BS_BUCKET_MAX": calc_VLLM_PROMPT_BS_BUCKET_MAX,
Expand Down Expand Up @@ -263,12 +278,10 @@ def calc_VLLM_PROMPT_SEQ_BUCKET_MAX(ctx):
"PROMPT_SEQ_STEP_GRAPHS": calc_PROMPT_SEQ_STEP_GRAPHS,
"EST_NUM_PROMPT_GRAPHS": calc_EST_NUM_PROMPT_GRAPHS,
"EST_GRAPH_PROMPT_RATIO": calc_EST_GRAPH_PROMPT_RATIO,
"VLLM_GRAPH_PROMPT_RATIO": calc_VLLM_GRAPH_PROMPT_RATIO,
"DECODE_GRAPH_TARGET_GB": calc_DECODE_GRAPH_TARGET_GB,
"EST_GRAPH_RESERVE_MEM": calc_EST_GRAPH_RESERVE_MEM,
"VLLM_GRAPH_RESERVED_MEM": calc_VLLM_GRAPH_RESERVED_MEM,
"KV_CACHE_MEM": calc_KV_CACHE_MEM,
"MAX_NUM_SEQS": calc_MAX_NUM_SEQS,
"VLLM_DECODE_BLOCK_BUCKET_MAX": calc_VLLM_DECODE_BLOCK_BUCKET_MAX,
"VLLM_PROMPT_SEQ_BUCKET_MAX": calc_VLLM_PROMPT_SEQ_BUCKET_MAX,
}
8 changes: 7 additions & 1 deletion .cd/templates/template_vllm_server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,13 @@

#@VARS

if [ "$VLLM_CONTIGUOUS_PA" == "True" ]; then # Checks if using contigous pa
if [ $PT_HPU_LAZY_MODE -eq 0 ]; then
printf "\nEager bridge exports\n"
export PT_ENABLE_INT64_SUPPORT=0
export PT_HPU_ENABLE_EAGER_CACHE=true
fi

if [ "$VLLM_CONTIGUOUS_PA" = "True" ]; then # Checks if using contigous pa
EXTRA_ARGS+=" --no-enable-prefix-caching"
fi

Expand Down
44 changes: 22 additions & 22 deletions .cd/tests/test_vllm_autocalc_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ def test_calc_KV_CACHE_PER_SEQ():
'HIDDEN_SIZE': 4,
'NUM_KEY_VALUE_HEADS': 2,
'CACHE_DTYPE_BYTES': 2,
'NUM_ATTENTION_HEADS': 2
'NUM_ATTENTION_HEADS': 2,
'HEAD_DIM': math.nan
}
expected = ((2 * 128 * 2 * 4 * 2 * 2) / 2) / (1024 * 1024 * 1024)
assert rules.calc_KV_CACHE_PER_SEQ(ctx) == expected
Expand Down Expand Up @@ -160,8 +161,8 @@ def test_calc_PROMPT_BS_STEP_GRAPHS(exp):
@pytest.mark.parametrize("exp", [True, False])
def test_calc_PROMPT_SEQ_RAMP_GRAPHS(exp):
ctx = {
'VLLM_PROMPT_SEQ_BUCKET_STEP': 16,
'VLLM_PROMPT_SEQ_BUCKET_MIN': 2,
'VLLM_PROMPT_QUERY_BUCKET_STEP': 16,
'VLLM_PROMPT_QUERY_BUCKET_MIN': 2,
'MAX_NUM_BATCHED_TOKENS': 32,
'VLLM_EXPONENTIAL_BUCKETING': exp
}
Expand All @@ -174,7 +175,7 @@ def test_calc_PROMPT_SEQ_STEP_GRAPHS(exp):
ctx = {
'MAX_NUM_BATCHED_TOKENS': 32,
'MAX_MODEL_LEN': 64,
'VLLM_PROMPT_SEQ_BUCKET_STEP': 8,
'VLLM_PROMPT_QUERY_BUCKET_STEP': 8,
'VLLM_EXPONENTIAL_BUCKETING': exp
}
expected = 0 if exp else int(1 + (32 - 8) / 8)
Expand All @@ -190,7 +191,7 @@ def test_calc_EST_NUM_PROMPT_GRAPHS(exp):
'PROMPT_SEQ_STEP_GRAPHS': 5 if not exp else 0,
'MAX_NUM_BATCHED_TOKENS': 2048,
'MAX_MODEL_LEN': 4352,
'VLLM_PROMPT_SEQ_BUCKET_MIN': 128,
'VLLM_PROMPT_QUERY_BUCKET_MIN': 128,
'VLLM_PROMPT_CTX_BUCKET_STEP': 1,
'BLOCK_SIZE': 128,
'VLLM_EXPONENTIAL_BUCKETING': exp
Expand All @@ -199,26 +200,30 @@ def test_calc_EST_NUM_PROMPT_GRAPHS(exp):
assert rules.calc_EST_NUM_PROMPT_GRAPHS(ctx) == expected


def test_calc_EST_GRAPH_PROMPT_RATIO():
ctx = {'EST_NUM_PROMPT_GRAPHS': 10, 'NUM_DECODE_GRAPHS': 30, 'APPROX_MEM_PER_GRAPH_MB': 10}
expected = math.ceil(10 / (10 + 30) * 100) / 100
assert rules.calc_EST_GRAPH_PROMPT_RATIO(ctx) == expected

@pytest.mark.parametrize("exp", [True, False])
def test_calc_EST_GRAPH_PROMPT_RATIO(exp):
ctx = {
'EST_NUM_PROMPT_GRAPHS': 10,
'NUM_DECODE_GRAPHS': 30,
'APPROX_MEM_PER_GRAPH_MB': 10,
'MAX_MODEL_LEN': 8448,
'VLLM_EXPONENTIAL_BUCKETING': exp
}

def test_calc_VLLM_GRAPH_PROMPT_RATIO():
ctx = {'EST_GRAPH_PROMPT_RATIO': 0.5}
expected = math.ceil(min(max(0.5, 0.1), 0.9) * 10) / 10
assert rules.calc_VLLM_GRAPH_PROMPT_RATIO(ctx) == expected
est_decode_graph_mem = 30 * 10
est_prompt_graph_mem = 10 * 10 if exp else 10 * 10 * pow(max(1, 8448 / 4352), 0.8552)
expected = est_prompt_graph_mem / (est_prompt_graph_mem + est_decode_graph_mem)
assert rules.calc_EST_GRAPH_PROMPT_RATIO(ctx) == expected


def test_calc_DECODE_GRAPH_TARGET_GB():
ctx = {'NUM_DECODE_GRAPHS': 10, 'APPROX_MEM_PER_GRAPH_MB': 512}
expected = math.ceil(10 * 512 / 1024 * 10) / 10
ctx = {'NUM_DECODE_GRAPHS': 50, 'APPROX_MEM_PER_GRAPH_MB': 12}
expected = math.ceil(50 * 12 / 1024 * 100) / 100
assert rules.calc_DECODE_GRAPH_TARGET_GB(ctx) == expected


def test_calc_EST_GRAPH_RESERVE_MEM():
ctx = {'DECODE_GRAPH_TARGET_GB': 5, 'USABLE_MEM': 10, 'GPU_MEM_UTILIZATION': 0.8, 'VLLM_GRAPH_PROMPT_RATIO': 0.2}
ctx = {'DECODE_GRAPH_TARGET_GB': 5, 'USABLE_MEM': 10, 'GPU_MEM_UTILIZATION': 0.8, 'EST_GRAPH_PROMPT_RATIO': 0.2}
expected = math.ceil(5 / (10 * 0.8 * (1 - 0.2)) * 100) / 100
assert rules.calc_EST_GRAPH_RESERVE_MEM(ctx) == expected

Expand All @@ -239,8 +244,3 @@ def test_calc_VLLM_DECODE_BLOCK_BUCKET_MAX():
ctx = {'MAX_NUM_SEQS': 16, 'MAX_MODEL_LEN': 128}
expected = max(128, math.ceil((16 * 128) / 128))
assert rules.calc_VLLM_DECODE_BLOCK_BUCKET_MAX(ctx) == expected


def test_calc_VLLM_PROMPT_SEQ_BUCKET_MAX():
ctx = {'MAX_MODEL_LEN': 4096}
assert rules.calc_VLLM_PROMPT_SEQ_BUCKET_MAX(ctx) == 4096
Loading
Loading