From 9fb66ff010eec7a6ff0cd0f2196121ab0eef761b Mon Sep 17 00:00:00 2001 From: Duncan Moss Date: Mon, 18 Aug 2025 13:40:06 -0700 Subject: [PATCH 001/231] feat: bf16 x mxfp4 cutlass fused moe for hopper Signed-off-by: Duncan Moss --- .../layers/quantization/mxfp4.py | 90 ++++++++++++++++++- 1 file changed, 87 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 03fbcf158338..160e78434949 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -25,7 +25,7 @@ if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): - # from flashinfer.fused_moe import cutlass_fused_moe + from flashinfer.fused_moe import cutlass_fused_moe from flashinfer import (mxfp8_quantize, shuffle_matrix_a, shuffle_matrix_sf_a, trtllm_fp4_block_scale_moe) @@ -192,7 +192,7 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, def process_weights_after_loading(self, layer): if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 - or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16) and current_platform.is_device_capability(100): layer.gemm1_alpha = Parameter(torch.tensor( [1.702] * self.num_experts, dtype=torch.float32).cuda(), requires_grad=False) @@ -313,6 +313,49 @@ def swap_every_two_rows(x, axis=-1): layer.w2_bias = Parameter(torch.stack(gemm2_bias_shuffled).reshape( self.num_experts, -1), requires_grad=False) + elif envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16 and current_platform.is_device_capability(90): + assert layer.w13_weight.dtype == torch.uint8, f"layer.w13_weight.dtype: {layer.w13_weight.dtype}, expected: {torch.uint8}" + assert layer.w2_weight.dtype == torch.uint8, f"layer.w2_weight.dtype: {layer.w2_weight.dtype}, expected: {torch.uint8}" + assert layer.w13_weight_scale.dtype == torch.uint8, f"layer.w13_weight_scale.dtype: {layer.w13_weight_scale.dtype}, expected: {torch.uint8}" + assert layer.w2_weight_scale.dtype == torch.uint8, f"layer.w2_weight_scale.dtype: {layer.w2_weight_scale.dtype}, expected: {torch.uint8}" + assert layer.w13_bias.dtype == torch.bfloat16, f"layer.w13_bias.dtype: {layer.w13_bias.dtype}, expected: {torch.bfloat16}" + assert layer.w2_bias.dtype == torch.bfloat16, f"layer.w2_bias.dtype: {layer.w2_bias.dtype}, expected: {torch.bfloat16}" + + layer.gemm1_alpha = Parameter(torch.tensor( + [1.702] * self.num_experts, dtype=torch.float32).cuda(), + requires_grad=False) + layer.gemm1_beta = Parameter(torch.tensor( + [1.0] * self.num_experts, dtype=torch.float32).cuda(), + requires_grad=False) + layer.gemm1_clamp_limit = Parameter(torch.tensor( + [7.0] * self.num_experts, dtype=torch.float32).cuda(), + requires_grad=False) + sf_block_size = 32 # mxfp4 block size + + assert (layer.w13_weight.dim() == 3 + and layer.w13_weight.shape[0] == self.num_experts + and layer.w13_weight.shape[1] == self.intermediate_size * 2 + and layer.w13_weight.shape[2] == self.hidden_size // 2) + assert (layer.w13_weight_scale.dim() == 3 + and layer.w13_weight_scale.shape[0] == self.num_experts + and layer.w13_weight_scale.shape[1] + == self.intermediate_size * 2 + and layer.w13_weight_scale.shape[2] + == self.hidden_size // sf_block_size) + assert (layer.w2_weight.dim() == 3 + and layer.w2_weight.shape[0] == self.num_experts + and layer.w2_weight.shape[1] == self.hidden_size and + layer.w2_weight.shape[2] == self.intermediate_size // 2) + assert (layer.w2_weight_scale.dim() == 3 + and 
layer.w2_weight_scale.shape[1] == self.hidden_size + and layer.w2_weight_scale.shape[2] + == self.intermediate_size // sf_block_size) + assert (layer.w13_bias.dim() == 2 + and layer.w13_bias.shape[0] == self.num_experts + and layer.w13_bias.shape[1] == self.intermediate_size * 2) + assert (layer.w2_bias.dim() == 2 + and layer.w2_bias.shape[0] == self.num_experts + and layer.w2_bias.shape[1] == self.hidden_size) else: from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig @@ -408,7 +451,7 @@ def apply( with this configuration.") if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 - or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16) and current_platform.is_device_capability(100): assert not self.moe.use_ep, ( "EP is not supported for flashinfer mxfp4 moe backend yet.") if envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: @@ -448,6 +491,47 @@ def apply( True, # do finalize )[0] return trtllm_gen_output + elif (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16) and current_platform.is_device_capability(90): + + assert x.dtype == torch.bfloat16 + + quant_scales = [ + layer.w13_weight_scale.view(torch.int32), + layer.w2_weight_scale.view(torch.int32), + ] + + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, + ) + + output = torch.zeros_like(x) + + _ = cutlass_fused_moe( + input=x, + token_selected_experts=topk_ids, + token_final_scales=topk_weights, + fc1_expert_weights=layer.w13_weight, + fc2_expert_weights=layer.w2_weight, + output_dtype=torch.bfloat16, + quant_scales=quant_scales, + fc1_expert_biases=layer.w13_bias, + fc2_expert_biases=layer.w2_bias, + swiglu_alpha=layer.gemm1_alpha, + swiglu_beta=layer.gemm1_beta, + swiglu_limit=layer.gemm1_clamp_limit, + use_w4_group_scaling=True, + output=output, + ) + return output else: return triton_kernel_moe_forward( hidden_states=x, From 07e376d3226feca7bf08ba6606cc7d0a5a7b2260 Mon Sep 17 00:00:00 2001 From: Will Eaton Date: Wed, 13 Aug 2025 23:09:07 -0400 Subject: [PATCH 002/231] [CI][Entrypoints]: add filter to generation to filter out invalid tool calls (#22826) Signed-off-by: Will Eaton Signed-off-by: Duncan Moss --- .../entrypoints/openai/test_openai_schema.py | 48 ++++++++++++------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py index 771119d04ea3..246bd014aa69 100644 --- a/tests/entrypoints/openai/test_openai_schema.py +++ b/tests/entrypoints/openai/test_openai_schema.py @@ -54,38 +54,54 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy): op = context.operation assert op is not None - def no_file_type(case: schemathesis.models.Case): + def no_invalid_types(case: schemathesis.models.Case): """ - This filter skips test cases for the `POST /tokenize` endpoint where the - HTTP request body uses `"type": "file"` in any message's content. - We expect these cases to fail because that type isn't implemented here - https://github.com/vllm-project/vllm/blob/0b34593017953051b3225b1483ce0f4670e3eb0e/vllm/entrypoints/chat_utils.py#L1038-L1095 + This filter skips test cases with invalid data that schemathesis + incorrectly generates due to permissive schema configurations. + + 1. 
Skips `POST /tokenize` endpoint cases with `"type": "file"` in + message content, which isn't implemented. + + 2. Skips tool_calls with `"type": "custom"` which schemathesis + incorrectly generates instead of the valid `"type": "function"`. Example test cases that are skipped: curl -X POST -H 'Content-Type: application/json' \ - -d '{"messages": [{"role": "assistant"}, {"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \ + -d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \ http://localhost:8000/tokenize curl -X POST -H 'Content-Type: application/json' \ - -d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \ - http://localhost:8000/tokenize + -d '{"messages": [{"role": "assistant", "tool_calls": [{"custom": {"input": "", "name": ""}, "id": "", "type": "custom"}]}]}' \ + http://localhost:8000/v1/chat/completions """ # noqa: E501 - if (op.method.lower() == "post" and op.path == "/tokenize" - and hasattr(case, "body") and isinstance(case.body, dict) + if (hasattr(case, "body") and isinstance(case.body, dict) and "messages" in case.body and isinstance(case.body["messages"], list) and len(case.body["messages"]) > 0): + for message in case.body["messages"]: if not isinstance(message, dict): continue - content = message.get("content", []) - if not isinstance(content, list) or len(content) == 0: - continue - if any(item.get("type") == "file" for item in content): - return False + + # Check for invalid file type in tokenize endpoint + if op.method.lower() == "post" and op.path == "/tokenize": + content = message.get("content", []) + if (isinstance(content, list) and len(content) > 0 and any( + item.get("type") == "file" for item in content)): + return False + + # Check for invalid tool_calls with non-function types + tool_calls = message.get("tool_calls", []) + if isinstance(tool_calls, list): + for tool_call in tool_calls: + if isinstance(tool_call, dict): + if tool_call.get("type") != "function": + return False + if "custom" in tool_call: + return False return True - return strategy.filter(no_file_type) + return strategy.filter(no_invalid_types) @schema.parametrize() From 3f7d7ac761d22b2befddfba17213db353f7c5f56 Mon Sep 17 00:00:00 2001 From: Ilya Markov Date: Thu, 14 Aug 2025 05:09:30 +0200 Subject: [PATCH 003/231] [CI] Fix `tests/distributed/test_ca_buffer_sharing.py` (#22849) Signed-off-by: ilmarkov Co-authored-by: ilmarkov Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Signed-off-by: Duncan Moss --- vllm/distributed/device_communicators/custom_all_reduce.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index 7dd104a4fcc4..8dfb7959a510 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -297,7 +297,7 @@ def create_shared_buffer(size_in_bytes: int, @staticmethod def free_shared_buffer(pointers: list[int], group: Optional[ProcessGroup] = None, - rank: Optional[int] = 0) -> None: + rank: Optional[int] = None) -> None: if rank is None: rank = dist.get_rank(group=group) if ops is not None: From eb4cfac99d3feb28c1415f45093db6c63ba6f98b Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Thu, 14 Aug 2025 00:41:51 -0400 Subject: [PATCH 004/231] [CI] remove flaky v0 test (#22864) Signed-off-by: Robert Shaw Co-authored-by: 
Robert Shaw Signed-off-by: Duncan Moss --- tests/entrypoints/openai/test_default_mm_loras.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/tests/entrypoints/openai/test_default_mm_loras.py b/tests/entrypoints/openai/test_default_mm_loras.py index 1fc87c8b42a7..372e9b1fecd4 100644 --- a/tests/entrypoints/openai/test_default_mm_loras.py +++ b/tests/entrypoints/openai/test_default_mm_loras.py @@ -24,18 +24,7 @@ @pytest.fixture(scope="module") -def monkeypatch_module(): - from _pytest.monkeypatch import MonkeyPatch - mpatch = MonkeyPatch() - yield mpatch - mpatch.undo() - - -@pytest.fixture(scope="module", params=[False, True]) -def multimodal_server(request, monkeypatch_module): # noqa: F811 - - use_v1 = request.param - monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') +def multimodal_server(): # noqa: F811 args = [ # use half precision for speed and memory savings in CI environment From 4e78f74d03398534e335a627e77f490e90283847 Mon Sep 17 00:00:00 2001 From: Louie Tsai Date: Thu, 14 Aug 2025 00:12:17 -0700 Subject: [PATCH 005/231] vLLM Benchmark suite improvement (#22119) Signed-off-by: Tsai, Louie Signed-off-by: Louie Tsai Co-authored-by: Li, Jiang Signed-off-by: Duncan Moss --- .buildkite/nightly-benchmarks/README.md | 32 ++-- .../scripts/compare-json-results.py | 175 ++++++++++++++++-- .../convert-results-json-to-markdown.py | 163 +++++++++++++++- .../scripts/run-performance-benchmarks.sh | 93 ++++++---- .../tests/latency-tests-cpu.json | 4 +- .../tests/serving-tests-cpu-snc2.json | 49 +++-- .../tests/serving-tests-cpu-snc3.json | 52 +++--- .../tests/serving-tests-cpu.json | 30 +-- .../tests/throughput-tests-cpu.json | 4 +- docs/contributing/benchmarks.md | 2 +- 10 files changed, 452 insertions(+), 152 deletions(-) diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index 3f2e2da39797..b39f9899a8f2 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -7,7 +7,7 @@ This directory contains two sets of benchmark for vllm. - Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance - Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm. -See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results. +See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results. ## Performance benchmark quick overview @@ -138,28 +138,20 @@ The raw benchmarking results (in the format of json files) are in the `Artifacts The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`. When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`. -`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT. 
+`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT. +If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead. -Here is an example using the script to compare result_a and result_b without detail test name. -`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json --ignore_test_name` - -| | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio | -|----|----------------------------------------|----------------------------------------|----------| -| 0 | 142.633982 | 156.526018 | 1.097396 | -| 1 | 241.620334 | 294.018783 | 1.216863 | -| 2 | 218.298905 | 262.664916 | 1.203235 | -| 3 | 242.743860 | 299.816190 | 1.235113 | - -Here is an example using the script to compare result_a and result_b with detail test name. +Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output lenght, max concurrency and qps. `python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json` -| | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio | -|---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------| -| 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 | -| 1 | serving_llama8B_tp1_sharegpt_qps_16 | 241.620334 | serving_llama8B_tp1_sharegpt_qps_16 | 294.018783 | 1.216863 | -| 2 | serving_llama8B_tp1_sharegpt_qps_4 | 218.298905 | serving_llama8B_tp1_sharegpt_qps_4 | 262.664916 | 1.203235 | -| 3 | serving_llama8B_tp1_sharegpt_qps_inf | 242.743860 | serving_llama8B_tp1_sharegpt_qps_inf | 299.816190 | 1.235113 | -| 4 | serving_llama8B_tp2_random_1024_128_qps_1 | 96.613390 | serving_llama8B_tp4_random_1024_128_qps_1 | 108.404853 | 1.122048 | +| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio | +|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------| +| 0 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982 | 156.526018 | 1.097396 | +| 1 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334 | 294.018783 | 1.216863 | + +A comparison diagram will be generated below the table. 
+Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3 +image ## Nightly test details diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index 20c106234935..12c4ba6aa69a 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -1,24 +1,38 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse +import json +import os import pandas as pd def compare_data_columns( - files, name_column, data_column, drop_column, ignore_test_name=False + files, name_column, data_column, info_cols, drop_column, debug=False ): print("\ncompare_data_column: " + data_column) frames = [] + raw_data_cols = [] compare_frames = [] for file in files: data_df = pd.read_json(file) serving_df = data_df.dropna(subset=[drop_column], ignore_index=True) - if ignore_test_name is False: + # Show all info columns in the first couple columns + if not frames: + for col in info_cols: + if col not in serving_df.columns: + print(f"Skipping missing column: {col}") + continue + frames.append(serving_df[col]) + # only show test name under debug mode + if debug is True: serving_df = serving_df.rename(columns={name_column: file + "_name"}) frames.append(serving_df[file + "_name"]) + + file = "/".join(file.split("/")[:-1]) serving_df = serving_df.rename(columns={data_column: file}) frames.append(serving_df[file]) + raw_data_cols.append(file) compare_frames.append(serving_df[file]) if len(compare_frames) >= 2: # Compare numbers among two files @@ -27,7 +41,68 @@ def compare_data_columns( compare_frames.pop(1) concat_df = pd.concat(frames, axis=1) - return concat_df + print(raw_data_cols) + return concat_df, raw_data_cols + + +def split_json_by_tp_pp( + input_file: str = "benchmark_results.json", output_root: str = "." +) -> list[str]: + """ + Split a benchmark JSON into separate folders by (TP Size, PP Size). + + Creates: /tp{TP}_pp{PP}/benchmark_results.json + Returns: list of file paths written. 
+ """ + # Load JSON data into DataFrame + with open(input_file, encoding="utf-8") as f: + data = json.load(f) + + # If the JSON is a dict with a list under common keys, use that list + if isinstance(data, dict): + for key in ("results", "serving_results", "benchmarks", "data"): + if isinstance(data.get(key), list): + data = data[key] + break + + df = pd.DataFrame(data) + + # Handle alias column names + rename_map = { + "tp_size": "TP Size", + "tensor_parallel_size": "TP Size", + "pp_size": "PP Size", + "pipeline_parallel_size": "PP Size", + } + df.rename( + columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True + ) + + # Ensure TP/PP columns exist (default to 1 if missing) + if "TP Size" not in df.columns: + df["TP Size"] = 1 + if "PP Size" not in df.columns: + df["PP Size"] = 1 + + # make sure TP/PP are numeric ints with no NaN + df["TP Size"] = ( + pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int) + ) + df["PP Size"] = ( + pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int) + ) + + # Split into separate folders + saved_paths: list[str] = [] + for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False): + folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}") + os.makedirs(folder_name, exist_ok=True) + filepath = os.path.join(folder_name, "benchmark_results.json") + group_df.to_json(filepath, orient="records", indent=2, force_ascii=False) + print(f"Saved: {filepath}") + saved_paths.append(filepath) + + return saved_paths if __name__ == "__main__": @@ -36,31 +111,105 @@ def compare_data_columns( "-f", "--file", action="append", type=str, help="input file name" ) parser.add_argument( - "--ignore_test_name", action="store_true", help="ignore_test_name or not" + "--debug", action="store_true", help="show all information for debugging" + ) + parser.add_argument( + "--plot", + action=argparse.BooleanOptionalAction, + default=True, + help="plot perf diagrams or not --no-plot --plot", + ) + parser.add_argument( + "-x", + "--xaxis", + type=str, + default="# of max concurrency.", + help="column name to use as X Axis in comparision graph", ) args = parser.parse_args() - files = args.file - print("comparing : " + ", ".join(files)) drop_column = "P99" name_column = "Test name" + info_cols = [ + "Model", + "Dataset Name", + "Input Len", + "Output Len", + "TP Size", + "PP Size", + "# of max concurrency.", + "qps", + ] data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"] html_msgs_for_data_cols = [ "Compare Output Tokens /n", "Median TTFT /n", "Median TPOT /n", ] - ignore_test_name = args.ignore_test_name + + if len(args.file) == 1: + files = split_json_by_tp_pp(args.file[0], output_root="splits") + info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")] + else: + files = args.file + print("comparing : " + ", ".join(files)) + debug = args.debug + plot = args.plot + # For Plot feature, assign y axis from one of info_cols + y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6 with open("perf_comparison.html", "w") as text_file: for i in range(len(data_cols_to_compare)): - output_df = compare_data_columns( + output_df, raw_data_cols = compare_data_columns( files, name_column, data_cols_to_compare[i], + info_cols, drop_column, - ignore_test_name=ignore_test_name, + debug=debug, ) - print(output_df) - html = output_df.to_html() - text_file.write(html_msgs_for_data_cols[i]) - text_file.write(html) + + # For Plot feature, insert y axis from one of 
info_cols + raw_data_cols.insert(0, info_cols[y_axis_index]) + + filtered_info_cols = info_cols[:-2] + existing_group_cols = [ + c for c in filtered_info_cols if c in output_df.columns + ] + if not existing_group_cols: + raise ValueError( + f"No valid group-by columns " + f"Expected subset: {filtered_info_cols}, " + f"but DataFrame has: {list(output_df.columns)}" + ) + + output_df_sorted = output_df.sort_values(by=existing_group_cols) + output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False) + for name, group in output_groups: + html = group.to_html() + text_file.write(html_msgs_for_data_cols[i]) + text_file.write(html) + + if plot is True: + import pandas as pd + import plotly.express as px + + df = group[raw_data_cols] + df_sorted = df.sort_values(by=info_cols[y_axis_index]) + # Melt DataFrame for plotting + df_melted = df_sorted.melt( + id_vars=info_cols[y_axis_index], + var_name="Configuration", + value_name=data_cols_to_compare[i], + ) + title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index] + # Create Plotly line chart + fig = px.line( + df_melted, + x=info_cols[y_axis_index], + y=data_cols_to_compare[i], + color="Configuration", + title=title, + markers=True, + ) + # Export to HTML + text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn")) diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 554256b4bdb8..496ee6083abd 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -1,17 +1,19 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse import json import os +import re +import shlex from importlib import util from pathlib import Path +from typing import Any import pandas as pd import psutil from tabulate import tabulate -results_folder = Path("results/") - # latency results and the keys that will be printed into markdown latency_results = [] latency_column_mapping = { @@ -42,14 +44,22 @@ serving_results = [] serving_column_mapping = { "test_name": "Test name", + "model_id": "Model", + "dataset_name": "Dataset Name", + "input_len": "Input Len", + "output_len": "Output Len", + "tp_size": "TP Size", + "pp_size": "PP Size", + "dtype": "dtype", "gpu_type": "GPU", "completed": "# of req.", + "qps": "qps", "max_concurrency": "# of max concurrency.", "request_throughput": "Tput (req/s)", "total_token_throughput": "Total Token Tput (tok/s)", "output_throughput": "Output Tput (tok/s)", - "total_input_tokens": "Total input tokens", - "total_output_tokens": "Total output tokens", + # "total_input_tokens": "Total input tokens", + # "total_output_tokens": "Total output tokens", "mean_ttft_ms": "Mean TTFT (ms)", "median_ttft_ms": "Median TTFT (ms)", "p99_ttft_ms": "P99 TTFT (ms)", @@ -94,7 +104,104 @@ def get_size_with_unit(bytes, suffix="B"): bytes /= factor +def _coerce(val: str) -> Any: + """Best-effort type coercion from string to Python types.""" + low = val.lower() + if low == "null": + return None + if low == "true": + return True + if low == "false": + return False + # integers + if re.fullmatch(r"[+-]?\d+", val): + try: + return int(val) + except ValueError: + pass + # floats (keep 'inf'/'-inf'/'nan' as strings) + if re.fullmatch(r"[+-]?\d*\.\d+", val): + try: + return float(val) + except ValueError: + pass + return val + + +def 
parse_client_command(cmd: str) -> dict[str, Any]: + """Parse the client_command shell string into {executable, script, args}.""" + toks = shlex.split(cmd) + if len(toks) < 2: + raise ValueError("client_command must include an executable and a script") + executable, script = toks[0], toks[1] + args: dict[str, Any] = {} + + i = 2 + while i < len(toks): + t = toks[i] + if t.startswith("--"): + # --key=value or --key (value) or boolean flag + if "=" in t: + key, val = t.split("=", 1) + if key == "--metadata": + md = {} + if val: + if "=" in val: + k, v = val.split("=", 1) + md[k] = _coerce(v) + else: + md[val] = True + args[key] = md + else: + args[key] = _coerce(val) + i += 1 + continue + + key = t + + # Special: consume metadata k=v pairs until next --flag + if key == "--metadata": + i += 1 + md = {} + while i < len(toks) and not toks[i].startswith("--"): + pair = toks[i] + if "=" in pair: + k, v = pair.split("=", 1) + md[k] = _coerce(v) + else: + md[pair] = True + i += 1 + args[key] = md + continue + + # Standard: check if next token is a value (not a flag) + if i + 1 < len(toks) and not toks[i + 1].startswith("--"): + args[key] = _coerce(toks[i + 1]) + i += 2 + else: + # lone flag -> True + args[key] = True + i += 1 + else: + # unexpected positional; skip + i += 1 + + return {"executable": executable, "script": script, "args": args} + + if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-r", + "--result", + type=str, + default="results", + help="Folder name for benchmark output results.", + ) + args = parser.parse_args() + results_folder = Path(args.result) + if not results_folder.exists(): + raise FileNotFoundError(f"results folder does not exist: {results_folder}") # collect results for test_file in results_folder.glob("*.json"): with open(test_file) as f: @@ -102,7 +209,6 @@ def get_size_with_unit(bytes, suffix="B"): if "serving" in str(test_file): # this result is generated via `vllm bench serve` command - # attach the benchmarking command to raw_result try: with open(test_file.with_suffix(".commands")) as f: @@ -110,12 +216,44 @@ def get_size_with_unit(bytes, suffix="B"): except OSError as e: print(e) continue - + # Parse Server Command Arg + out: dict[str, Any] = { + "server_command": parse_client_command(command["server_command"]) + } + parse_args = [ + "--tensor-parallel-size", + "--pipeline-parallel-size", + "--dtype", + ] + col_mapping = ["tp_size", "pp_size", "dtype"] + for index, arg in enumerate(parse_args): + if arg in out["server_command"]["args"]: + raw_result.update( + {col_mapping[index]: out["server_command"]["args"][arg]} + ) + + # Parse Client Command Arg + out: dict[str, Any] = { + "client_command": parse_client_command(command["client_command"]) + } + parse_args = [ + "--dataset-name", + "--random-input-len", + "--random-output-len", + "--request-rate", + ] + col_mapping = ["dataset_name", "input_len", "output_len", "qps"] + + for index, arg in enumerate(parse_args): + if arg in out["client_command"]["args"]: + raw_result.update( + {col_mapping[index]: out["client_command"]["args"][arg]} + ) + # Add Server, Client command raw_result.update(command) # update the test name of this result raw_result.update({"test_name": test_file.stem}) - # add the result to raw_result serving_results.append(raw_result) continue @@ -205,7 +343,10 @@ def get_size_with_unit(bytes, suffix="B"): columns=latency_column_mapping ) if not serving_results.empty: - serving_results = serving_results[list(serving_column_mapping.keys())].rename( + valid_columns 
= [ + col for col in serving_column_mapping if col in serving_results.columns + ] + serving_results = serving_results[valid_columns].rename( columns=serving_column_mapping ) if not throughput_results.empty: @@ -245,7 +386,9 @@ def get_size_with_unit(bytes, suffix="B"): ) # document the result - with open(results_folder / "benchmark_results.md", "w") as f: + md_file = "benchmark_results.md" + json_file = "benchmark_results.json" + with open(results_folder / md_file, "w") as f: results = read_markdown( "../.buildkite/nightly-benchmarks/" + "performance-benchmarks-descriptions.md" @@ -260,7 +403,7 @@ def get_size_with_unit(bytes, suffix="B"): f.write(results) # document benchmarking results in json - with open(results_folder / "benchmark_results.json", "w") as f: + with open(results_folder / json_file, "w") as f: results = ( latency_results.to_dict(orient="records") + throughput_results.to_dict(orient="records") diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index 2c57666a81aa..b1b7d2d77a44 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -194,9 +194,11 @@ run_latency_tests() { # check if there is enough GPU to run the test tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') - if [ "$ON_CPU" == "1" ];then - if [[ $numa_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name." + if [ "$ON_CPU" == "1" ]; then + pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size') + world_size=$(($tp*$pp)) + if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then + echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." continue fi else @@ -261,9 +263,11 @@ run_throughput_tests() { # check if there is enough GPU to run the test tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size') - if [ "$ON_CPU" == "1" ];then - if [[ $numa_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name." + if [ "$ON_CPU" == "1" ]; then + pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size') + world_size=$(($tp*$pp)) + if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then + echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." continue fi else @@ -329,12 +333,21 @@ run_serving_tests() { qps_list=$(echo "$params" | jq -r '.qps_list') qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') echo "Running over qps list $qps_list" + max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list') + if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then + num_prompts=$(echo "$client_params" | jq -r '.num_prompts') + max_concurrency_list="[$num_prompts]" + fi + max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh') + echo "Running over max concurrency list $max_concurrency_list" # check if there is enough resources to run the test tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') - if [ "$ON_CPU" == "1" ];then - if [[ $numa_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name." 
+ if [ "$ON_CPU" == "1" ]; then + pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size') + world_size=$(($tp*$pp)) + if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then + echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." continue fi else @@ -390,35 +403,39 @@ run_serving_tests() { echo "now qps is $qps" fi - new_test_name=$test_name"_qps_"$qps - - # pass the tensor parallel size to the client so that it can be displayed - # on the benchmark dashboard - client_command="vllm bench serve \ - --save-result \ - --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ - --request-rate $qps \ - --metadata "tensor_parallel_size=$tp" \ - $client_args $client_remote_args " - - echo "Running test case $test_name with qps $qps" - echo "Client command: $client_command" - - bash -c "$client_command" - - # record the benchmarking commands - jq_output=$(jq -n \ - --arg server "$server_command" \ - --arg client "$client_command" \ - --arg gpu "$gpu_type" \ - '{ - server_command: $server, - client_command: $client, - gpu_type: $gpu - }') - echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" - + # iterate over different max_concurrency + for max_concurrency in $max_concurrency_list; do + new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency + echo " new test name $new_test_name" + # pass the tensor parallel size to the client so that it can be displayed + # on the benchmark dashboard + client_command="vllm bench serve \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + --max-concurrency $max_concurrency \ + --metadata "tensor_parallel_size=$tp" \ + $client_args $client_remote_args " + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + bash -c "$client_command" + + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu + }') + echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" + + done done # clean up diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json index da93fdd1dbac..569117aae852 100644 --- a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json @@ -6,7 +6,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "load_format": "dummy", "num_iters_warmup": 5, @@ -20,7 +20,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "load_format": "dummy", "num_iters_warmup": 5, diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json index dd0e24edff98..2d88a0b30c4f 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json @@ -1,7 +1,8 @@ [ { "test_name": "serving_llama8B_tp1_sharegpt", - "qps_list": [1, 4, 16, "inf"], + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { 
"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -10,7 +11,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -23,17 +24,17 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "max_concurrency": 60, "num_prompts": 200 } }, { "test_name": "serving_llama8B_tp2_sharegpt", - "qps_list": [1, 4, 16, "inf"], + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -42,7 +43,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -55,17 +56,17 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "max_concurrency": 60, "num_prompts": 200 } }, { "test_name": "serving_llama8B_tp4_sharegpt", - "qps_list": [1, 4, 16, "inf"], + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -74,7 +75,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -87,17 +88,17 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "max_concurrency": 60, "num_prompts": 200 } }, { "test_name": "serving_llama8B_tp1_random_128_128", - "qps_list": [1, 4, 16, "inf"], + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -106,7 +107,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -120,19 +121,19 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", - "max_concurrency": 1000, "num_prompts": 1000 } }, { "test_name": "serving_llama8B_tp2_random_128_128", - "qps_list": [1, 4, 16, "inf"], + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -141,7 +142,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - 
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -155,19 +156,19 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", - "max_concurrency": 1000, "num_prompts": 1000 } }, { "test_name": "serving_llama8B_tp4_random_128_128", - "qps_list": [1, 4, 16, "inf"], + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -176,7 +177,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -190,13 +191,11 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, - "ignore-eos": "", - "max_concurrency": 1000, "num_prompts": 1000 } } diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json index f1bda65a7590..823abbaa99f8 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json @@ -1,7 +1,8 @@ [ { "test_name": "serving_llama8B_pp1_sharegpt", - "qps_list": [1, 4, 16, "inf"], + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -10,7 +11,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "pipeline_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -23,17 +24,17 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "max_concurrency": 60, "num_prompts": 200 } }, { "test_name": "serving_llama8B_pp3_sharegpt", - "qps_list": [1, 4, 16, "inf"], + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -42,7 +43,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "pipeline_parallel_size": 3, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -55,17 +56,17 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "max_concurrency": 60, "num_prompts": 200 } }, { - "test_name": "serving_llama8B_tp2pp6_sharegpt", - "qps_list": [1, 4, 16, "inf"], + "test_name": "serving_llama8B_tp2pp3_sharegpt", 
+ "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -74,7 +75,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "pipeline_parallel_size": 3, "dtype": "bfloat16", @@ -88,17 +89,17 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "max_concurrency": 60, "num_prompts": 200 } }, { "test_name": "serving_llama8B_pp1_random_128_128", - "qps_list": [1, 4, 16, "inf"], + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -107,7 +108,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "pipeline_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -121,28 +122,28 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", - "max_concurrency": 1000, "num_prompts": 1000 } }, { "test_name": "serving_llama8B_pp3_random_128_128", - "qps_list": [1, 4, 16, "inf"], + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL:": 1, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "pipeline_parallel_size": 3, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -156,19 +157,19 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", - "max_concurrency": 1000, "num_prompts": 1000 } }, { "test_name": "serving_llama8B_tp2pp3_random_128_128", - "qps_list": [1, 4, 16, "inf"], + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -177,7 +178,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "pipeline_parallel_size": 3, "dtype": "bfloat16", @@ -192,13 +193,12 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", - "max_concurrency": 1000, "num_prompts": 1000 } } diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json 
b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json index f150b9abeea4..e21c8df0a9fe 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json @@ -2,6 +2,7 @@ { "test_name": "serving_llama8B_tp1_sharegpt", "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -10,7 +11,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -23,17 +24,17 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "max_concurrency": 60, "num_prompts": 200 } }, { "test_name": "serving_llama8B_tp2_sharegpt", "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -42,7 +43,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -55,17 +56,17 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "max_concurrency": 60, "num_prompts": 200 } }, { "test_name": "serving_llama8B_tp4_sharegpt", "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -74,7 +75,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -87,17 +88,17 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "max_concurrency": 60, "num_prompts": 200 } }, { "test_name": "serving_llama8B_tp4_random_1024_128", "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -106,7 +107,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -120,19 +121,19 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 1024, "random-output-len": 128, "ignore-eos": "", - "max_concurrency": 100, "num_prompts": 100 } }, { "test_name": 
"serving_llama8B_pp6_random_1024_128", "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -141,7 +142,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "pipeline_parallel_size": 6, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -155,13 +156,12 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 1024, "random-output-len": 128, "ignore-eos": "", - "max_concurrency": 100, "num_prompts": 100 } } diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json index f159c30637d3..48c015aa8403 100644 --- a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json @@ -6,7 +6,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "load_format": "dummy", "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", @@ -21,7 +21,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "load_format": "dummy", "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index 0ebd99ba5ae1..2bbed778f3c6 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -11,7 +11,7 @@ vLLM contains two sets of benchmarks: The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM. -The latest performance results are hosted on the public [vLLM Performance Dashboard](https://perf.vllm.ai). +The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm). More information on the performance benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md). 
From 62df10fab588d05088ee86a2e1fdbf825267ec69 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 14 Aug 2025 17:35:43 +0800 Subject: [PATCH 006/231] [Bugfix] Fix `PixtralHFImagePixelInputs` dynamic shape check (#22827) Signed-off-by: Isotr0py Signed-off-by: Duncan Moss --- tests/models/multimodal/test_tensor_schema.py | 2 +- vllm/model_executor/models/llava.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/models/multimodal/test_tensor_schema.py b/tests/models/multimodal/test_tensor_schema.py index a4cb1a68833a..92390d8c2f7e 100644 --- a/tests/models/multimodal/test_tensor_schema.py +++ b/tests/models/multimodal/test_tensor_schema.py @@ -153,4 +153,4 @@ def validate_model_input(model): if hasattr(model, method_name): getattr(model, method_name)(**mm_kwargs) - vllm_model.apply_model(validate_model_input) + vllm_model.apply_model(validate_model_input) \ No newline at end of file diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 89d2817b57e0..4927d6b62c6d 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -72,8 +72,9 @@ class PixtralHFImagePixelInputs(TensorSchema): in which case the data is passed as a list instead of a batched tensor. """ type: Literal["pixel_values_pixtral"] = "pixel_values_pixtral" - pixel_values: Annotated[Union[torch.Tensor, list[torch.Tensor]], - TensorShape("bn", "c", "h", "w")] + pixel_values: Annotated[ + Union[torch.Tensor, list[torch.Tensor]], + TensorShape("bn", "c", "h", "w", dynamic_dims={"h", "w"})] class LlavaImageEmbeddingInputs(TensorSchema): From 275a3343ac30867c422cdead25bd0cf5bb81b56e Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 14 Aug 2025 03:44:29 -0700 Subject: [PATCH 007/231] [BugFix] Threadsafe close async zmq sockets (#22877) Signed-off-by: Nick Hill Co-authored-by: Isotr0py Signed-off-by: Duncan Moss --- vllm/utils/__init__.py | 24 ++++++++++- vllm/v1/engine/core_client.py | 79 ++++++++++++++++++++++++----------- 2 files changed, 77 insertions(+), 26 deletions(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 095829db8394..cae4eecc0dee 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -709,8 +709,28 @@ def cancel_tasks(): def cancel_task_threadsafe(task: Task): - if task and not task.done() and not (loop := task.get_loop()).is_closed(): - loop.call_soon_threadsafe(task.cancel) + if task and not task.done(): + run_in_loop(task.get_loop(), task.cancel) + + +def close_sockets(sockets: Sequence[Union[zmq.Socket, zmq.asyncio.Socket]]): + for sock in sockets: + if sock is not None: + sock.close(linger=0) + + +def run_in_loop(loop: AbstractEventLoop, function: Callable, *args): + if in_loop(loop): + function(*args) + elif not loop.is_closed(): + loop.call_soon_threadsafe(function, *args) + + +def in_loop(event_loop: AbstractEventLoop) -> bool: + try: + return asyncio.get_running_loop() == event_loop + except RuntimeError: + return False def make_async( diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 05b4d7260896..5ffa555570a2 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -23,8 +23,8 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.tasks import SupportedTask -from vllm.utils import (cancel_task_threadsafe, get_open_port, - get_open_zmq_inproc_path, make_zmq_socket) +from vllm.utils import (close_sockets, get_open_port, get_open_zmq_inproc_path, + in_loop, make_zmq_socket) from 
vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, EngineCoreRequestType, ReconfigureDistributedRequest, ReconfigureRankType, @@ -317,7 +317,7 @@ class BackgroundResources: """Used as a finalizer for clean shutdown, avoiding circular reference back to the client object.""" - ctx: Union[zmq.Context] + ctx: zmq.Context # If CoreEngineProcManager, it manages local engines; # if CoreEngineActorManager, it manages all engines. engine_manager: Optional[Union[CoreEngineProcManager, @@ -326,6 +326,8 @@ class BackgroundResources: output_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None input_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None first_req_send_socket: Optional[zmq.asyncio.Socket] = None + first_req_rcv_socket: Optional[zmq.asyncio.Socket] = None + stats_update_socket: Optional[zmq.asyncio.Socket] = None output_queue_task: Optional[asyncio.Task] = None stats_update_task: Optional[asyncio.Task] = None shutdown_path: Optional[str] = None @@ -343,23 +345,47 @@ def __call__(self): if self.coordinator is not None: self.coordinator.close() - cancel_task_threadsafe(self.output_queue_task) - cancel_task_threadsafe(self.stats_update_task) + if isinstance(self.output_socket, zmq.asyncio.Socket): + # Async case. + loop = self.output_socket._get_loop() + asyncio.get_running_loop() + sockets = (self.output_socket, self.input_socket, + self.first_req_send_socket, self.first_req_rcv_socket, + self.stats_update_socket) + + tasks = (self.output_queue_task, self.stats_update_task) + + def close_sockets_and_tasks(): + close_sockets(sockets) + for task in tasks: + if task is not None and not task.done(): + task.cancel() + + if in_loop(loop): + close_sockets_and_tasks() + elif not loop.is_closed(): + loop.call_soon_threadsafe(close_sockets_and_tasks) + else: + # Loop has been closed, try to clean up directly. + del tasks + del close_sockets_and_tasks + close_sockets(sockets) + del self.output_queue_task + del self.stats_update_task + else: + # Sync case. - # ZMQ context termination can hang if the sockets - # aren't explicitly closed first. - for socket in (self.output_socket, self.input_socket, - self.first_req_send_socket): - if socket is not None: - socket.close(linger=0) + # ZMQ context termination can hang if the sockets + # aren't explicitly closed first. + close_sockets((self.output_socket, self.input_socket)) - if self.shutdown_path is not None: - # We must ensure that the sync output socket is - # closed cleanly in its own thread. - with self.ctx.socket(zmq.PAIR) as shutdown_sender: - shutdown_sender.connect(self.shutdown_path) - # Send shutdown signal. - shutdown_sender.send(b'') + if self.shutdown_path is not None: + # We must ensure that the sync output socket is + # closed cleanly in its own thread. + with self.ctx.socket(zmq.PAIR) as shutdown_sender: + shutdown_sender.connect(self.shutdown_path) + # Send shutdown signal. 
+ shutdown_sender.send(b'') def validate_alive(self, frames: Sequence[zmq.Frame]): if len(frames) == 1 and (frames[0].buffer @@ -969,14 +995,19 @@ def _ensure_stats_update_task(self): self.engine_ranks_managed[-1] + 1) async def run_engine_stats_update_task(): - with make_zmq_socket(self.ctx, self.stats_update_address, - zmq.XSUB) as socket, make_zmq_socket( - self.ctx, - self.first_req_sock_addr, - zmq.PAIR, - bind=False) as first_req_rcv_socket: + with (make_zmq_socket(self.ctx, + self.stats_update_address, + zmq.XSUB, + linger=0) as socket, + make_zmq_socket(self.ctx, + self.first_req_sock_addr, + zmq.PAIR, + bind=False, + linger=0) as first_req_rcv_socket): assert isinstance(socket, zmq.asyncio.Socket) assert isinstance(first_req_rcv_socket, zmq.asyncio.Socket) + self.resources.stats_update_socket = socket + self.resources.first_req_rcv_socket = first_req_rcv_socket # Send subscription message. await socket.send(b'\x01') From 95ba0df34862a327dcce45566b68839d89a61faa Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 14 Aug 2025 12:03:49 +0100 Subject: [PATCH 008/231] Remove Phi 4 Flash configuration workaround (#22723) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Duncan Moss --- vllm/transformers_utils/config.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 02ea0814ddef..d8c964fb2a4a 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -449,23 +449,6 @@ def get_config( raise e config = _maybe_remap_hf_config_attrs(config) - # Phi4Flash misuses this config as list[int]. Convert it to int and add - # the layer_types list[str] to make it HF compatible - if (config.model_type == "phi4flash"): - # TODO: Remove after the following PR is merged: - # https://huggingface.co/microsoft/Phi-4-mini-flash-reasoning/discussions/6 - if not hasattr(config, "layer_types"): - config.layer_types = [ - "sliding_attention" if i < config.num_hidden_layers // 2 - and i % 2 == 1 else "full_attention" - for i in range(config.num_hidden_layers) - ] - # TODO: Remove after the following PR is merged: - # https://huggingface.co/microsoft/Phi-4-mini-flash-reasoning/discussions/7 - if isinstance(config.sliding_window, list): - config.sliding_window = next( - filter(None, config.sliding_window), None) - elif config_format == ConfigFormat.MISTRAL: # This function loads a params.json config which # should be used when loading models in mistral format From 70bcc4b6de6a9489eb94ce3a2e52b3cfd8fce763 Mon Sep 17 00:00:00 2001 From: iAmir97 <71513472+iAmir97@users.noreply.github.com> Date: Thu, 14 Aug 2025 18:04:18 +0700 Subject: [PATCH 009/231] [Bugfix] Add reset prefix cache for online serving (#22726) Signed-off-by: iAmir97 Signed-off-by: iAmir97 <71513472+iAmir97@users.noreply.github.com> Co-authored-by: iAmir97 Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Signed-off-by: Duncan Moss --- vllm/engine/async_llm_engine.py | 1 + vllm/v1/engine/async_llm.py | 1 + 2 files changed, 2 insertions(+) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index b6ee4105340a..73726eeab5fc 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1092,6 +1092,7 @@ async def reset_prefix_cache(self, self.engine.reset_prefix_cache(device) async def sleep(self, level: int = 1) -> None: + await 
self.reset_prefix_cache() self.engine.sleep(level) async def wake_up(self, tags: Optional[list[str]] = None) -> None: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a2706327914c..edc2e235c3c3 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -576,6 +576,7 @@ async def reset_prefix_cache(self, await self.engine_core.reset_prefix_cache_async() async def sleep(self, level: int = 1) -> None: + await self.reset_prefix_cache() await self.engine_core.sleep_async(level) async def wake_up(self, tags: Optional[list[str]] = None) -> None: From 52729d5676277fd3dafc8b0af62e146b7ffcca0a Mon Sep 17 00:00:00 2001 From: Daniele <36171005+dtrifiro@users.noreply.github.com> Date: Thu, 14 Aug 2025 13:06:13 +0200 Subject: [PATCH 010/231] [Doc] fix dead link (#22898) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Daniele Trifirò Signed-off-by: Duncan Moss --- docs/getting_started/installation/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md index f6ecceb85d86..0ee680f5c688 100644 --- a/docs/getting_started/installation/README.md +++ b/docs/getting_started/installation/README.md @@ -18,7 +18,7 @@ vLLM supports the following hardware platforms: ## Hardware Plugins The backends below live **outside** the main `vllm` repository and follow the -[Hardware-Pluggable RFC](../design/plugin_system.md). +[Hardware-Pluggable RFC](../../design/plugin_system.md). | Accelerator | PyPI / package | Repository | |-------------|----------------|------------| From e396d2f45c702148c20ea94d3263cfead298f766 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Thu, 14 Aug 2025 13:34:34 +0200 Subject: [PATCH 011/231] [CI] Re-enable transcriptions `test_long_audio_request` (#22890) Signed-off-by: NickLucche Signed-off-by: Duncan Moss --- tests/entrypoints/openai/test_transcription_validation.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index 28fd02171b95..e103bd206b54 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -80,9 +80,6 @@ async def test_bad_requests(mary_had_lamb): async def test_long_audio_request(mary_had_lamb, model_name): server_args = ["--enforce-eager"] - if model_name.startswith("openai"): - return - mary_had_lamb.seek(0) audio, sr = librosa.load(mary_had_lamb) # Add small silence after each audio for repeatability in the split process From 16ef1432942372643a1e4fff194c1bbeaeea8417 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 14 Aug 2025 08:28:09 -0400 Subject: [PATCH 012/231] [Perf] Dont create unnecessary pooling params (#22876) Signed-off-by: Lucas Wilkinson Signed-off-by: Duncan Moss --- vllm/v1/worker/gpu_model_runner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a03e860a91c7..8fb9641844fb 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -341,13 +341,13 @@ def _init_model_kwargs(self, num_tokens: int): model_kwargs = dict[str, Any]() num_reqs = self.input_batch.num_reqs - pooling_params = self.input_batch.pooling_metadata.pooling_params - - num_pooling_reqs = len(pooling_params) + num_pooling_reqs 
= len(self.input_batch.pooling_params) if num_pooling_reqs == 0: return model_kwargs + pooling_params = self.input_batch.pooling_metadata.pooling_params + assert num_pooling_reqs == num_reqs token_type_id_requests = dict[int, Any]() From f611b92182218ec6b50f67184d1789bf294f6492 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 14 Aug 2025 20:28:50 +0800 Subject: [PATCH 013/231] [Model] Modify the gate implementation of glm4_moe (#22832) Signed-off-by: Jee Jee Li Signed-off-by: Duncan Moss --- docs/models/supported_models.md | 2 +- vllm/model_executor/models/glm4_moe.py | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index dbbbc5122b80..a24fa4bcce33 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -615,7 +615,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | | `GLM4VForCausalLM`^ | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | | ✅︎ | ✅︎ | +| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ | diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 131c042c3c2d..aff491f9596c 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, - ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig @@ -118,14 +117,15 @@ def __init__( if config.hidden_act != "silu": raise ValueError(f"Unsupported activation: {config.hidden_act}. " "Only silu is supported for now.") - - self.gate = ReplicatedLinear(config.hidden_size, - config.n_routed_experts, - bias=False, - quant_config=None, - params_dtype=torch.float32, - prefix=f"{prefix}.gate") - + # NOTE In the transformers implementation, the gate isn't an nn.Linear, + # so we cannot use ReplicatedLinear here. 
+ # See: https://github.com/huggingface/transformers/blob/v4.55.1/src/transformers/models/glm4_moe/modeling_glm4_moe.py#L260 + self.gate = nn.Linear( + config.hidden_size, + config.n_routed_experts, + bias=False, + dtype=torch.float32, + ) self.gate.e_score_correction_bias = nn.Parameter( torch.empty(config.n_routed_experts, dtype=torch.float32)) @@ -181,7 +181,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: if self.n_shared_experts is not None: shared_output = self.shared_experts(hidden_states) - router_logits, _ = self.gate(hidden_states.to(dtype=torch.float32)) + router_logits = self.gate(hidden_states.to(dtype=torch.float32)) final_hidden_states = self.experts( hidden_states=hidden_states, router_logits=router_logits) * self.routed_scaling_factor From a21961830dffce0b9ade76b09e05e88a5076bf9d Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu Date: Thu, 14 Aug 2025 23:09:27 +0800 Subject: [PATCH 014/231] [Bugfix] Replace custom Encoding class with BatchEncoding in MistralTokenizer (#22786) Signed-off-by: zjy0516 Signed-off-by: Duncan Moss --- vllm/transformers_utils/tokenizers/mistral.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 6ccc636efaf1..4dd8b2439b3f 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -2,13 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os -from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Any, Optional, Union, cast import huggingface_hub import regex as re from huggingface_hub import HfApi, hf_hub_download +from transformers.tokenization_utils_base import BatchEncoding from vllm.logger import init_logger from vllm.transformers_utils.tokenizer_base import TokenizerBase @@ -27,11 +27,6 @@ logger = init_logger(__name__) -@dataclass -class Encoding: - input_ids: Union[list[int], list[list[int]]] - - def maybe_serialize_tool_calls(request: "ChatCompletionRequest"): # SEE: https://github.com/vllm-project/vllm/pull/9951 # Credits go to: @gcalmettes @@ -359,7 +354,7 @@ def __call__( # For str, single prompt text else: input_ids = self.encode_one(text, truncation, max_length) - return Encoding(input_ids=input_ids) + return BatchEncoding({"input_ids": input_ids}) def get_vocab(self) -> dict[str, int]: # NB: the dictionary form of the vocabulary collapses token ids that map From 65ad49495e454132e12fa7a5cc1263dec78be09f Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 14 Aug 2025 23:09:44 +0800 Subject: [PATCH 015/231] [Bugfix] Fix parsing of `--disable-mm-preprocessor-cache` (#22909) Signed-off-by: DarkLight1337 Signed-off-by: Duncan Moss --- vllm/engine/arg_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c058001ceb97..dd1072da0844 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -711,7 +711,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "--mm-processor-cache-gb", **multimodal_kwargs["mm_processor_cache_gb"]) multimodal_group.add_argument("--disable-mm-preprocessor-cache", - type=bool, + action="store_true", deprecated=True) multimodal_group.add_argument( "--interleave-mm-strings", From 72fda977e281cc05584c528ed50c3701f84515a4 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Thu, 14 Aug 2025 20:01:16 +0200 Subject: [PATCH 016/231] [CI] 
[Hybrid] Bump min transformers version for Bamba and Jamba (#22908) Signed-off-by: Thomas Parnell Signed-off-by: Duncan Moss --- tests/models/registry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index eb48c0f6a773..3efc9a99ea41 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -151,7 +151,7 @@ def check_available_online( "BailingMoeForCausalLM": _HfExamplesInfo("inclusionAI/Ling-lite-1.5", trust_remote_code=True), "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B-v1", - min_transformers_version="4.55.1", + min_transformers_version="4.56.0", extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}), # noqa: E501 "BloomForCausalLM": _HfExamplesInfo("bigscience/bloom-560m", {"1b": "bigscience/bloomz-1b1"}), @@ -227,7 +227,7 @@ def check_available_online( trust_remote_code=True), "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"), "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini", - min_transformers_version="4.55.1", + min_transformers_version="4.56.0", extras={ "tiny": "ai21labs/Jamba-tiny-dev", "random": "ai21labs/Jamba-tiny-random", # noqa: E501 From c446fb4cd9885ff007a874388e2f96af6cdc23ba Mon Sep 17 00:00:00 2001 From: Jinzhen Lin Date: Fri, 15 Aug 2025 02:23:22 +0800 Subject: [PATCH 017/231] [Kernel] [Quantization] Add MXFP4 and bias support for marlin kernel (#22428) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: rongfu.leng Signed-off-by: Jinzhen Lin Signed-off-by: Huzaifa Sidhpurwala Signed-off-by: Varun Sundar Rabindranath Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Jee Jee Li Signed-off-by: mgoin Signed-off-by: Animesh Jain Signed-off-by: Rui Qiao Signed-off-by: Xiongfei Wei Signed-off-by: Nick Hill Signed-off-by: yewentao256 Signed-off-by: kf Signed-off-by: vllmellm Signed-off-by: NickLucche Signed-off-by: Dipika Sikka Signed-off-by: Sage Moore Signed-off-by: tjtanaavllm Signed-off-by: Yong Hoon Shin Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com> Signed-off-by: Roger Wang Signed-off-by: Vadim Gimpelson Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> Signed-off-by: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com> Signed-off-by: DarkLight1337 Signed-off-by: Thomas Parnell Signed-off-by: yan Signed-off-by: Yan Ma Signed-off-by: Xiao Liu Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com> Signed-off-by: Isotr0py Signed-off-by: Ye (Charlotte) Qi Signed-off-by: LopezCastroRoberto Signed-off-by: Andy Xie Signed-off-by: Haibin Lin Signed-off-by: David Ben-David Signed-off-by: Woosuk Kwon Signed-off-by: jiang1.li Signed-off-by: Seiji Eicher Signed-off-by: zitian.zhao Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> Signed-off-by: Abirdcfly Signed-off-by: Giancarlo Delfin Signed-off-by: Tyler Michael Smith Signed-off-by: huangweixiao Signed-off-by: alyosha-swamy Signed-off-by: Eric Hanley Signed-off-by: Abatom Signed-off-by: CLFutureX <775523362@qq.com> Signed-off-by: Linkun Chen Signed-off-by: tjtanaa Signed-off-by: Gregory Shtrasberg Signed-off-by: tlipoca9 Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com> Signed-off-by: zitian zhao Signed-off-by: mgoin Signed-off-by: wang.yuqi Signed-off-by: Benji Beck Signed-off-by: Siyuan Liu Signed-off-by: Benjamin Chislett Signed-off-by: isotr0py <2037008807@qq.com> 
Signed-off-by: Chen Zhang Signed-off-by: simon-mo Signed-off-by: LucasWilkinson Signed-off-by: Zhang Jason Signed-off-by: Yongye Zhu Signed-off-by: asafg Signed-off-by: Siyuan Fu Signed-off-by: Lain Signed-off-by: Max de Bayser Signed-off-by: Lucas Wilkinson Signed-off-by: Kunshang Ji Signed-off-by: Tao He Signed-off-by: Michael Goin Signed-off-by: QscQ Signed-off-by: qingjun Signed-off-by: Syed Muhammad Bin Asif Signed-off-by: Lionel Villard Signed-off-by: ycyaw66 <497410282@qq.com> Signed-off-by: David Chen <530634352@qq.com> Signed-off-by: Linkun Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com> Signed-off-by: Ming Yang Signed-off-by: Adrian Garcia Signed-off-by: shaojunqi Signed-off-by: Ricardo Decal Signed-off-by: Andrew Chan Signed-off-by: Felix Marty Signed-off-by: Andrew Sansom Signed-off-by: Zhiyu Cheng Signed-off-by: Shu Wang Signed-off-by: Po-Han Huang Signed-off-by: Shu Wang. Signed-off-by: XIn Li Signed-off-by: Junhao Li Signed-off-by: chaunceyjiang Signed-off-by: iAmir97 Signed-off-by: iAmir97 <71513472+iAmir97@users.noreply.github.com> Signed-off-by: Signed-off-by: Guy Stone Signed-off-by: Signed-off-by: yyw Signed-off-by: Russell Bryant Signed-off-by: Pradyun Ramadorai Signed-off-by: Pradyun92 <142861237+Pradyun92@users.noreply.github.com> Signed-off-by: Jinzhen Lin Co-authored-by: rongfu.leng Co-authored-by: Huzaifa Sidhpurwala Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Russell Bryant Co-authored-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Jee Jee Li Co-authored-by: Michael Goin Co-authored-by: Animesh Jain Co-authored-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Co-authored-by: XiongfeiWei Co-authored-by: Nick Hill Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: JartX Co-authored-by: fhl2000 <63384265+fhl2000@users.noreply.github.com> Co-authored-by: vllmellm Co-authored-by: kf Co-authored-by: Nicolò Lucchesi Co-authored-by: Dipika Sikka Co-authored-by: Sage Moore Co-authored-by: tjtanaavllm Co-authored-by: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Co-authored-by: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com> Co-authored-by: Roger Wang Co-authored-by: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com> Co-authored-by: Yuxuan Zhang <2448370773@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Cyrus Leung Co-authored-by: Thomas Parnell Co-authored-by: Yan Ma Co-authored-by: Xiao Co-authored-by: jiahanc <173873397+jiahanc@users.noreply.github.com> Co-authored-by: Isotr0py Co-authored-by: Ye (Charlotte) Qi Co-authored-by: Roberto L. 
Castro <38211239+LopezCastroRoberto@users.noreply.github.com> Co-authored-by: Ning Xie Co-authored-by: H Co-authored-by: David Ben-David Co-authored-by: David Ben-David Co-authored-by: Woosuk Kwon Co-authored-by: Li, Jiang Co-authored-by: TankNee Co-authored-by: Cyrus Leung Co-authored-by: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com> Co-authored-by: ZiTian.Zhao Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com> Co-authored-by: Abirdcfly Co-authored-by: Giancarlo Delfin <32987265+TheEpicDolphin@users.noreply.github.com> Co-authored-by: Chenxi Yang Co-authored-by: Chenxi Yang Co-authored-by: Tyler Michael Smith Co-authored-by: Weixiao Huang Co-authored-by: Raghav Ravishankar <113712354+alyosha-swamy@users.noreply.github.com> Co-authored-by: ericehanley Co-authored-by: Zhonghua Deng Co-authored-by: Po-Han Huang (NVIDIA) <53919306+nvpohanh@users.noreply.github.com> Co-authored-by: PiteXChen <44110731+CLFutureX@users.noreply.github.com> Co-authored-by: lkchen Co-authored-by: TJian Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Co-authored-by: tlipoca9 <160737620+tlipoca9@users.noreply.github.com> Co-authored-by: elvischenv <219235043+elvischenv@users.noreply.github.com> Co-authored-by: wang.yuqi Co-authored-by: Benji Beck Co-authored-by: youkaichao Co-authored-by: Siyuan Liu Co-authored-by: Benjamin Chislett Co-authored-by: LiuXiaoxuanPKU Co-authored-by: simon-mo Co-authored-by: Chen Zhang Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com> Co-authored-by: Yongye Zhu Co-authored-by: Lucas Wilkinson Co-authored-by: Zhang Jason Co-authored-by: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com> Co-authored-by: asafg Co-authored-by: Lain Co-authored-by: tc-mb <157115220+tc-mb@users.noreply.github.com> Co-authored-by: imning3 Co-authored-by: Maximilien de Bayser Co-authored-by: Kunshang Ji Co-authored-by: Tao He Co-authored-by: qscqesze Co-authored-by: Syed Muhammad Bin Asif <92625830+syedmba@users.noreply.github.com> Co-authored-by: Lionel Villard Co-authored-by: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Co-authored-by: ycyaw66 <497410282@qq.com> Co-authored-by: Moritz Sanft <58110325+msanft@users.noreply.github.com> Co-authored-by: Ming Yang Co-authored-by: Adrián García García Co-authored-by: Michael Goin Co-authored-by: JaceyShao <65159281+JaceyShao@users.noreply.github.com> Co-authored-by: shaojunqi Co-authored-by: Ricardo Decal Co-authored-by: Andrew Chan Co-authored-by: fxmarty-amd Co-authored-by: Andrew Sansom Co-authored-by: Zhiyu Co-authored-by: Shu Wang Co-authored-by: XIn Li Co-authored-by: Junhao Li Co-authored-by: Chauncey Co-authored-by: iAmir97 <71513472+iAmir97@users.noreply.github.com> Co-authored-by: iAmir97 Co-authored-by: Hong Hanh Co-authored-by: Daniel Serebrenik <74646983+pliops-daniels@users.noreply.github.com> Co-authored-by: yewentao256 Co-authored-by: Guy Stone Co-authored-by: yyweiss <70619747+yyweiss@users.noreply.github.com> Co-authored-by: Pradyun92 <142861237+Pradyun92@users.noreply.github.com> Co-authored-by: Pradyun Ramadorai Co-authored-by: Nicolò Lucchesi --- CMakeLists.txt | 7 + benchmarks/kernels/benchmark_machete.py | 1 + csrc/core/scalar_type.hpp | 2 + csrc/moe/marlin_moe_wna16/generate_kernels.py | 15 ++ csrc/moe/marlin_moe_wna16/kernel.h | 26 +-- csrc/moe/marlin_moe_wna16/marlin_template.h | 137 ++++++++++--- csrc/moe/marlin_moe_wna16/ops.cu | 181 
++++++++++++------ csrc/moe/torch_bindings.cpp | 3 +- csrc/quantization/gptq_marlin/dequant.h | 23 ++- .../gptq_marlin/generate_kernels.py | 17 +- csrc/quantization/gptq_marlin/gptq_marlin.cu | 162 +++++++++++----- csrc/quantization/gptq_marlin/kernel.h | 5 +- .../gptq_marlin/marlin_template.h | 139 +++++++++++--- csrc/torch_bindings.cpp | 1 + tests/kernels/moe/test_moe.py | 175 +++++++++++++---- .../kernels/quantization/test_marlin_gemm.py | 95 +++++++-- tests/kernels/utils.py | 21 +- vllm/_custom_ops.py | 18 +- vllm/envs.py | 11 ++ .../layers/fused_moe/fused_marlin_moe.py | 30 ++- vllm/model_executor/layers/fused_moe/layer.py | 20 +- .../layers/quantization/awq_marlin.py | 13 +- .../compressed_tensors_moe.py | 6 + .../model_executor/layers/quantization/fp8.py | 2 + .../layers/quantization/gptq_marlin.py | 10 +- .../layers/quantization/hqq_marlin.py | 9 +- .../kernels/mixed_precision/marlin.py | 8 +- .../layers/quantization/modelopt.py | 2 + .../layers/quantization/mxfp4.py | 91 ++++++++- .../layers/quantization/utils/marlin_utils.py | 15 +- .../quantization/utils/marlin_utils_fp4.py | 169 +++++++++++++--- .../quantization/utils/marlin_utils_fp8.py | 30 ++- .../layers/quantization/utils/mxfp4_utils.py | 2 +- vllm/scalar_type.py | 2 + 34 files changed, 1126 insertions(+), 322 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 093330caa4f9..5c1a200d1899 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -351,6 +351,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") set_gencode_flags_for_srcs( SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}" CUDA_ARCHS "${MARLIN_ARCHS}") + set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC} + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC}) @@ -364,7 +366,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") set_gencode_flags_for_srcs( SRCS "${MARLIN_SRCS}" CUDA_ARCHS "${MARLIN_ARCHS}") + set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu" + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}") + message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}") else() message(STATUS "Not building Marlin kernels as no compatible archs found" @@ -854,6 +859,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") set_gencode_flags_for_srcs( SRCS "${MOE_WNAA16_MARLIN_SRC}" CUDA_ARCHS "${MARLIN_MOE_ARCHS}") + set_source_files_properties(${MOE_WNAA16_MARLIN_SRC} + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC}) diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index f73d0511e01f..975d10f2e92e 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -236,6 +236,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable: a=bt.a, c=None, b_q_weight=w_q, + b_bias=None, b_scales=w_s, global_scale=None, b_zeros=w_zp, diff --git a/csrc/core/scalar_type.hpp b/csrc/core/scalar_type.hpp index d0f85e23609b..68a8750f583b 100644 --- a/csrc/core/scalar_type.hpp +++ b/csrc/core/scalar_type.hpp @@ -321,6 +321,8 @@ static inline constexpr auto kFE3M2f = ScalarType::float_(3, 2, true, ScalarType::NAN_NONE); static inline constexpr auto kFE4M3fn = ScalarType::float_(4, 3, true, ScalarType::NAN_EXTD_RANGE_MAX_MIN); +static inline constexpr auto kFE8M0fnu = + ScalarType(8, 0, false, 0, true, ScalarType::NAN_EXTD_RANGE_MAX_MIN); static inline constexpr auto kFE5M2 = ScalarType::float_IEEE754(5, 2); static 
inline constexpr auto kFE8M7 = ScalarType::float_IEEE754(8, 7); static inline constexpr auto kFE5M10 = ScalarType::float_IEEE754(5, 10); diff --git a/csrc/moe/marlin_moe_wna16/generate_kernels.py b/csrc/moe/marlin_moe_wna16/generate_kernels.py index 49f33718a21e..698deb107cc0 100644 --- a/csrc/moe/marlin_moe_wna16/generate_kernels.py +++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py @@ -20,6 +20,7 @@ TEMPLATE = ("template __global__ void Marlin<" "{{scalar_t}}, " "{{w_type_id}}, " + "{{s_type_id}}, " "{{threads}}, " "{{thread_m_blocks}}, " "{{thread_n_blocks}}, " @@ -77,6 +78,7 @@ def generate_new_kernels(): if scalar_type == "vllm::kFE4M3fn" and group_blocks not in [-1, 8]: continue # nvfp4 only supports group_size == 16 + # mxfp4 only supports group_size == 32 if scalar_type == "vllm::kFE2M1f" and group_blocks not in [1, 2]: continue # other quantization methods don't support group_size = 16 @@ -89,9 +91,22 @@ def generate_new_kernels(): c_dtype = "half" if dtype == "fp16" else "nv_bfloat16" + if scalar_type == "vllm::kFE2M1f" and group_blocks == 1: + s_type = "vllm::kFE4M3fn" + elif scalar_type == "vllm::kFE2M1f" and group_blocks == 2: + s_type = "vllm::kFE8M0fnu" + if dtype == "fp16": + # we cannot safely dequantize e8m0 to fp16, so skip this + continue + elif dtype == "fp16": + s_type = "vllm::kFloat16" + elif dtype == "bf16": + s_type = "vllm::kBFloat16" + template_str = jinja2.Template(TEMPLATE).render( scalar_t=c_dtype, w_type_id=scalar_type + ".id()", + s_type_id=s_type + ".id()", threads=threads, thread_m_blocks=max(m_blocks, 1), thread_n_blocks=n_blocks, diff --git a/csrc/moe/marlin_moe_wna16/kernel.h b/csrc/moe/marlin_moe_wna16/kernel.h index 537282aba8c8..6190f7ee21ec 100644 --- a/csrc/moe/marlin_moe_wna16/kernel.h +++ b/csrc/moe/marlin_moe_wna16/kernel.h @@ -7,23 +7,25 @@ #include "quantization/gptq_marlin/marlin_dtypes.cuh" #include "core/scalar_type.hpp" -#define MARLIN_KERNEL_PARAMS \ - const int4 *__restrict__ A, const int4 *__restrict__ B, \ - int4 *__restrict__ C, int4 *__restrict__ C_tmp, \ - const int4 *__restrict__ scales_ptr, \ - const uint16_t *__restrict__ scale2_ptr, \ - const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \ - const int32_t *__restrict__ sorted_token_ids_ptr, \ - const int32_t *__restrict__ expert_ids_ptr, \ - const int32_t *__restrict__ num_tokens_past_padded_ptr, \ - const float *__restrict__ topk_weights_ptr, int top_k, \ - bool mul_topk_weights, bool is_ep, int num_groups, int prob_m, \ - int prob_n, int prob_k, int *locks, bool use_atomic_add, \ +#define MARLIN_KERNEL_PARAMS \ + const int4 *__restrict__ A, const int4 *__restrict__ B, \ + int4 *__restrict__ C, int4 *__restrict__ C_tmp, \ + const int4 *__restrict__ b_bias_ptr, \ + const int4 *__restrict__ scales_ptr, \ + const uint16_t *__restrict__ scale2_ptr, \ + const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \ + const int32_t *__restrict__ sorted_token_ids_ptr, \ + const int32_t *__restrict__ expert_ids_ptr, \ + const int32_t *__restrict__ num_tokens_past_padded_ptr, \ + const float *__restrict__ topk_weights_ptr, int top_k, \ + bool mul_topk_weights, bool is_ep, int num_groups, int prob_m, \ + int prob_n, int prob_k, int *locks, bool has_bias, bool use_atomic_add, \ bool use_fp32_reduce, int max_shared_mem namespace MARLIN_NAMESPACE_NAME { template ::value) { + static_assert(s_type == vllm::kBFloat16); + } else if constexpr (std::is_same::value) { + static_assert(s_type == vllm::kFloat16); + } + constexpr bool has_zp = w_type == vllm::kU4 || w_type == 
vllm::kU8; constexpr bool is_int_type = w_type == vllm::kU4 || w_type == vllm::kU8 || w_type == vllm::kU4B8 || w_type == vllm::kU8B128; // see comments of dequant.h for more details constexpr bool dequant_skip_flop = - !is_int_type || + w_type == vllm::kFE4M3fn || + w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn || has_zp && !is_zp_float && !std::is_same::value || has_zp && !is_zp_float && !(w_type == vllm::kU8); @@ -365,6 +379,7 @@ __global__ void Marlin( const int zp_expert_stride = is_zp_float ? prob_n * prob_k / group_size / 8 : prob_n * prob_k / group_size / (pack_factor * 4); + const int b_bias_expert_stride = prob_n / 8; // parallel: num valid moe blocks int num_tokens_past_padded = num_tokens_past_padded_ptr[0]; @@ -475,7 +490,7 @@ __global__ void Marlin( for (int i = 0; i < 4; i++) { int idx = tid4 * 4 + i; idx = idx < block_num_valid_tokens ? idx : 0; - if constexpr (w_type == vllm::kFE2M1f) { + if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) { sh_block_topk_weights[idx] = __hmul2( global_scale, Dtype::num2num2(Dtype::float2num( topk_weights_ptr[sh_block_sorted_ids[idx]]))); @@ -513,7 +528,7 @@ __global__ void Marlin( expert_id = expert_ids_ptr[block_id]; } - if constexpr (w_type == vllm::kFE2M1f) { + if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) { uint16_t val = scale2_ptr[expert_id]; global_scale = Dtype::num2num2(*reinterpret_cast(&val)); } @@ -526,6 +541,9 @@ __global__ void Marlin( if constexpr (has_act_order) { g_idx += (expert_id - old_expert_id) * prob_k; } + if (has_bias) { + b_bias_ptr += (expert_id - old_expert_id) * b_bias_expert_stride; + } read_moe_block_data(block_id); }; @@ -721,7 +739,7 @@ __global__ void Marlin( s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) / 4; - s_sh_rd = s_sh_rd * 2 + warp_row % 2; + s_sh_rd = s_sh_rd * 2 + (warp_row / group_blocks) % 2; } else if constexpr (group_blocks != -1) s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + @@ -734,6 +752,18 @@ __global__ void Marlin( s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) % 4; + int bias_sh_rd; + if constexpr (m_block_size_8) { + bias_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 8; + } else { + bias_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) % 4; + } + + int bias_sh_wr = threadIdx.x; + int bias_gl_rd = (thread_n_blocks * 16 / 8) * slice_col + threadIdx.x; + // Zero-points have the same read layout as the scales // (without column-wise case) constexpr int num_col_threads = 8; @@ -793,7 +823,19 @@ __global__ void Marlin( constexpr int sh_b_size = stages * b_sh_stage; int4* sh_b = sh_new; int4* sh_red = sh_new; - int4* sh_g_idx = sh_b + (sh_red_size > sh_b_size ? sh_red_size : sh_b_size); + + constexpr int sh_size_b_red_min = + (sh_red_size < sh_b_size ? sh_red_size : sh_b_size); + constexpr int sh_size_b_red_max = + (sh_red_size > sh_b_size ? sh_red_size : sh_b_size); + constexpr int sh_bias_size = (thread_n_blocks * 16 / 8); + constexpr int sh_b_red_bias_size = + sh_size_b_red_max > (sh_size_b_red_min + sh_bias_size) + ? sh_size_b_red_max + : (sh_size_b_red_min + sh_bias_size); + + int4* sh_bias = sh_new + sh_size_b_red_min; + int4* sh_g_idx = sh_new + sh_b_red_bias_size; int4* sh_zp = sh_g_idx + (stages * g_idx_stage); constexpr int sh_s_size = has_act_order ? 
(act_s_max_num_groups * s_sh_stride) : (stages * s_sh_stage); @@ -803,9 +845,9 @@ __global__ void Marlin( static_assert(thread_m_blocks * 16 * thread_n_blocks * 16 / 8 <= stages * b_sh_stage); int4* sh_a = sh_s + sh_s_size; - constexpr int shm_size_used = - moe_block_size + stages * (g_idx_stage + zp_sh_stage) + sh_s_size + - (sh_red_size > sh_b_size ? sh_red_size : sh_b_size); + constexpr int shm_size_used = moe_block_size + + stages * (g_idx_stage + zp_sh_stage) + + sh_s_size + sh_b_red_bias_size; // all remaining shared memory is used to cache A (input) // sh_a_max_row is at least ` stages * 16 * thread_m_blocks ` @@ -816,7 +858,8 @@ __global__ void Marlin( FragA frag_a[2][thread_m_blocks]; I4 frag_b_quant[2][b_thread_vecs]; FragC frag_c[thread_m_blocks][4][2]; - FragS frag_s[2][4]; // No act-order + FragS frag_s[2][4]; // No act-order + FragS frag_bias[2][4]; FragS act_frag_s[2][4][4]; // For act-order int frag_qzp[2][num_ints_per_thread]; // Zero-points FragZP frag_zp; // Zero-points in fp16 @@ -1065,10 +1108,15 @@ __global__ void Marlin( if constexpr (w_type_id != vllm::kFE2M1f.id()) { reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride]; - } else { + } else if constexpr (group_blocks == 1 || thread_k_blocks > 4) { reinterpret_cast(&frag_s[k % 2])[0] = reinterpret_cast( sh_s_stage)[s_sh_rd + cur_group_id * (2 * s_sh_stride)]; + } else { + reinterpret_cast(&frag_s[k % 2])[0] = + reinterpret_cast( + sh_s_stage)[s_sh_rd + cur_group_id * (2 * s_sh_stride) + + k % 2]; } } } @@ -1281,9 +1329,9 @@ __global__ void Marlin( int s_quant_0 = reinterpret_cast(frag_s[k2])[0]; int s_quant_1 = reinterpret_cast(frag_s[k2])[1]; - dequant_fp8_scales(s_quant_0, - reinterpret_cast(&frag_s[k2])); - dequant_fp8_scales( + dequant_fp8_scales( + s_quant_0, reinterpret_cast(&frag_s[k2])); + dequant_fp8_scales( s_quant_1, reinterpret_cast(&frag_s[k2]) + 2); } @@ -1566,7 +1614,7 @@ __global__ void Marlin( // Write out the reduce final result in the correct layout. We only actually // reshuffle matrix fragments in this step, the reduction above is performed // in fragment layout. 
- auto write_result = [&]() { + auto write_result = [&](bool last) { int c_gl_stride = prob_n / 8; constexpr int c_sh_stride = 2 * thread_n_blocks + 1; int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks)); @@ -1592,7 +1640,7 @@ __global__ void Marlin( // We first reorder in shared memory to guarantee the most efficient final // global write patterns - auto write = [&](int idx, float c0, float c1, FragS& s) { + auto write = [&](int idx, float c0, float c1, FragS& s, FragS& b_bias) { scalar_t2 res = Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1)); @@ -1601,14 +1649,27 @@ __global__ void Marlin( if constexpr (!has_act_order && group_blocks == -1 && w_type.size_bits() == 4 && (has_zp && dequant_skip_flop || !has_zp)) { - res = __hmul2(res, s[0]); + scalar_t2 tmp_scale = s[0]; + if constexpr (m_block_size_8) { + tmp_scale = Dtype::num2num2( + reinterpret_cast(&s[0])[(threadIdx.x % 8) / 4]); + } + res = __hmul2(res, tmp_scale); } - if constexpr (w_type == vllm::kFE2M1f) { + if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) { if (!mul_topk_weights) { res = __hmul2(res, global_scale); } } + if (has_bias && last) { + scalar_t2 tmp_bias = b_bias[0]; + if constexpr (m_block_size_8) { + tmp_bias = Dtype::num2num2( + reinterpret_cast(&b_bias[0])[(threadIdx.x % 8) / 4]); + } + res = __hadd2(res, tmp_bias); + } if constexpr (m_block_size_8) { ((scalar_t*)sh_red)[idx] = res.x; @@ -1626,19 +1687,25 @@ __global__ void Marlin( if constexpr (m_block_size_8) { int wr = c_sh_wr + 16 * j; write(wr, frag_c[i][j][0][0], frag_c[i][j][0][1], - frag_s[j / 2][2 * (j % 2) + 0]); + frag_s[j / 2][2 * (j % 2) + 0], + frag_bias[j / 2][2 * (j % 2) + 0]); write(wr + 8, frag_c[i][j][0][2], frag_c[i][j][0][3], - frag_s[j / 2][2 * (j % 2) + 1]); + frag_s[j / 2][2 * (j % 2) + 1], + frag_bias[j / 2][2 * (j % 2) + 1]); } else { int wr = c_sh_wr + 8 * j; write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0], - frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]); + frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0], + frag_bias[j / 2][2 * (j % 2) + 0]); write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2], - frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]); + frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0], + frag_bias[j / 2][2 * (j % 2) + 0]); write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0], - frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]); + frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1], + frag_bias[j / 2][2 * (j % 2) + 1]); write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2], - frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]); + frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1], + frag_bias[j / 2][2 * (j % 2) + 1]); } } c_sh_wr += 16 * (4 * c_sh_stride); @@ -1805,6 +1872,14 @@ __global__ void Marlin( } thread_block_reduce(); + + if (has_bias && last) { + __syncthreads(); + cp_async4_pred(&sh_bias[bias_sh_wr], &b_bias_ptr[bias_gl_rd], + threadIdx.x < 16 * thread_n_blocks / 8); + cp_async_fence(); + } + if constexpr (!has_act_order && group_blocks == -1 && (has_zp && dequant_skip_flop || !has_zp)) { if (w_type.size_bits() == 8 || (last || use_atomic_add)) { @@ -1867,11 +1942,20 @@ __global__ void Marlin( } barrier_release(&locks[locks_off], last); } + + if (has_bias && last) { + cp_async_wait<0>(); + __syncthreads(); + reinterpret_cast(&frag_bias)[0] = sh_bias[bias_sh_rd]; + reinterpret_cast(&frag_bias)[1] = sh_bias[bias_sh_rd + 4]; + __syncthreads(); + } + if (use_atomic_add && slice_count > 1 && slice_idx != 0) 
wait_negative_and_add(&locks[locks_off]); if (last || use_atomic_add) // only the last block in a slice actually writes the result - write_result(); + write_result(last); int old_slice_row = slice_row; slice_row = 0; slice_col_par++; @@ -1904,6 +1988,7 @@ __global__ void Marlin( for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; } + bias_gl_rd = (thread_n_blocks * 16 / 8) * slice_col + threadIdx.x; // Update slice k/n for scales loading if constexpr (has_act_order) { slice_k_start = tb_k * slice_row; diff --git a/csrc/moe/marlin_moe_wna16/ops.cu b/csrc/moe/marlin_moe_wna16/ops.cu index 2cff04f699b0..601e2aa6f991 100644 --- a/csrc/moe/marlin_moe_wna16/ops.cu +++ b/csrc/moe/marlin_moe_wna16/ops.cu @@ -51,8 +51,9 @@ __global__ void permute_cols_kernel( } // namespace marlin torch::Tensor moe_wna16_marlin_gemm( - torch::Tensor& a, std::optional const& c_or_none, - torch::Tensor& b_q_weight, torch::Tensor& b_scales, + torch::Tensor& a, std::optional c_or_none, + torch::Tensor& b_q_weight, + std::optional const& b_bias_or_none, torch::Tensor& b_scales, std::optional const& b_zeros_or_none, std::optional const& g_idx_or_none, std::optional const& perm_or_none, torch::Tensor& workspace, @@ -212,7 +213,7 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8, // Get B size int tb_k = th_config.thread_k; int tb_n = th_config.thread_n; - int tb_m = thread_m_blocks * (m_block_size_8 ? 8 : 16); + int tb_m = thread_m_blocks * 16; // shm size for block_sorted_ids/rd_block_sorted_ids/block_topk_weights // both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32) @@ -220,6 +221,11 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8, int sh_a_size = pipe_stages * (tb_m * tb_k) * 2; int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4; int sh_red_size = tb_m * (tb_n + 8) * 2; + int sh_bias_size = tb_n * 2; + int tmp_size = + (sh_b_size > sh_red_size ? 
sh_red_size : sh_b_size) + sh_bias_size; + tmp_size = max(max(sh_b_size, sh_red_size), tmp_size); + int sh_s_size = get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits, group_size, has_act_order, is_k_full); @@ -234,8 +240,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8, sh_zp_size = sh_s_size / 2; } - int total_size = max(sh_b_size, sh_red_size) + sh_a_size + sh_s_size + - sh_zp_size + sh_g_idx_size + sh_block_meta_size; + int total_size = tmp_size + sh_a_size + sh_s_size + sh_zp_size + + sh_g_idx_size + sh_block_meta_size; return total_size; } @@ -270,20 +276,25 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8, int cache_size = get_kernel_cache_size( th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float); - return cache_size <= max_shared_mem; + return cache_size + 512 <= max_shared_mem; } - #define _GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ - M_BLOCK_SIZE_8, GROUP_BLOCKS, NUM_THREADS, IS_ZP_FLOAT) \ - else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS && \ - thread_n_blocks == THREAD_N_BLOCKS && \ - thread_k_blocks == THREAD_K_BLOCKS && \ - m_block_size_8 == M_BLOCK_SIZE_8 && \ - group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS && \ - is_zp_float == IS_ZP_FLOAT) { \ - kernel = Marlin; \ + #define _GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ + M_BLOCK_SIZE_8, GROUP_BLOCKS, NUM_THREADS, IS_ZP_FLOAT) \ + else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS && \ + thread_n_blocks == THREAD_N_BLOCKS && \ + thread_k_blocks == THREAD_K_BLOCKS && \ + m_block_size_8 == M_BLOCK_SIZE_8 && \ + group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS && \ + is_zp_float == IS_ZP_FLOAT) { \ + constexpr auto S_TYPE = \ + W_TYPE == vllm::kFE2M1f \ + ? (GROUP_BLOCKS == 1 ? vllm::kFE4M3fn : vllm::kFE8M0fnu) \ + : (std::is_same::value ? 
vllm::kFloat16 \ + : vllm::kBFloat16); \ + kernel = Marlin; \ } // COMMON: cases for (group_blocks in [-1, 2, 4, 8] and is_zp_float == false) @@ -335,30 +346,44 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8, _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \ _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \ _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \ - \ _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \ _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) - #define FP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + #define BIGGROUP_GET_IF(W_TYPE) \ + BIGGROUP_GET_IF_M1(W_TYPE, 8, 8, 256) \ + BIGGROUP_GET_IF_M1(W_TYPE, 8, 4, 128) \ + BIGGROUP_GET_IF_M234(W_TYPE, 16, 4, 256) \ + BIGGROUP_GET_IF_M234(W_TYPE, 8, 4, 128) + + #define NVFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 1, NUM_THREADS, false) \ _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) - #define FP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + #define NVFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \ _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \ _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) - #define FP4_GET_IF(W_TYPE) \ - FP4_GET_IF_M1(W_TYPE, 8, 8, 256) \ - FP4_GET_IF_M1(W_TYPE, 8, 4, 128) \ - FP4_GET_IF_M234(W_TYPE, 16, 4, 256) \ - FP4_GET_IF_M234(W_TYPE, 8, 4, 128) + #define NVFP4_GET_IF(W_TYPE) \ + NVFP4_GET_IF_M1(W_TYPE, 8, 8, 256) \ + NVFP4_GET_IF_M1(W_TYPE, 8, 4, 128) \ + NVFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \ + NVFP4_GET_IF_M234(W_TYPE, 8, 4, 128) - #define BIGGROUP_GET_IF(W_TYPE) \ - BIGGROUP_GET_IF_M1(W_TYPE, 8, 8, 256) \ - BIGGROUP_GET_IF_M1(W_TYPE, 8, 4, 128) \ - BIGGROUP_GET_IF_M234(W_TYPE, 16, 4, 256) \ - BIGGROUP_GET_IF_M234(W_TYPE, 8, 4, 128) + #define MXFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 2, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) + + #define MXFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) + + #define MXFP4_GET_IF(W_TYPE) \ + MXFP4_GET_IF_M1(W_TYPE, 8, 8, 256) \ + MXFP4_GET_IF_M1(W_TYPE, 8, 4, 128) \ + MXFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \ + MXFP4_GET_IF_M234(W_TYPE, 8, 4, 128) // We currently have 4-bit models only with group_blocks == 4 #define FZP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ @@ -408,12 +433,17 @@ MarlinFuncPtr get_marlin_kernel(const vllm::ScalarType q_type, COMMON_GET_IF(vllm::kU4B8) COMMON_GET_IF(vllm::kU8B128) - BIGGROUP_GET_IF(vllm::kFE4M3fn) + NVFP4_GET_IF(vllm::kFE2M1f) - FP4_GET_IF(vllm::kFE2M1f) + BIGGROUP_GET_IF(vllm::kFE4M3fn) ACT_GET_IF(vllm::kU4B8) ACT_GET_IF(vllm::kU8B128) + if (std::is_same::value) { + if (false) { + } + MXFP4_GET_IF(vllm::kFE2M1f) + } return kernel; } @@ -482,16 +512,16 @@ exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m, } template -void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, - void* s2, void* zp, void* g_idx, void* perm, void* a_tmp, - void* sorted_token_ids, void* expert_ids, +void 
marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias, + void* s, void* s2, void* zp, void* g_idx, void* perm, + void* a_tmp, void* sorted_token_ids, void* expert_ids, void* num_tokens_past_padded, void* topk_weights, int moe_block_size, int top_k, bool mul_topk_weights, bool is_ep, int prob_m, int prob_n, int prob_k, void* workspace, - vllm::ScalarType const& q_type, bool has_act_order, - bool is_k_full, bool has_zp, int num_groups, int group_size, - int dev, cudaStream_t stream, int thread_k, int thread_n, - int sms, bool use_atomic_add, bool use_fp32_reduce, + vllm::ScalarType const& q_type, bool has_bias, + bool has_act_order, bool is_k_full, bool has_zp, int num_groups, + int group_size, int dev, cudaStream_t stream, int thread_k, + int thread_n, int sms, bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) { int thread_m_blocks = div_ceil(moe_block_size, 16); bool m_block_size_8 = moe_block_size == 8; @@ -538,6 +568,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, const int4* B_ptr = (const int4*)B; int4* C_ptr = (int4*)C; int4* C_tmp_ptr = (int4*)C_tmp; + const int4* bias_ptr = (const int4*)b_bias; const int4* s_ptr = (const int4*)s; const uint16_t* s2_ptr = (const uint16_t*)s2; const int4* zp_ptr = (const int4*)zp; @@ -648,10 +679,10 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, // avoid ">>>" being formatted to "> > >" // clang-format off kernel<<>>( - A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, s2_ptr, zp_ptr, g_idx_ptr, + A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, s_ptr, s2_ptr, zp_ptr, g_idx_ptr, sorted_token_ids_ptr, expert_ids_ptr, num_tokens_past_padded_ptr, topk_weights_ptr, top_k, mul_topk_weights, is_ep, num_groups, prob_m, - prob_n, prob_k, locks, use_atomic_add, use_fp32_reduce, max_shared_mem); + prob_n, prob_k, locks, has_bias, use_atomic_add, use_fp32_reduce, max_shared_mem); // clang-format on } @@ -659,7 +690,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, torch::Tensor moe_wna16_marlin_gemm( torch::Tensor& a, std::optional const& c_or_none, - torch::Tensor& b_q_weight, torch::Tensor& b_scales, + torch::Tensor& b_q_weight, + std::optional const& b_bias_or_none, torch::Tensor& b_scales, std::optional const& global_scale_or_none, std::optional const& b_zeros_or_none, std::optional const& g_idx_or_none, @@ -766,7 +798,6 @@ torch::Tensor moe_wna16_marlin_gemm( num_groups = b_scales.size(1); torch::Tensor g_idx, perm, a_tmp; - ; if (g_idx_or_none.has_value() && perm_or_none.has_value()) { g_idx = g_idx_or_none.value(); perm = perm_or_none.value(); @@ -815,12 +846,24 @@ torch::Tensor moe_wna16_marlin_gemm( torch::Tensor global_scale; if (global_scale_or_none.has_value()) { global_scale = global_scale_or_none.value(); - TORCH_CHECK(b_q_type == vllm::kFE2M1f, - "global_scale can only be used for float4_e2m1f."); + TORCH_CHECK(b_q_type == vllm::kFE2M1f && group_size == 16, + "global_scale can only be used for nvfp4 format."); } else { global_scale = torch::empty({0}, options); - TORCH_CHECK(!(b_q_type == vllm::kFE2M1f), - "the global_scale parameter must be passed for float4_e2m1f."); + TORCH_CHECK(!(b_q_type == vllm::kFE2M1f && group_size == 16), + "the global_scale parameter must be passed for nvfp4 format."); + } + + bool has_bias = b_bias_or_none.has_value(); + torch::Tensor b_bias; + if (has_bias) { + b_bias = b_bias_or_none.value(); + TORCH_CHECK(b_bias.device().is_cuda(), "b_bias is not on GPU"); + TORCH_CHECK(b_bias.is_contiguous(), "b_bias is not 
contiguous"); + TORCH_CHECK(b_bias.size(1) == size_n, "b_bias.size(0) != size_n"); + TORCH_CHECK(b_bias.stride(1) == 1, "b_bias.stride(1) != 1"); + } else { + b_bias = torch::empty({0}, options); } torch::Tensor b_zeros; @@ -832,7 +875,6 @@ torch::Tensor moe_wna16_marlin_gemm( b_zeros = torch::empty({0}, options); } bool has_zp = b_zeros.size(-1) > 0; - if (has_zp) { TORCH_CHECK( b_q_type == vllm::kU4 || b_q_type == vllm::kU8, @@ -890,41 +932,58 @@ torch::Tensor moe_wna16_marlin_gemm( if (a.scalar_type() == at::ScalarType::Half) { void* scales_ptr; if (b_q_type == vllm::kFE2M1f) { - scales_ptr = b_scales.data_ptr(); + if (group_size == 16) + scales_ptr = b_scales.data_ptr(); + else if (group_size == 32) + scales_ptr = b_scales.data_ptr(); + else + TORCH_CHECK(false, + "float4_e2m1f only supports group_size == 16 (NVFP4) ", + "and group_size == 32 (MXFP4)"); } else { scales_ptr = b_scales.data_ptr(); } MARLIN_NAMESPACE_NAME::marlin_mm( a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), - c_tmp.data_ptr(), scales_ptr, global_scale.data_ptr(), - b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), - a_tmp.data_ptr(), sorted_token_ids.data_ptr(), - expert_ids.data_ptr(), num_tokens_past_padded.data_ptr(), - topk_weights.data_ptr(), moe_block_size, top_k, mul_topk_weights, is_ep, - size_m, size_n, size_k, workspace.data_ptr(), b_q_type, has_act_order, - is_k_full, has_zp, num_groups, group_size, dev, + c_tmp.data_ptr(), b_bias.data_ptr(), scales_ptr, + global_scale.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(), + perm.data_ptr(), a_tmp.data_ptr(), + sorted_token_ids.data_ptr(), expert_ids.data_ptr(), + num_tokens_past_padded.data_ptr(), topk_weights.data_ptr(), + moe_block_size, top_k, mul_topk_weights, is_ep, size_m, size_n, size_k, + workspace.data_ptr(), b_q_type, has_bias, has_act_order, is_k_full, + has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, use_atomic_add, use_fp32_reduce, is_zp_float); } else if (a.scalar_type() == at::ScalarType::BFloat16) { void* scales_ptr; if (b_q_type == vllm::kFE2M1f) { - scales_ptr = b_scales.data_ptr(); + if (group_size == 16) + scales_ptr = b_scales.data_ptr(); + else if (group_size == 32) + scales_ptr = b_scales.data_ptr(); + else + TORCH_CHECK(false, + "float4_e2m1f only supports group_size == 16 (NVFP4) ", + "and group_size == 32 (MXFP4)"); } else { scales_ptr = b_scales.data_ptr(); } MARLIN_NAMESPACE_NAME::marlin_mm( a.data_ptr(), b_q_weight.data_ptr(), - c.data_ptr(), c_tmp.data_ptr(), scales_ptr, + c.data_ptr(), c_tmp.data_ptr(), + b_bias.data_ptr(), scales_ptr, global_scale.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(), sorted_token_ids.data_ptr(), expert_ids.data_ptr(), num_tokens_past_padded.data_ptr(), topk_weights.data_ptr(), moe_block_size, top_k, mul_topk_weights, is_ep, size_m, size_n, size_k, - workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp, - num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), - thread_k, thread_n, sms, use_atomic_add, use_fp32_reduce, is_zp_float); + workspace.data_ptr(), b_q_type, has_bias, has_act_order, is_k_full, + has_zp, num_groups, group_size, dev, + at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, + use_atomic_add, use_fp32_reduce, is_zp_float); } else { TORCH_CHECK(false, "moe_wna16_marlin_gemm only supports bfloat16 and float16"); diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index d96e082f6ef1..7e49f68f6243 100644 --- a/csrc/moe/torch_bindings.cpp +++ 
b/csrc/moe/torch_bindings.cpp @@ -35,7 +35,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { m.def( "moe_wna16_marlin_gemm(Tensor! a, Tensor? c_or_none," - "Tensor! b_q_weight, Tensor! b_scales, Tensor? global_scale, Tensor? " + "Tensor! b_q_weight, Tensor? b_bias_or_none," + "Tensor! b_scales, Tensor? global_scale, Tensor? " "b_zeros_or_none," "Tensor? g_idx_or_none, Tensor? perm_or_none, Tensor! workspace," "Tensor sorted_token_ids," diff --git a/csrc/quantization/gptq_marlin/dequant.h b/csrc/quantization/gptq_marlin/dequant.h index ae0d6c0f2002..e8b0c302b202 100644 --- a/csrc/quantization/gptq_marlin/dequant.h +++ b/csrc/quantization/gptq_marlin/dequant.h @@ -470,11 +470,12 @@ __device__ inline void dequant( frag_b[0] = __hmul2(frag_b[0], bias_reg); } -template +template __device__ inline void dequant_fp8_scales(int q, scalar_t2* frag_b); template <> -__device__ inline void dequant_fp8_scales(int q, half2* frag_b) { +__device__ inline void dequant_fp8_scales( + int q, half2* frag_b) { int Out1 = (q & 0xFF00FF00) >> 1; ; q <<= 8; @@ -486,8 +487,8 @@ __device__ inline void dequant_fp8_scales(int q, half2* frag_b) { }; template <> -__device__ inline void dequant_fp8_scales(int q, - nv_bfloat162* frag_b) { +__device__ inline void dequant_fp8_scales( + int q, nv_bfloat162* frag_b) { constexpr int FP8_EXPONENT = 4, BF16_EXPONENT = 8; constexpr int RIGHT_SHIFT = BF16_EXPONENT - FP8_EXPONENT; constexpr int MASK = 0x7F007F00; @@ -502,6 +503,20 @@ __device__ inline void dequant_fp8_scales(int q, frag_b[0] = *reinterpret_cast(&Out2); } +template <> +__device__ inline void dequant_fp8_scales( + int q, nv_bfloat162* frag_b) { + // In this conversion, 2 ** -127 in FP8E8M0 would become 0 in BF16, + // but we assume that such a extreme value would not occur in real models. 
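+  // Background for the bit manipulation below, assuming the OCP MX E8M0
+  // layout and the standard BF16 layout (neither is spelled out in this
+  // hunk): an E8M0 scale is a bare 8-bit biased exponent encoding
+  // 2^(E - 127), with no sign or mantissa bits, while BF16 is 1 sign +
+  // 8 exponent + 7 mantissa bits. Each 32-bit q therefore packs four scale
+  // bytes, and the shifts only need to move each exponent byte into bits
+  // 14..7 of its 16-bit lane, leaving sign and mantissa zero; an exponent
+  // byte of 0 lands in a zero BF16 exponent field, which is why 2^-127
+  // degenerates to zero as noted above.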
+ int Out1 = (q & 0xFF00FF00) >> 1; + q <<= 7; + int Out2 = q & 0x7F807F80; + + // Note: reverse indexing is intentional because weights are permuted + frag_b[1] = *reinterpret_cast(&Out1); + frag_b[0] = *reinterpret_cast(&Out2); +} + #endif } // namespace MARLIN_NAMESPACE_NAME diff --git a/csrc/quantization/gptq_marlin/generate_kernels.py b/csrc/quantization/gptq_marlin/generate_kernels.py index 18fb6c1a81f8..7576e0548abe 100644 --- a/csrc/quantization/gptq_marlin/generate_kernels.py +++ b/csrc/quantization/gptq_marlin/generate_kernels.py @@ -20,6 +20,7 @@ TEMPLATE = ("template __global__ void Marlin<" "{{scalar_t}}, " "{{w_type_id}}, " + "{{s_type_id}}, " "{{threads}}, " "{{thread_m_blocks}}, " "{{thread_n_blocks}}, " @@ -78,7 +79,8 @@ def generate_new_kernels(): if scalar_type == "vllm::kFE4M3fn" and group_blocks not in [-1, 8]: continue # nvfp4 only supports group_size == 16 - if scalar_type == "vllm::kFE2M1f" and group_blocks != 1: + # mxfp4 only supports group_size == 32 + if scalar_type == "vllm::kFE2M1f" and group_blocks not in [1, 2]: continue # other quantization methods don't support group_size = 16 if scalar_type != "vllm::kFE2M1f" and group_blocks == 1: @@ -97,10 +99,23 @@ def generate_new_kernels(): # 4bit quantization and fp16 is_zp_float_list.append(True) + if scalar_type == "vllm::kFE2M1f" and group_blocks == 1: + s_type = "vllm::kFE4M3fn" + elif scalar_type == "vllm::kFE2M1f" and group_blocks == 2: + s_type = "vllm::kFE8M0fnu" + if dtype == "fp16": + # we cannot safely dequantize e8m0 to fp16, so skip this + continue + elif dtype == "fp16": + s_type = "vllm::kFloat16" + elif dtype == "bf16": + s_type = "vllm::kBFloat16" + for is_zp_float in is_zp_float_list: template_str = jinja2.Template(TEMPLATE).render( scalar_t=c_dtype, w_type_id=scalar_type + ".id()", + s_type_id=s_type + ".id()", threads=threads, thread_m_blocks=max(m_blocks, 1), thread_n_blocks=n_blocks, diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 4a242f2050d5..cc30abcf0080 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -48,7 +48,8 @@ __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr, torch::Tensor gptq_marlin_gemm( torch::Tensor& a, std::optional c_or_none, - torch::Tensor& b_q_weight, torch::Tensor& b_scales, + torch::Tensor& b_q_weight, + std::optional const& b_bias_or_none, torch::Tensor& b_scales, std::optional const& b_zeros_or_none, std::optional const& g_idx_or_none, std::optional const& perm_or_none, torch::Tensor& workspace, @@ -187,7 +188,12 @@ int get_kernel_cache_size(thread_config_t const& th_config, int thread_m_blocks, int tb_m = thread_m_blocks * 16; int sh_a_size = pipe_stages * (tb_m * tb_k) * 2; int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4; - int sh_red_size = tb_m * (tb_n + 8); + int sh_red_size = tb_m * (tb_n + 8) * 2; + int sh_bias_size = tb_n * 2; + int tmp_size = + (sh_b_size > sh_red_size ? 
sh_red_size : sh_b_size) + sh_bias_size; + tmp_size = max(max(sh_b_size, sh_red_size), tmp_size); + int sh_s_size = get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits, group_size, has_act_order, is_k_full); @@ -202,8 +208,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, int thread_m_blocks, sh_zp_size = sh_s_size / 2; } - int total_size = max(sh_b_size, sh_red_size) + sh_a_size + sh_s_size + - sh_zp_size + sh_g_idx_size; + int total_size = + tmp_size + sh_a_size + sh_s_size + sh_zp_size + sh_g_idx_size; return total_size; } @@ -237,20 +243,25 @@ bool is_valid_config(thread_config_t const& th_config, int thread_m_blocks, int cache_size = get_kernel_cache_size( th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float); - return cache_size <= max_shared_mem; + return cache_size + 512 <= max_shared_mem; } - #define _GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ - M_BLOCK_SIZE_8, GROUP_BLOCKS, NUM_THREADS, IS_ZP_FLOAT) \ - else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS && \ - thread_n_blocks == THREAD_N_BLOCKS && \ - thread_k_blocks == THREAD_K_BLOCKS && \ - m_block_size_8 == M_BLOCK_SIZE_8 && \ - group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS && \ - is_zp_float == IS_ZP_FLOAT) { \ - kernel = Marlin; \ + #define _GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ + M_BLOCK_SIZE_8, GROUP_BLOCKS, NUM_THREADS, IS_ZP_FLOAT) \ + else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS && \ + thread_n_blocks == THREAD_N_BLOCKS && \ + thread_k_blocks == THREAD_K_BLOCKS && \ + m_block_size_8 == M_BLOCK_SIZE_8 && \ + group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS && \ + is_zp_float == IS_ZP_FLOAT) { \ + constexpr auto S_TYPE = \ + W_TYPE == vllm::kFE2M1f \ + ? (GROUP_BLOCKS == 1 ? vllm::kFE4M3fn : vllm::kFE8M0fnu) \ + : (std::is_same::value ? 
vllm::kFloat16 \ + : vllm::kBFloat16); \ + kernel = Marlin; \ } // COMMON: cases for (group_blocks in [-1, 2, 4, 8] and is_zp_float == false) @@ -315,22 +326,39 @@ bool is_valid_config(thread_config_t const& th_config, int thread_m_blocks, BIGGROUP_GET_IF_M234(W_TYPE, 8, 4, 128) \ BIGGROUP_GET_IF_M234(W_TYPE, 4, 8, 128) - #define FP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + #define NVFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 1, NUM_THREADS, false) \ _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) - #define FP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + #define NVFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \ _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \ _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) - #define FP4_GET_IF(W_TYPE) \ - FP4_GET_IF_M1(W_TYPE, 8, 8, 256) \ - FP4_GET_IF_M1(W_TYPE, 8, 4, 128) \ - FP4_GET_IF_M1(W_TYPE, 4, 8, 128) \ - FP4_GET_IF_M234(W_TYPE, 16, 4, 256) \ - FP4_GET_IF_M234(W_TYPE, 8, 4, 128) \ - FP4_GET_IF_M234(W_TYPE, 4, 8, 128) + #define NVFP4_GET_IF(W_TYPE) \ + NVFP4_GET_IF_M1(W_TYPE, 8, 8, 256) \ + NVFP4_GET_IF_M1(W_TYPE, 8, 4, 128) \ + NVFP4_GET_IF_M1(W_TYPE, 4, 8, 128) \ + NVFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \ + NVFP4_GET_IF_M234(W_TYPE, 8, 4, 128) \ + NVFP4_GET_IF_M234(W_TYPE, 4, 8, 128) + + #define MXFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 2, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) + + #define MXFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) + + #define MXFP4_GET_IF(W_TYPE) \ + MXFP4_GET_IF_M1(W_TYPE, 8, 8, 256) \ + MXFP4_GET_IF_M1(W_TYPE, 8, 4, 128) \ + MXFP4_GET_IF_M1(W_TYPE, 4, 8, 128) \ + MXFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \ + MXFP4_GET_IF_M234(W_TYPE, 8, 4, 128) \ + MXFP4_GET_IF_M234(W_TYPE, 4, 8, 128) // We currently have 4-bit models only with group_blocks == 4 #define FZP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ @@ -384,7 +412,7 @@ MarlinFuncPtr get_marlin_kernel(const vllm::ScalarType q_type, COMMON_GET_IF(vllm::kU4B8) COMMON_GET_IF(vllm::kU8B128) - FP4_GET_IF(vllm::kFE2M1f) + NVFP4_GET_IF(vllm::kFE2M1f) BIGGROUP_GET_IF(vllm::kFE4M3fn) @@ -396,6 +424,11 @@ MarlinFuncPtr get_marlin_kernel(const vllm::ScalarType q_type, } FZP_GET_IF(vllm::kU4) } + if (std::is_same::value) { + if (false) { + } + MXFP4_GET_IF(vllm::kFE2M1f) + } return kernel; } @@ -453,12 +486,12 @@ exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m, } template -void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, - void* s2, void* zp, void* g_idx, void* perm, void* a_tmp, - int prob_m, int prob_n, int prob_k, int lda, void* workspace, - vllm::ScalarType const& q_type, bool has_act_order, - bool is_k_full, bool has_zp, int num_groups, int group_size, - int dev, cudaStream_t stream, int thread_k_init, +void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias, + void* s, void* s2, void* zp, void* g_idx, void* perm, + void* a_tmp, int prob_m, int prob_n, int prob_k, int lda, + void* workspace, vllm::ScalarType const& q_type, 
bool has_bias, + bool has_act_order, bool is_k_full, bool has_zp, int num_groups, + int group_size, int dev, cudaStream_t stream, int thread_k_init, int thread_n_init, int sms, bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) { if (has_zp) { @@ -503,6 +536,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, const int4* B_ptr = (const int4*)B; int4* C_ptr = (int4*)C; int4* C_tmp_ptr = (int4*)C_tmp; + const int4* bias_ptr = (const int4*)b_bias; const int4* s_ptr = (const int4*)s; const uint16_t* s2_ptr = (const uint16_t*)s2; const int4* zp_ptr = (const int4*)zp; @@ -623,8 +657,9 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, // avoid ">>>" being formatted to "> > >" // clang-format off kernel<<>>( - A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, s2_ptr, zp_ptr, g_idx_ptr, num_groups, - prob_m_split, prob_n, prob_k, lda, locks, part_use_atomic_add, + A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, s_ptr, s2_ptr, zp_ptr, + g_idx_ptr, num_groups, + prob_m_split, prob_n, prob_k, lda, locks, has_bias, part_use_atomic_add, use_fp32_reduce, max_shared_mem_new); // clang-format on @@ -638,7 +673,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, torch::Tensor gptq_marlin_gemm( torch::Tensor& a, std::optional c_or_none, - torch::Tensor& b_q_weight, torch::Tensor& b_scales, + torch::Tensor& b_q_weight, + std::optional const& b_bias_or_none, torch::Tensor& b_scales, std::optional const& global_scale_or_none, std::optional const& b_zeros_or_none, std::optional const& g_idx_or_none, @@ -785,12 +821,24 @@ torch::Tensor gptq_marlin_gemm( torch::Tensor global_scale; if (global_scale_or_none.has_value()) { global_scale = global_scale_or_none.value(); - TORCH_CHECK(b_q_type == vllm::kFE2M1f, - "global_scale can only be used for float4_e2m1f."); + TORCH_CHECK(b_q_type == vllm::kFE2M1f && group_size == 16, + "global_scale can only be used for nvfp4 format."); } else { global_scale = torch::empty({0}, options); - TORCH_CHECK(!(b_q_type == vllm::kFE2M1f), - "the global_scale parameter must be passed for float4_e2m1f."); + TORCH_CHECK(!(b_q_type == vllm::kFE2M1f && group_size == 16), + "the global_scale parameter must be passed for nvfp4 format."); + } + + bool has_bias = b_bias_or_none.has_value(); + torch::Tensor b_bias; + if (has_bias) { + b_bias = b_bias_or_none.value(); + TORCH_CHECK(b_bias.device().is_cuda(), "b_bias is not on GPU"); + TORCH_CHECK(b_bias.is_contiguous(), "b_bias is not contiguous"); + TORCH_CHECK(b_bias.size(0) == size_n, "b_bias.size(0) != size_n"); + TORCH_CHECK(b_bias.stride(0) == 1, "b_bias.stride(0) != 1"); + } else { + b_bias = torch::empty({0}, options); } torch::Tensor b_zeros; @@ -857,34 +905,50 @@ torch::Tensor gptq_marlin_gemm( if (a.scalar_type() == at::ScalarType::Half) { void* scales_ptr; if (b_q_type == vllm::kFE2M1f) { - scales_ptr = b_scales.data_ptr(); + if (group_size == 16) + scales_ptr = b_scales.data_ptr(); + else if (group_size == 32) + scales_ptr = b_scales.data_ptr(); + else + TORCH_CHECK(false, + "float4_e2m1f only supports group_size == 16 (NVFP4) ", + "and group_size == 32 (MXFP4)"); } else { scales_ptr = b_scales.data_ptr(); } marlin::marlin_mm( a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), - c_tmp.data_ptr(), scales_ptr, global_scale.data_ptr(), - b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), - a_tmp.data_ptr(), size_m, size_n, size_k, a.stride(0), - workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp, - num_groups, group_size, dev, 
at::cuda::getCurrentCUDAStream(dev), - thread_k, thread_n, sms, use_atomic_add, use_fp32_reduce, is_zp_float); + c_tmp.data_ptr(), b_bias.data_ptr(), scales_ptr, + global_scale.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(), + perm.data_ptr(), a_tmp.data_ptr(), size_m, size_n, size_k, + a.stride(0), workspace.data_ptr(), b_q_type, has_bias, has_act_order, + is_k_full, has_zp, num_groups, group_size, dev, + at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, + use_atomic_add, use_fp32_reduce, is_zp_float); } else if (a.scalar_type() == at::ScalarType::BFloat16) { void* scales_ptr; if (b_q_type == vllm::kFE2M1f) { - scales_ptr = b_scales.data_ptr(); + if (group_size == 16) + scales_ptr = b_scales.data_ptr(); + else if (group_size == 32) + scales_ptr = b_scales.data_ptr(); + else + TORCH_CHECK(false, + "float4_e2m1f only supports group_size == 16 (NVFP4) ", + "and group_size == 32 (MXFP4)"); } else { scales_ptr = b_scales.data_ptr(); } marlin::marlin_mm( a.data_ptr(), b_q_weight.data_ptr(), - c.data_ptr(), c_tmp.data_ptr(), scales_ptr, + c.data_ptr(), c_tmp.data_ptr(), + b_bias.data_ptr(), scales_ptr, global_scale.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(), size_m, size_n, size_k, a.stride(0), workspace.data_ptr(), b_q_type, - has_act_order, is_k_full, has_zp, num_groups, group_size, dev, + has_bias, has_act_order, is_k_full, has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, use_atomic_add, use_fp32_reduce, is_zp_float); } else { diff --git a/csrc/quantization/gptq_marlin/kernel.h b/csrc/quantization/gptq_marlin/kernel.h index f92056589d20..bb454f6aff22 100644 --- a/csrc/quantization/gptq_marlin/kernel.h +++ b/csrc/quantization/gptq_marlin/kernel.h @@ -10,15 +10,18 @@ #define MARLIN_KERNEL_PARAMS \ const int4 *__restrict__ A, const int4 *__restrict__ B, \ int4 *__restrict__ C, int4 *__restrict__ C_tmp, \ + const int4 *__restrict__ b_bias_ptr, \ const int4 *__restrict__ scales_ptr, \ const uint16_t *__restrict__ scale2_ptr, \ const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \ int num_groups, int prob_m, int prob_n, int prob_k, int lda, int *locks, \ - bool use_atomic_add, bool use_fp32_reduce, int max_shared_mem + bool has_bias, bool use_atomic_add, bool use_fp32_reduce, \ + int max_shared_mem namespace MARLIN_NAMESPACE_NAME { template ::FragZP; static constexpr auto w_type = vllm::ScalarType::from_id(w_type_id); + static constexpr auto s_type = vllm::ScalarType::from_id(s_type_id); + if constexpr (w_type == vllm::kFE2M1f) { + static_assert(s_type == vllm::kFE4M3fn && group_blocks == 1 || + s_type == vllm::kFE8M0fnu && group_blocks == 2); + } else if constexpr (std::is_same::value) { + static_assert(s_type == vllm::kBFloat16); + } else if constexpr (std::is_same::value) { + static_assert(s_type == vllm::kFloat16); + } + constexpr bool has_zp = w_type == vllm::kU4 || w_type == vllm::kU8; constexpr bool is_int_type = w_type == vllm::kU4 || w_type == vllm::kU8 || w_type == vllm::kU4B8 || w_type == vllm::kU8B128; // see comments of dequant.h for more details constexpr bool dequant_skip_flop = - !is_int_type || + w_type == vllm::kFE4M3fn || + w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn || has_zp && !is_zp_float && !std::is_same::value || has_zp && !is_zp_float && !(w_type == vllm::kU8); scalar_t2 global_scale; - - if constexpr (w_type == vllm::kFE2M1f) { + if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) { + // NVFP4 format requires global scale uint16_t val 
= scale2_ptr[0]; global_scale = Dtype::num2num2(*reinterpret_cast(&val)); } @@ -589,7 +604,7 @@ __global__ void Marlin( s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) / 4; - s_sh_rd = s_sh_rd * 2 + warp_row % 2; + s_sh_rd = s_sh_rd * 2 + (warp_row / group_blocks) % 2; } else if constexpr (group_blocks != -1) s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + @@ -602,6 +617,18 @@ __global__ void Marlin( s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) % 4; + int bias_sh_rd; + if constexpr (m_block_size_8) { + bias_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 8; + } else { + bias_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) % 4; + } + + int bias_sh_wr = threadIdx.x; + int bias_gl_rd = (thread_n_blocks * 16 / 8) * slice_col + threadIdx.x; + // Zero-points have the same read layout as the scales // (without column-wise case) constexpr int num_col_threads = 8; @@ -670,7 +697,19 @@ __global__ void Marlin( constexpr int sh_b_size = stages * b_sh_stage; int4* sh_b = sh; int4* sh_red = sh; - int4* sh_g_idx = sh_b + (sh_red_size > sh_b_size ? sh_red_size : sh_b_size); + + constexpr int sh_size_b_red_min = + (sh_red_size < sh_b_size ? sh_red_size : sh_b_size); + constexpr int sh_size_b_red_max = + (sh_red_size > sh_b_size ? sh_red_size : sh_b_size); + constexpr int sh_bias_size = (thread_n_blocks * 16 / 8); + constexpr int sh_b_red_bias_size = + sh_size_b_red_max > (sh_size_b_red_min + sh_bias_size) + ? sh_size_b_red_max + : (sh_size_b_red_min + sh_bias_size); + + int4* sh_bias = sh + sh_size_b_red_min; + int4* sh_g_idx = sh + sh_b_red_bias_size; int4* sh_zp = sh_g_idx + (stages * g_idx_stage); constexpr int sh_s_size = has_act_order ? (act_s_max_num_groups * s_sh_stride) : (stages * s_sh_stage); @@ -680,15 +719,13 @@ __global__ void Marlin( static_assert(thread_m_blocks * 16 * thread_n_blocks * 16 / 8 <= stages * b_sh_stage); int4* sh_a = sh_s + sh_s_size; - // constexpr int shm_size_used = - // stages * (g_idx_stage + zp_sh_stage) + sh_s_size + - // (sh_red_size > sh_b_size ? sh_red_size : sh_b_size); // Register storage for double buffer of shared memory reads. 
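  // Rough map of these fragments, inferred from the declarations and from the
  // bias handling added later in this patch: frag_a holds the A tiles,
  // frag_b_quant the packed quantized B values, frag_c the per-thread
  // accumulators, frag_s the dequantization scales, and the newly added
  // frag_bias the per-output-column bias values that are staged through
  // shared memory and folded into the result in write_result() only when
  // has_bias && last.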
FragA frag_a[2][thread_m_blocks]; I4 frag_b_quant[2][b_thread_vecs]; FragC frag_c[thread_m_blocks][4][2]; - FragS frag_s[2][4]; // No act-order + FragS frag_s[2][4]; // No act-order + FragS frag_bias[2][4]; FragS act_frag_s[2][4][4]; // For act-order int frag_qzp[2][num_ints_per_thread]; // Zero-points FragZP frag_zp; // Zero-points in fp16 @@ -923,10 +960,15 @@ __global__ void Marlin( if constexpr (w_type_id != vllm::kFE2M1f.id()) { reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride]; - } else { + } else if constexpr (group_blocks == 1 || thread_k_blocks > 4) { reinterpret_cast(&frag_s[k % 2])[0] = reinterpret_cast( sh_s_stage)[s_sh_rd + cur_group_id * (2 * s_sh_stride)]; + } else { + reinterpret_cast(&frag_s[k % 2])[0] = + reinterpret_cast( + sh_s_stage)[s_sh_rd + cur_group_id * (2 * s_sh_stride) + + k % 2]; } } } @@ -1139,9 +1181,9 @@ __global__ void Marlin( int s_quant_0 = reinterpret_cast(frag_s[k2])[0]; int s_quant_1 = reinterpret_cast(frag_s[k2])[1]; - dequant_fp8_scales(s_quant_0, - reinterpret_cast(&frag_s[k2])); - dequant_fp8_scales( + dequant_fp8_scales( + s_quant_0, reinterpret_cast(&frag_s[k2])); + dequant_fp8_scales( s_quant_1, reinterpret_cast(&frag_s[k2]) + 2); } @@ -1411,7 +1453,7 @@ __global__ void Marlin( // Write out the reduce final result in the correct layout. We only actually // reshuffle matrix fragments in this step, the reduction above is performed // in fragment layout. - auto write_result = [&]() { + auto write_result = [&](bool last) { int c_gl_stride = prob_n / 8; constexpr int c_sh_stride = 2 * thread_n_blocks + 1; int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks)); @@ -1438,7 +1480,7 @@ __global__ void Marlin( int c_gl_wr_end = c_gl_stride * prob_m; // We first reorder in shared memory to guarantee the most efficient final // global write patterns - auto write = [&](int idx, float c0, float c1, FragS& s) { + auto write = [&](int idx, float c0, float c1, FragS& s, FragS& b_bias) { scalar_t2 res = Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1)); @@ -1447,12 +1489,25 @@ __global__ void Marlin( if constexpr (!has_act_order && group_blocks == -1 && w_type.size_bits() == 4 && (has_zp && dequant_skip_flop || !has_zp)) { - res = __hmul2(res, s[0]); + scalar_t2 tmp_scale = s[0]; + if constexpr (m_block_size_8) { + tmp_scale = Dtype::num2num2( + reinterpret_cast(&s[0])[(threadIdx.x % 8) / 4]); + } + res = __hmul2(res, tmp_scale); } - if constexpr (w_type == vllm::kFE2M1f) { + if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) { res = __hmul2(res, global_scale); } + if (has_bias && last) { + scalar_t2 tmp_bias = b_bias[0]; + if constexpr (m_block_size_8) { + tmp_bias = Dtype::num2num2( + reinterpret_cast(&b_bias[0])[(threadIdx.x % 8) / 4]); + } + res = __hadd2(res, tmp_bias); + } if constexpr (m_block_size_8) { ((scalar_t*)sh_red)[idx] = res.x; @@ -1470,19 +1525,25 @@ __global__ void Marlin( if constexpr (m_block_size_8) { int wr = c_sh_wr + 16 * j; write(wr, frag_c[i][j][0][0], frag_c[i][j][0][1], - frag_s[j / 2][2 * (j % 2) + 0]); + frag_s[j / 2][2 * (j % 2) + 0], + frag_bias[j / 2][2 * (j % 2) + 0]); write(wr + 8, frag_c[i][j][0][2], frag_c[i][j][0][3], - frag_s[j / 2][2 * (j % 2) + 1]); + frag_s[j / 2][2 * (j % 2) + 1], + frag_bias[j / 2][2 * (j % 2) + 1]); } else { int wr = c_sh_wr + 8 * j; write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0], - frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]); + frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0], + frag_bias[j / 2][2 * (j % 
2) + 0]); write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2], - frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]); + frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0], + frag_bias[j / 2][2 * (j % 2) + 0]); write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0], - frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]); + frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1], + frag_bias[j / 2][2 * (j % 2) + 1]); write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2], - frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]); + frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1], + frag_bias[j / 2][2 * (j % 2) + 1]); } } c_sh_wr += 16 * (4 * c_sh_stride); @@ -1622,6 +1683,14 @@ __global__ void Marlin( } thread_block_reduce(); + + if (has_bias && last) { + __syncthreads(); + cp_async4_pred(&sh_bias[bias_sh_wr], &b_bias_ptr[bias_gl_rd], + threadIdx.x < 16 * thread_n_blocks / 8); + cp_async_fence(); + } + if constexpr (!has_act_order && group_blocks == -1 && (has_zp && dequant_skip_flop || !has_zp)) { if (w_type.size_bits() == 8 || (last || use_atomic_add)) { @@ -1684,11 +1753,20 @@ __global__ void Marlin( } barrier_release(&locks[locks_off], last); } + + if (has_bias && last) { + cp_async_wait<0>(); + __syncthreads(); + reinterpret_cast(&frag_bias)[0] = sh_bias[bias_sh_rd]; + reinterpret_cast(&frag_bias)[1] = sh_bias[bias_sh_rd + 4]; + __syncthreads(); + } + if (use_atomic_add && slice_count > 1 && slice_idx != 0) wait_negative_and_add(&locks[locks_off]); if (last || use_atomic_add) // only the last block in a slice actually writes the result - write_result(); + write_result(last); slice_row = 0; slice_col_par++; slice_col++; @@ -1706,6 +1784,7 @@ __global__ void Marlin( for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; } + bias_gl_rd = (thread_n_blocks * 16 / 8) * slice_col + threadIdx.x; // Update slice k/n for scales loading if constexpr (has_act_order) { slice_k_start = tb_k * slice_row; diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 85b6abef00b0..8c207be083d8 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -326,6 +326,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // gptq_marlin Optimized Quantized GEMM for GPTQ. ops.def( "gptq_marlin_gemm(Tensor a, Tensor? c_or_none, Tensor b_q_weight, " + "Tensor? b_bias_or_none," "Tensor b_scales, Tensor? global_scale, Tensor? b_zeros_or_none, Tensor? " "g_idx_or_none, Tensor? 
perm_or_none, Tensor workspace, int b_q_type, " "SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, " diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 49c097718e30..b82c74a42ab3 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -24,8 +24,10 @@ fused_topk, modular_triton_fused_moe) from vllm.model_executor.layers.fused_moe.moe_torch_iterative import ( fused_moe as iterative_moe) +from vllm.model_executor.layers.quantization.utils.marlin_utils import ( + marlin_permute_bias) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( - rand_marlin_weight_fp4_like) + rand_marlin_weight_mxfp4_like, rand_marlin_weight_nvfp4_like) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( marlin_quant_fp8_torch) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( @@ -476,8 +478,11 @@ def is_invalid(m, n, k, e, topk, ep_size, dtype, group_size, act_order, if quant_type == scalar_types.float8_e4m3fn and \ group_size not in [-1, 128]: return False - if quant_type == scalar_types.float4_e2m1f and group_size != 16: - return False + if quant_type == scalar_types.float4_e2m1f: + if group_size not in [16, 32]: + return False + if dtype == torch.float16 and group_size == 32: + return False if quant_type != scalar_types.float4_e2m1f and group_size == 16: return False @@ -520,31 +525,6 @@ def test_fused_marlin_moe( torch.cuda.manual_seed(0) has_zp = quant_type in [scalar_types.uint4, scalar_types.uint8] - if quant_type == scalar_types.float8_e4m3fn: - if group_size not in [-1, 128]: - return - if act_order: - return - - # Filter act_order - if act_order: - if quant_type == scalar_types.float8_e4m3fn: - return - if group_size == -1: - return - if group_size in (k, n): - return - if has_zp: - return - else: - if not is_k_full: - return - - if quant_type == scalar_types.float4_e2m1f and group_size != 16: - return - if quant_type != scalar_types.float4_e2m1f and group_size == 16: - return - a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 20 w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 20 @@ -569,13 +549,19 @@ def test_fused_marlin_moe( for i in range(w1.shape[0]): if quant_type == scalar_types.float4_e2m1f: - w_ref1, qweight1, scales1, global_scale1 = \ - rand_marlin_weight_fp4_like(w1[i], group_size) + if group_size == 16: + w_ref1, qweight1, scales1, global_scale1 = \ + rand_marlin_weight_nvfp4_like(w1[i], group_size) + else: + w_ref1, qweight1, scales1 = \ + rand_marlin_weight_mxfp4_like(w1[i], group_size) + global_scale1 = None w_ref1_l.append(w_ref1.T) qweight1_l.append(qweight1) scales1_l.append(scales1) - global_scale1_l.append(global_scale1) + if global_scale1 is not None: + global_scale1_l.append(global_scale1) elif quant_type == scalar_types.float8_e4m3fn: w_ref1, qweight1, scales1 = marlin_quant_fp8_torch( w1[i], group_size) @@ -620,13 +606,19 @@ def test_fused_marlin_moe( for i in range(w2.shape[0]): if quant_type == scalar_types.float4_e2m1f: - w_ref2, qweight2, scales2, global_scale2 = \ - rand_marlin_weight_fp4_like(w2[i], group_size) + if group_size == 16: + w_ref2, qweight2, scales2, global_scale2 = \ + rand_marlin_weight_nvfp4_like(w2[i], group_size) + else: + w_ref2, qweight2, scales2 = \ + rand_marlin_weight_mxfp4_like(w2[i], group_size) + global_scale2 = None w_ref2_l.append(w_ref2.T) qweight2_l.append(qweight2) scales2_l.append(scales2) - 
global_scale2_l.append(global_scale2) + if global_scale2 is not None: + global_scale2_l.append(global_scale2) elif quant_type == scalar_types.float8_e4m3fn: w_ref2, qweight2, scales2 = marlin_quant_fp8_torch( w2[i], group_size) @@ -677,6 +669,8 @@ def test_fused_marlin_moe( a, qweight1, qweight2, + None, + None, scales1, scales2, score, @@ -698,6 +692,119 @@ def test_fused_marlin_moe( torch.testing.assert_close(marlin_output, torch_output, atol=5e-2, rtol=0) +@pytest.mark.flaky(reruns=2) +@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm") +@pytest.mark.parametrize("m", [1, 256]) +def test_fused_marlin_moe_with_bias(m): + torch.cuda.manual_seed(0) + + e, topk = 32, 4 + n, k = 2048, 2048 + group_size = 128 + act_order = False + is_k_full = True + quant_type = scalar_types.uint4b8 + dtype = torch.half + + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 + b_bias1 = torch.randn((e, 2 * n), device="cuda", dtype=dtype) / 10 + b_bias2 = torch.randn((e, k), device="cuda", dtype=dtype) / 10 + + b_bias1_l = [] + w_ref1_l = [] + qweight1_l = [] + scales1_l = [] + g_idx1_l = [] + sort_indices1_l = [] + + for i in range(w1.shape[0]): + test_perm = torch.randperm(k) + w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = \ + marlin_quantize(w1[i].transpose(1, 0), quant_type, + group_size, act_order, test_perm) + + w_ref1_l.append(w_ref1.T) + qweight1_l.append(qweight1) + scales1_l.append(scales1) + g_idx1_l.append(g_idx1) + sort_indices1_l.append(sort_indices1) + b_bias1_l.append(marlin_permute_bias(b_bias1[i])) + + w_ref1 = stack_and_dev(w_ref1_l) + qweight1 = stack_and_dev(qweight1_l).contiguous() + scales1 = stack_and_dev(scales1_l) + global_scale1 = None + g_idx1 = stack_and_dev(g_idx1_l) if g_idx1_l else None + zeros1 = None + sort_indices1 = stack_and_dev(sort_indices1_l) if sort_indices1_l else None + marlin_bias1 = stack_and_dev(b_bias1_l) if b_bias1_l else None + + b_bias2_l = [] + w_ref2_l = [] + qweight2_l = [] + scales2_l = [] + g_idx2_l = [] + sort_indices2_l = [] + + for i in range(w2.shape[0]): + test_perm = torch.randperm(n) + w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = \ + marlin_quantize(w2[i].transpose(1, 0), quant_type, + group_size, act_order, test_perm) + + w_ref2_l.append(w_ref2.T) + qweight2_l.append(qweight2) + scales2_l.append(scales2) + g_idx2_l.append(g_idx2) + sort_indices2_l.append(sort_indices2) + b_bias2_l.append(marlin_permute_bias(b_bias2[i])) + + w_ref2 = stack_and_dev(w_ref2_l) + qweight2 = stack_and_dev(qweight2_l).contiguous() + scales2 = stack_and_dev(scales2_l) + global_scale2 = None + g_idx2 = stack_and_dev(g_idx2_l) if g_idx2_l else None + zeros2 = None + sort_indices2 = stack_and_dev(sort_indices2_l) if sort_indices2_l else None + marlin_bias2 = stack_and_dev(b_bias2_l) if b_bias2_l else None + + score = torch.randn((m, e), device="cuda", dtype=dtype) + + topk_weights, topk_ids, _ = fused_topk(a, score, topk, False) + + with set_current_vllm_config(vllm_config): + torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, b_bias1, + b_bias2) + + marlin_output = torch.ops.vllm.fused_marlin_moe( + a, + qweight1, + qweight2, + marlin_bias1, + marlin_bias2, + scales1, + scales2, + score, + topk_weights, + topk_ids, + global_num_experts=e, + expert_map=None, + global_scale1=global_scale1, + global_scale2=global_scale2, + g_idx1=g_idx1, + g_idx2=g_idx2, + sort_indices1=sort_indices1, + 
sort_indices2=sort_indices2, + w1_zeros=zeros1, + w2_zeros=zeros2, + quant_type_id=quant_type.id, + is_k_full=is_k_full) + + torch.testing.assert_close(marlin_output, torch_output, atol=5e-2, rtol=0) + + def test_moe_align_block_size_opcheck(): num_experts = 4 block_size = 4 diff --git a/tests/kernels/quantization/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py index 92914bd5cbba..1bd6713ce7fb 100644 --- a/tests/kernels/quantization/test_marlin_gemm.py +++ b/tests/kernels/quantization/test_marlin_gemm.py @@ -19,10 +19,11 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import ( GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, MARLIN_SUPPORTED_GROUP_SIZES, marlin_make_empty_g_idx, - marlin_make_workspace_new, marlin_permute_scales, + marlin_make_workspace_new, marlin_permute_bias, marlin_permute_scales, query_marlin_supported_quant_types) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( - FP4_MARLIN_SUPPORTED_GROUP_SIZES, rand_marlin_weight_fp4_like) + FP4_MARLIN_SUPPORTED_GROUP_SIZES, rand_marlin_weight_mxfp4_like, + rand_marlin_weight_nvfp4_like) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( marlin_quant_fp8_torch) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( @@ -39,7 +40,7 @@ ACT_ORDER_OPTS = [False, True] K_FULL_OPTS = [False, True] USE_ATOMIC_ADD_OPTS = [False, True] -USE_FP32_REDUCE_OPTS = [False, True] +USE_FP32_REDUCE_OPTS = [True] MARLIN_K_CHUNKS = [128] MARLIN_N_CHUNKS = [64, 256] @@ -202,17 +203,10 @@ def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size, @pytest.mark.parametrize("is_k_full", K_FULL_OPTS) @pytest.mark.parametrize("use_atomic_add", USE_ATOMIC_ADD_OPTS) @pytest.mark.parametrize("use_fp32_reduce", USE_FP32_REDUCE_OPTS) -def test_gptq_marlin_gemm( - k_chunk, - n_chunk, - quant_type, - group_size, - mnk_factors, - act_order, - is_k_full, - use_atomic_add, - use_fp32_reduce, -): +@pytest.mark.parametrize("dtype", DTYPES) +def test_gptq_marlin_gemm(k_chunk, n_chunk, quant_type, group_size, + mnk_factors, act_order, is_k_full, use_atomic_add, + use_fp32_reduce, dtype): m_factor, n_factor, k_factor = mnk_factors has_zp = quant_type in [scalar_types.uint4, scalar_types.uint8] @@ -231,14 +225,23 @@ def test_gptq_marlin_gemm( if size_k % group_size != 0: return - a_input = rand_data((size_m, size_k)) - b_weight = rand_data((size_k, size_n)) + a_input = rand_data((size_m, size_k), dtype) + b_weight = rand_data((size_k, size_n), dtype) if quant_type == scalar_types.float4_e2m1f: - if group_size != 16 or act_order: + if group_size not in [16, 32] or act_order: return - w_ref, marlin_q_w, marlin_s, marlin_s2 = rand_marlin_weight_fp4_like( - b_weight.T, group_size) + if group_size == 32 and dtype == torch.float16: + return + + if group_size == 16: + w_ref, marlin_q_w, marlin_s, marlin_s2 = \ + rand_marlin_weight_nvfp4_like(b_weight.T, group_size) + else: + w_ref, marlin_q_w, marlin_s = \ + rand_marlin_weight_mxfp4_like(b_weight.T, group_size) + marlin_s2 = None + g_idx = None sort_indices = None marlin_zp = None @@ -272,8 +275,8 @@ def test_gptq_marlin_gemm( workspace = marlin_make_workspace_new(w_ref.device) opcheck(torch.ops._C.gptq_marlin_gemm, - (a_input, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, g_idx, - sort_indices, workspace, quant_type.id, a_input.shape[0], + (a_input, None, marlin_q_w, None, marlin_s, marlin_s2, marlin_zp, + g_idx, sort_indices, workspace, quant_type.id, a_input.shape[0], b_weight.shape[1], 
a_input.shape[1], is_k_full, use_atomic_add, use_fp32_reduce, False), test_utils=DEFAULT_OPCHECK_TEST_UTILS) @@ -282,6 +285,7 @@ def test_gptq_marlin_gemm( a_input, None, marlin_q_w, + None, marlin_s, marlin_s2, marlin_zp, @@ -418,6 +422,7 @@ def test_hqq_marlin_gemm( a_input, None, marlin_w_q, + None, marlin_s, None, marlin_zp, @@ -531,6 +536,7 @@ def test_marlin_gemm_subset_input(): a_input, None, marlin_q_w, + None, marlin_s, None, marlin_zp, @@ -555,6 +561,53 @@ def test_marlin_gemm_subset_input(): assert max_diff < 0.04 +@pytest.mark.parametrize("size_m", [1, 256]) +def test_marlin_gemm_with_bias(size_m): + quant_type = scalar_types.uint4b8 + group_size = 128 + + size_k, size_n = 1024, 2048 + a_input = rand_data((size_m, size_k)) + b_weight = rand_data((size_k, size_n)) + b_bias = rand_data((size_n, )) * 10 + + marlin_bias = marlin_permute_bias(b_bias) + + w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize( + b_weight, quant_type, group_size, False) + + marlin_zp = marlin_make_empty_g_idx(marlin_s.device) + workspace = marlin_make_workspace_new(a_input.device) + + output = ops.gptq_marlin_gemm( + a_input, + None, + marlin_q_w, + marlin_bias, + marlin_s, + None, + marlin_zp, + g_idx, + sort_indices, + workspace, + quant_type, + a_input.shape[0], + b_weight.shape[1], + a_input.shape[1], + is_k_full=True, + use_atomic_add=False, + use_fp32_reduce=True, + is_zp_float=False, + ) + output_ref = torch.matmul(a_input, w_ref) + b_bias.view(1, -1) + + torch.cuda.synchronize() + + max_diff = compute_max_diff(output, output_ref) + + assert max_diff < 0.04 + + def test_marlin_gemm_opcheck(): size_m = 2048 size_n = 4096 diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 2e8febbdcf26..fa4125840a01 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -1064,6 +1064,8 @@ def torch_experts( topk_weight: torch.Tensor, topk_ids: torch.Tensor, global_num_experts: int = -1, + b_bias1: Optional[torch.Tensor] = None, + b_bias2: Optional[torch.Tensor] = None, expert_map: Optional[torch.Tensor] = None, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, @@ -1108,8 +1110,13 @@ def torch_experts( if mask.sum(): if quant_dtype is None: tmp1 = a[mask] @ w1[i].transpose(0, 1) + if b_bias1 is not None: + tmp1 = tmp1 + b_bias1[i].view(1, -1).to(tmp1.dtype) tmp2 = SiluAndMul()(tmp1) out[mask] = tmp2 @ w2[i].transpose(0, 1) + if b_bias2 is not None: + out[mask] = out[mask] + b_bias2[i].view(1, -1).to( + tmp1.dtype) elif block_shape is not None: # block quantized assert (a_scale is not None and w1_scale is not None @@ -1117,6 +1124,8 @@ def torch_experts( tmp1 = native_w8a8_block_matmul(a[mask], w1[i], a_scale[mask], w1_scale[i], block_shape, out.dtype) + if b_bias1 is not None: + tmp1 = tmp1 + b_bias1[i].view(1, -1).to(tmp1.dtype) tmp2 = SiluAndMul()(tmp1) tmp2, b_scale = moe_kernel_quantize_input( tmp2, a2_scale, quant_dtype, per_act_token_quant, @@ -1125,6 +1134,9 @@ def torch_experts( out[mask] = native_w8a8_block_matmul(tmp2, w2[i], b_scale, w2_scale[i], block_shape, out.dtype) + if b_bias2 is not None: + out[mask] = out[mask] + b_bias2[i].view(1, -1).to( + tmp1.dtype) else: assert (a_scale is not None and w1_scale is not None and w2_scale is not None) @@ -1133,6 +1145,8 @@ def torch_experts( tmp1 = a[mask].to(f32) * scales w1_dq = (w1[i].to(f32) * w1_scale[i]).transpose(0, 1) tmp1 = (tmp1 @ w1_dq).to(out.dtype) + if b_bias1 is not None: + tmp1 = tmp1 + b_bias1[i].view(1, -1).to(out.dtype) tmp2 = SiluAndMul()(tmp1).to(out.dtype) @@ 
-1144,6 +1158,9 @@ def torch_experts( tmp2 = tmp2.to(f32) * b_scale w2_dq = (w2[i].to(f32) * w2_scale[i]).transpose(0, 1) out[mask] = (tmp2 @ w2_dq).to(out.dtype) + if b_bias2 is not None: + out[mask] = out[mask] + b_bias2[i].view(1, -1).to( + out.dtype) if apply_router_weights_on_input: return out @@ -1157,12 +1174,14 @@ def torch_moe(a: torch.Tensor, w2: torch.Tensor, score: torch.Tensor, topk: int, + b_bias1: Optional[torch.Tensor] = None, + b_bias2: Optional[torch.Tensor] = None, global_num_experts: int = -1, expert_map: Optional[torch.Tensor] = None) -> torch.Tensor: score = torch.softmax(score, dim=-1, dtype=torch.float32) topk_weight, topk_ids = torch.topk(score, topk) return torch_experts(a, w1, w2, topk_weight, topk_ids, global_num_experts, - expert_map) + b_bias1, b_bias2, expert_map) def torch_moe_single(a, w, score, topk): diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 70605d3c5f52..a020b171e894 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -452,6 +452,7 @@ def _gptq_marlin_24_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor, def _gptq_marlin_gemm_fake(a: torch.Tensor, c: Optional[torch.Tensor], b_q_weight: torch.Tensor, + b_bias: Optional[torch.Tensor], b_scales: torch.Tensor, global_scale: Optional[torch.Tensor], b_zeros: Optional[torch.Tensor], @@ -1048,6 +1049,7 @@ def awq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor, def gptq_marlin_gemm(a: torch.Tensor, c: Optional[torch.Tensor], b_q_weight: torch.Tensor, + b_bias: Optional[torch.Tensor], b_scales: torch.Tensor, global_scale: Optional[torch.Tensor], b_zeros: Optional[torch.Tensor], @@ -1062,7 +1064,7 @@ def gptq_marlin_gemm(a: torch.Tensor, use_atomic_add: bool = False, use_fp32_reduce: bool = False, is_zp_float: bool = False) -> torch.Tensor: - return torch.ops._C.gptq_marlin_gemm(a, c, b_q_weight, b_scales, + return torch.ops._C.gptq_marlin_gemm(a, c, b_q_weight, b_bias, b_scales, global_scale, b_zeros, g_idx, perm, workspace, b_q_type.id, size_m, size_n, size_k, is_k_full, @@ -1540,7 +1542,9 @@ def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor, def moe_wna16_marlin_gemm(input: torch.Tensor, output: Optional[torch.Tensor], - b_qweight: torch.Tensor, b_scales: torch.Tensor, + b_qweight: torch.Tensor, + b_bias: Optional[torch.Tensor], + b_scales: torch.Tensor, global_scale: Optional[torch.Tensor], b_qzeros: Optional[torch.Tensor], g_idx: Optional[torch.Tensor], @@ -1556,11 +1560,11 @@ def moe_wna16_marlin_gemm(input: torch.Tensor, output: Optional[torch.Tensor], use_fp32_reduce: bool, is_zp_float: bool) -> torch.Tensor: return torch.ops._moe_C.moe_wna16_marlin_gemm( - input, output, b_qweight, b_scales, global_scale, b_qzeros, g_idx, - perm, workspace, sorted_token_ids, expert_ids, num_tokens_past_padded, - topk_weights, moe_block_size, top_k, mul_topk_weights, is_ep, - b_q_type.id, size_m, size_n, size_k, is_k_full, use_atomic_add, - use_fp32_reduce, is_zp_float) + input, output, b_qweight, b_bias, b_scales, global_scale, b_qzeros, + g_idx, perm, workspace, sorted_token_ids, expert_ids, + num_tokens_past_padded, topk_weights, moe_block_size, top_k, + mul_topk_weights, is_ep, b_q_type.id, size_m, size_n, size_k, + is_k_full, use_atomic_add, use_fp32_reduce, is_zp_float) if supports_moe_ops and hasattr(torch.ops._moe_C, "marlin_gemm_moe"): diff --git a/vllm/envs.py b/vllm/envs.py index 145ec3495a0c..110bb542b120 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -122,6 +122,7 @@ VLLM_MOE_DP_CHUNK_SIZE: int = 256 VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False 
VLLM_MARLIN_USE_ATOMIC_ADD: bool = False + VLLM_MXFP4_USE_MARLIN: Optional[bool] = None VLLM_V0_USE_OUTLINES_CACHE: bool = False VLLM_V1_USE_OUTLINES_CACHE: bool = False VLLM_TPU_BUCKET_PADDING_GAP: int = 0 @@ -182,6 +183,12 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: return int(value) +def maybe_convert_bool(value: Optional[str]) -> Optional[bool]: + if value is None: + return None + return bool(int(value)) + + def get_vllm_port() -> Optional[int]: """Get the port from VLLM_PORT environment variable. @@ -906,6 +913,10 @@ def get_vllm_port() -> Optional[int]: "VLLM_MARLIN_USE_ATOMIC_ADD": lambda: os.environ.get("VLLM_MARLIN_USE_ATOMIC_ADD", "0") == "1", + # Whether to use marlin kernel in mxfp4 quantization method + "VLLM_MXFP4_USE_MARLIN": + lambda: maybe_convert_bool(os.environ.get("VLLM_MXFP4_USE_MARLIN", None)), + # Whether to turn on the outlines cache for V0 # This cache is unbounded and on disk, so it's not safe to use in # an environment with potentially malicious users. diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 1988c73ba7e2..a49d41c18438 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -18,6 +18,8 @@ def fused_marlin_moe(hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, + bias1: Optional[torch.Tensor], + bias2: Optional[torch.Tensor], w1_scale: torch.Tensor, w2_scale: torch.Tensor, gating_output: torch.Tensor, @@ -26,6 +28,7 @@ def fused_marlin_moe(hidden_states: torch.Tensor, quant_type_id: int, apply_router_weight_on_input: bool = False, global_num_experts: int = -1, + activation: Optional[str] = "silu", expert_map: Optional[torch.Tensor] = None, global_scale1: Optional[torch.Tensor] = None, global_scale2: Optional[torch.Tensor] = None, @@ -88,6 +91,7 @@ def fused_marlin_moe(hidden_states: torch.Tensor, assert w2.is_contiguous(), "Expert weights2 must be contiguous" assert hidden_states.dtype in [torch.float16, torch.bfloat16] assert num_bits in [4, 8] + assert topk_weights.dtype == torch.float32 M, K = hidden_states.shape E = w1.shape[0] @@ -138,6 +142,7 @@ def fused_marlin_moe(hidden_states: torch.Tensor, hidden_states, intermediate_cache1, w1, + bias1, w1_scale, global_scale1, w1_zeros, @@ -161,8 +166,28 @@ def fused_marlin_moe(hidden_states: torch.Tensor, use_fp32_reduce=True, is_zp_float=False) - torch.ops._C.silu_and_mul(intermediate_cache2, - intermediate_cache1.view(-1, 2 * N)) + if activation == "silu": + torch.ops._C.silu_and_mul(intermediate_cache2, + intermediate_cache1.view(-1, 2 * N)) + elif activation == "swiglu_oai": + # NOTE: in gpt-oss, the gate_proj and up_proj is interleaved + # - interleaved: gate, up = gate_up[..., ::2], gate_up[..., 1::2] + # - origin: gate, up = gate_up[..., :N], gate_up[..., N:] + + @torch.compile(dynamic=True) + def swiglu_oai(gate_up): + alpha = 1.702 + limit = 7.0 + gate, up = gate_up[..., ::2], gate_up[..., 1::2] + gate = gate.clamp(min=None, max=limit) + up = up.clamp(min=-limit, max=limit) + glu = gate * torch.sigmoid(gate * alpha) + return (up + 1) * glu + + intermediate_cache2 = swiglu_oai(intermediate_cache1) + else: + raise ValueError(f"Unsupported activation: {activation}. 
" + "Only silu and swiglu_oai activations are supported.") if expert_map is not None: intermediate_cache3.zero_() @@ -171,6 +196,7 @@ def fused_marlin_moe(hidden_states: torch.Tensor, intermediate_cache2, intermediate_cache3, w2, + bias2, w2_scale, global_scale2, w2_zeros, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index ddc02168e5c4..36e75825853e 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -36,7 +36,7 @@ from vllm.platforms import current_platform from vllm.platforms.interface import CpuArchEnum from vllm.utils import (direct_register_custom_op, has_deep_ep, has_pplx, - has_triton_kernels, is_torch_equal_or_newer, round_up) + round_up) from vllm.utils.flashinfer import has_flashinfer if current_platform.is_cuda_alike(): @@ -751,19 +751,11 @@ def __init__( self.global_num_experts = num_experts + num_redundant_experts # we padding globally so EP buffer allocation works - if quant_config and quant_config.get_name() == "mxfp4": - if not current_platform.is_device_capability(100): - if not is_torch_equal_or_newer("2.8.0"): - raise RuntimeError( - "Mxfp4 on non-blackwell requires torch >= 2.8.0") - if not has_triton_kernels(): - raise NotImplementedError( - "triton_kernels must be installed for " - "mxfp4 on non-blackwell") - if (current_platform.is_rocm() - or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 - or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): - hidden_size = round_up(hidden_size, 256) + if (quant_config and quant_config.get_name() == "mxfp4" + and (current_platform.is_rocm() + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16)): + hidden_size = round_up(hidden_size, 256) # For smuggling this layer into the fused moe custom op compilation_config = vllm_config.compilation_config diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 6cf02658a94c..ed7ffb21e85a 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -25,7 +25,7 @@ apply_awq_marlin_linear, awq_to_marlin_zero_points, check_marlin_supported, check_marlin_supports_layer, check_moe_marlin_supports_layer, marlin_make_empty_g_idx, marlin_make_workspace_new, - marlin_moe_permute_scales, marlin_permute_scales, + marlin_moe_permute_scales, marlin_permute_bias, marlin_permute_scales, moe_awq_to_marlin_zero_points, verify_marlin_supported, verify_marlin_supports_shape) from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead @@ -303,6 +303,9 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.g_idx = marlin_make_empty_g_idx(device) layer.g_idx_sort_indices = marlin_make_empty_g_idx(device) + if hasattr(layer, "bias") and layer.bias is not None: + layer.bias.data = marlin_permute_bias(layer.bias) + def apply( self, layer: torch.nn.Module, @@ -469,6 +472,12 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: num_bits=self.quant_config.weight_bits) replace_parameter(layer, "w2_qzeros", marlin_w2_zp) + if hasattr(layer, "w13_bias") and layer.w13_bias is not None: + layer.w13_bias.data = marlin_permute_bias(layer.w13_bias) + + if hasattr(layer, "w2_bias") and layer.w2_bias is not None: + layer.w2_bias.data = marlin_permute_bias(layer.w2_bias) + def apply( self, layer: torch.nn.Module, @@ -513,6 +522,8 @@ def apply( x, layer.w13_qweight, layer.w2_qweight, 
+ getattr(layer, "w13_bias", None), + getattr(layer, "w2_bias", None), layer.w13_scales, layer.w2_scales, router_logits, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index c04f7c39a5f5..839942beaf40 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -324,6 +324,8 @@ def apply( x, layer.w13_weight, layer.w2_weight, + None, + None, layer.w13_weight_scale, layer.w2_weight_scale, router_logits, @@ -795,6 +797,8 @@ def apply( x, layer.w13_weight, layer.w2_weight, + None, + None, layer.w13_weight_scale, layer.w2_weight_scale, router_logits, @@ -1253,6 +1257,8 @@ def apply( x, layer.w13_weight_packed, layer.w2_weight_packed, + None, + None, layer.w13_weight_scale, layer.w2_weight_scale, router_logits, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 9577fa025b70..5e107c799b9f 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -983,6 +983,8 @@ def apply( x, layer.w13_weight, layer.w2_weight, + None, + None, layer.w13_weight_scale, layer.w2_weight_scale, router_logits, diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 9bed5e2e4889..3299221e3af3 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -24,7 +24,7 @@ get_dynamic_override, get_linear_quant_method, override_config) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( check_marlin_supported, check_moe_marlin_supports_layer, - marlin_make_workspace_new, marlin_moe_permute_scales, + marlin_make_workspace_new, marlin_moe_permute_scales, marlin_permute_bias, marlin_repeat_scales_on_all_ranks, verify_marlin_supported) from vllm.model_executor.parameter import (ChannelQuantScaleParameter, GroupQuantScaleParameter, @@ -618,6 +618,12 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: ) replace_parameter(layer, "w2_scales", marlin_w2_scales) + if hasattr(layer, "w13_bias") and layer.w13_bias is not None: + layer.w13_bias.data = marlin_permute_bias(layer.w13_bias) + + if hasattr(layer, "w2_bias") and layer.w2_bias is not None: + layer.w2_bias.data = marlin_permute_bias(layer.w2_bias) + def apply( self, layer: torch.nn.Module, @@ -662,6 +668,8 @@ def apply( x, layer.w13_qweight, layer.w2_qweight, + getattr(layer, "w13_bias", None), + getattr(layer, "w2_bias", None), layer.w13_scales, layer.w2_scales, router_logits, diff --git a/vllm/model_executor/layers/quantization/hqq_marlin.py b/vllm/model_executor/layers/quantization/hqq_marlin.py index ee8a0e34b32e..8385ccac32a2 100644 --- a/vllm/model_executor/layers/quantization/hqq_marlin.py +++ b/vllm/model_executor/layers/quantization/hqq_marlin.py @@ -14,7 +14,7 @@ QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, - marlin_make_empty_g_idx, marlin_permute_scales) + marlin_make_empty_g_idx, marlin_permute_bias, marlin_permute_scales) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( MarlinWorkspace) from vllm.model_executor.layers.quantization.utils.quant_utils 
import gptq_pack @@ -284,6 +284,9 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.marlin_zeros = marlin_zp layer.marlin_scales = marlin_s + if hasattr(layer, "bias") and layer.bias is not None: + layer.bias.data = marlin_permute_bias(layer.bias) + def apply( self, layer: torch.nn.Module, @@ -307,6 +310,7 @@ def apply( x, None, layer.marlin_qweight, + bias, scales, None, zeros, @@ -326,7 +330,4 @@ def apply( if orig_type != torch.float16: marlin_out = marlin_out.to(orig_type) - if bias is not None: - marlin_out.add_(bias) - return marlin_out diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py index 73e0b17ea85a..5eb99383097b 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py @@ -9,8 +9,9 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import ( MARLIN_SUPPORTED_GROUP_SIZES, apply_gptq_marlin_linear, check_marlin_supports_shape, marlin_is_k_full, marlin_make_empty_g_idx, - marlin_make_workspace_new, marlin_permute_scales, marlin_sort_g_idx, - marlin_zero_points, query_marlin_supported_quant_types, unpack_cols) + marlin_make_workspace_new, marlin_permute_bias, marlin_permute_scales, + marlin_sort_g_idx, marlin_zero_points, query_marlin_supported_quant_types, + unpack_cols) from vllm.model_executor.parameter import (BasevLLMParameter, permute_param_layout_) from vllm.platforms import current_platform @@ -111,6 +112,9 @@ def transform_w_s(x): self._transform_param(layer, self.w_q_name, transform_w_q) self._transform_param(layer, self.w_s_name, transform_w_s) + if hasattr(layer, "bias") and layer.bias is not None: + layer.bias.data = marlin_permute_bias(layer.bias) + def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index bed502226716..8868c623796a 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1330,6 +1330,8 @@ def apply( x, layer.w13_weight, layer.w2_weight, + None, + None, layer.w13_weight_scale, layer.w2_weight_scale, router_logits, diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 160e78434949..594f5136ecc3 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -15,13 +15,17 @@ from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) +from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( + prepare_moe_fp4_layer_for_marlin) from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( _can_support_mxfp4, _swizzle_mxfp4) from vllm.model_executor.layers.quantization.utils.quant_utils import ( is_layer_skipped) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils import next_power_of_2, round_up +from vllm.scalar_type import scalar_types +from vllm.utils import (has_triton_kernels, is_torch_equal_or_newer, + next_power_of_2, round_up) if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): @@ -81,6 +85,21 @@ def __init__(self, moe: 
FusedMoEConfig): super().__init__() self.topk_indices_dtype = None self.moe = moe + self.use_marlin = self._should_use_marlin() + + def _should_use_marlin(self): + if envs.VLLM_MXFP4_USE_MARLIN is not None: + return envs.VLLM_MXFP4_USE_MARLIN + if current_platform.is_cuda() and \ + not current_platform.has_device_capability(100): + if not current_platform.is_device_capability(90): + # marlin kernel has better performance on ampere + return True + if not has_triton_kernels(): + return True + if not is_torch_equal_or_newer("2.8.0"): + return True + return False def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -101,11 +120,29 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, intermediate_size_per_partition_after_pad = \ intermediate_size_per_partition - # pad the intermediate size to be a multiple of 2 * mxfp4_block - # for to hold non-uniform sharded tensor as well as swizzling - # other padding to increase performance - if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 - or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): + if self.use_marlin: + # The moe marlin kernel requires that for each linear + # n % 256 == 0 and k % 128 == 0. + # In gate_up_proj: + # n = 2 * intermediate_size_per_partition_after_pad + # k = hidden_size + # In down_proj + # n = hidden_size + # k = intermediate_size_per_partition_after_pad + intermediate_size_per_partition_after_pad = round_up( + intermediate_size_per_partition, 128) + hidden_size = round_up(hidden_size, 256) + + layer.params_dtype = params_dtype + layer.num_experts = num_experts + layer.hidden_size = hidden_size + layer.intermediate_size_per_partition = \ + intermediate_size_per_partition_after_pad + elif (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): + # pad the intermediate size to be a multiple of 2 * mxfp4_block + # for to hold non-uniform sharded tensor as well as swizzling + # other padding to increase performance intermediate_size_per_partition_after_pad = round_up( intermediate_size_per_partition, 256) hidden_size = round_up(hidden_size, 256) @@ -191,8 +228,10 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, set_weight_attrs(w2_bias, extra_weight_attrs) def process_weights_after_loading(self, layer): - if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 - or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16) and current_platform.is_device_capability(100): + if self.use_marlin: + prepare_moe_fp4_layer_for_marlin(layer) + elif (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): layer.gemm1_alpha = Parameter(torch.tensor( [1.702] * self.num_experts, dtype=torch.float32).cuda(), requires_grad=False) @@ -442,13 +481,45 @@ def apply( if enable_eplb: raise NotImplementedError("EPLB is not supported for mxfp4") + if self.use_marlin: + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias) + + return torch.ops.vllm.fused_marlin_moe( + x, + layer.w13_weight, + layer.w2_weight, + layer.w13_bias, + layer.w2_bias, + layer.w13_weight_scale, + layer.w2_weight_scale, + router_logits, + topk_weights, + topk_ids, + global_scale1=None, + global_scale2=None, + 
quant_type_id=scalar_types.float4_e2m1f.id, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + activation=activation, + expert_map=expert_map) + assert _can_support_mxfp4( use_grouped_topk, topk_group, num_expert_group, expert_map, custom_routing_function, e_score_correction_bias, apply_router_weight_on_input, scoring_func, activation, expert_load_view, logical_to_physical_map, - logical_replica_count), ("MXFP4 are not supported\ - with this configuration.") + logical_replica_count), ( + "MXFP4 are not supported with this configuration.") if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16) and current_platform.is_device_capability(100): diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 7540a1516fcb..02057b476c6e 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -261,6 +261,13 @@ def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int, return s +def marlin_permute_bias(s: torch.Tensor) -> torch.Tensor: + origin_shape = s.shape + _, scale_perm_single = get_scale_perms() + s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single] + return s.reshape(*origin_shape).contiguous() + + def marlin_moe_permute_scales( s: torch.Tensor, size_k: int, @@ -410,6 +417,7 @@ def apply_gptq_marlin_linear( output = ops.gptq_marlin_gemm(reshaped_x, None, weight, + bias, weight_scale, None, weight_zp, @@ -425,9 +433,6 @@ def apply_gptq_marlin_linear( use_fp32_reduce=use_fp32_reduce, is_zp_float=False) - if bias is not None: - output.add_(bias) # In-place add - return output.reshape(out_shape) @@ -456,6 +461,7 @@ def apply_awq_marlin_linear( output = ops.gptq_marlin_gemm(reshaped_x, None, weight, + bias, weight_scale, None, weight_zp, @@ -470,7 +476,4 @@ def apply_awq_marlin_linear( use_fp32_reduce=use_fp32_reduce, is_zp_float=False) - if bias is not None: - output.add_(bias) # In-place add - return output.reshape(out_shape) diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py index ca10db69dc16..94ffdcd26ecd 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py @@ -8,8 +8,8 @@ import vllm._custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.marlin_utils import ( - USE_FP32_REDUCE_DEFAULT, marlin_make_workspace_new, marlin_permute_scales, - should_use_atomic_add_reduce) + USE_FP32_REDUCE_DEFAULT, marlin_make_workspace_new, marlin_permute_bias, + marlin_permute_scales, should_use_atomic_add_reduce) from vllm.platforms import current_platform from vllm.scalar_type import scalar_types @@ -22,7 +22,7 @@ def is_fp4_marlin_supported(): return current_platform.has_device_capability(80) -def fp4_marlin_process_scales(marlin_scales): +def nvfp4_marlin_process_scales(marlin_scales): if not (marlin_scales >= 0).all(): logger.warning_once( "NVFP4 Marlin assumes the scales to be >=0, but has encountered " @@ -56,7 +56,20 @@ def fp4_marlin_process_scales(marlin_scales): return marlin_scales -def fp4_marlin_process_global_scale(global_scale): +def mxfp4_marlin_process_scales(marlin_scales): + # 8 is the number of scale number using by one thread + marlin_scales = 
marlin_scales.view(marlin_scales.size(0) // 2, 2, -1, 8) + marlin_scales = marlin_scales.permute(0, 2, 1, 3).reshape( + marlin_scales.size(0) * 2, -1) + + # fit the layout of fp8 dequantization + marlin_scales = marlin_scales.view(-1, 4)[:, [0, 2, 1, 3]].view( + marlin_scales.size(0), -1) + marlin_scales = marlin_scales.to(torch.float8_e8m0fnu) + return marlin_scales + + +def nvfp4_marlin_process_global_scale(global_scale): assert global_scale.dtype in [torch.half, torch.bfloat16] fp4_exponent = 2 if global_scale.dtype == torch.half: @@ -73,7 +86,7 @@ def apply_fp4_marlin_linear( input: torch.Tensor, weight: torch.Tensor, weight_scale: torch.Tensor, - weight_scale_2: torch.Tensor, + weight_scale_2: Optional[torch.Tensor], workspace: torch.Tensor, size_n: int, size_k: int, @@ -94,6 +107,7 @@ def apply_fp4_marlin_linear( output = ops.gptq_marlin_gemm(a=reshaped_x, c=None, b_q_weight=weight, + b_bias=bias, b_scales=weight_scale, global_scale=weight_scale_2, b_zeros=None, @@ -107,9 +121,6 @@ def apply_fp4_marlin_linear( use_atomic_add=use_atomic_add, use_fp32_reduce=use_fp32_reduce) - if bias is not None: - output.add_(bias) # In-place add - return output.reshape(out_shape) @@ -120,6 +131,9 @@ def prepare_fp4_layer_for_marlin(layer: torch.nn.Module) -> None: "be used leveraging the Marlin kernel. This may degrade " "performance for compute-heavy workloads.") + is_nvfp4 = hasattr(layer, "weight_scale_2") + group_size = 16 if is_nvfp4 else 32 + part_size_n = layer.output_size_per_partition part_size_k = layer.input_size_per_partition param_dtype = layer.params_dtype @@ -145,18 +159,35 @@ def prepare_fp4_layer_for_marlin(layer: torch.nn.Module) -> None: # WEIGHT SCALES # Permute scales - weight_scale = layer.weight_scale.T.to(param_dtype) + weight_scale = layer.weight_scale.T.contiguous() + + if not is_nvfp4: + weight_scale = weight_scale.view(torch.float8_e8m0fnu) + + weight_scale = weight_scale.to(param_dtype) weight_scale = marlin_permute_scales(s=weight_scale, size_k=part_size_k, size_n=part_size_n, - group_size=16) - weight_scale = fp4_marlin_process_scales(weight_scale) - layer.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False) - - weight_scale_2 = layer.weight_scale_2.to(param_dtype) - weight_scale_2 = fp4_marlin_process_global_scale(weight_scale_2) - layer.weight_scale_2 = torch.nn.Parameter(weight_scale_2, - requires_grad=False) + group_size=group_size) + + if is_nvfp4: + weight_scale = nvfp4_marlin_process_scales(weight_scale) + layer.weight_scale = torch.nn.Parameter(weight_scale, + requires_grad=False) + + weight_scale_2 = layer.weight_scale_2.to(param_dtype) + weight_scale_2 = nvfp4_marlin_process_global_scale(weight_scale_2) + layer.weight_scale_2 = torch.nn.Parameter(weight_scale_2, + requires_grad=False) + else: + weight_scale = mxfp4_marlin_process_scales(weight_scale) + layer.weight_scale = torch.nn.Parameter(weight_scale, + requires_grad=False) + + if hasattr(layer, "bias") and layer.bias is not None: + assert layer.bias.shape == (part_size_n, ) + bias = marlin_permute_bias(layer.bias) + layer.bias = torch.nn.Parameter(bias, requires_grad=False) return @@ -168,6 +199,9 @@ def prepare_moe_fp4_layer_for_marlin(layer: torch.nn.Module) -> None: "be used leveraging the Marlin kernel. 
This may degrade " "performance for compute-heavy workloads.") + is_nvfp4 = hasattr(layer, "w13_weight_scale_2") + group_size = 16 if is_nvfp4 else 32 + e = layer.num_experts k = layer.hidden_size n = layer.intermediate_size_per_partition @@ -208,8 +242,13 @@ def prepare_moe_fp4_layer_for_marlin(layer: torch.nn.Module) -> None: # WEIGHT SCALES # Permute scales for name in ["w13", "w2"]: - scales = getattr(layer, name + "_weight_scale").to(param_dtype) - global_scale = getattr(layer, name + "_weight_scale_2").to(param_dtype) + scales = getattr(layer, name + "_weight_scale") + if not is_nvfp4: + scales = scales.view(torch.float8_e8m0fnu) + scales = scales.to(param_dtype) + if is_nvfp4: + global_scale = getattr(layer, + name + "_weight_scale_2").to(param_dtype) tensor_list = [] if "w13" in name: @@ -218,23 +257,47 @@ def prepare_moe_fp4_layer_for_marlin(layer: torch.nn.Module) -> None: size_n, size_k = k, n for i in range(e): - marlin_scales = marlin_permute_scales(s=scales[i].T, + scale = scales[i].T + + marlin_scales = marlin_permute_scales(s=scale, size_k=size_k, size_n=size_n, - group_size=16) - marlin_scales = fp4_marlin_process_scales(marlin_scales) + group_size=group_size) + if is_nvfp4: + marlin_scales = nvfp4_marlin_process_scales(marlin_scales) + else: + marlin_scales = mxfp4_marlin_process_scales(marlin_scales) tensor_list.append(marlin_scales) scales = torch.cat([x.unsqueeze(0) for x in tensor_list], 0) scales = torch.nn.Parameter(scales, requires_grad=False) setattr(layer, name + "_weight_scale", scales) - global_scale = fp4_marlin_process_global_scale(global_scale) - global_scale = torch.nn.Parameter(global_scale, requires_grad=False) - setattr(layer, name + "_weight_scale_2", global_scale) + if is_nvfp4: + global_scale = nvfp4_marlin_process_global_scale(global_scale) + global_scale = torch.nn.Parameter(global_scale, + requires_grad=False) + setattr(layer, name + "_weight_scale_2", global_scale) + + # BIAS + # Permute bias + for name in ["w13_bias", "w2_bias"]: + if not hasattr(layer, name): + continue + bias = getattr(layer, name).to(param_dtype) + + tensor_list = [] + for i in range(e): + expert_bias = bias[i] + + tensor_list.append(marlin_permute_bias(expert_bias)) + + bias = torch.cat([x.unsqueeze(0) for x in tensor_list], 0) + bias = torch.nn.Parameter(bias, requires_grad=False) + setattr(layer, name, bias) -def rand_marlin_weight_fp4_like(weight, group_size): +def rand_marlin_weight_nvfp4_like(weight, group_size): assert group_size > 0 size_n, size_k = weight.shape device = weight.device @@ -276,8 +339,58 @@ def rand_marlin_weight_fp4_like(weight, group_size): size_k=size_k, size_n=size_n, group_size=group_size) - marlin_scales = fp4_marlin_process_scales(marlin_scales) + marlin_scales = nvfp4_marlin_process_scales(marlin_scales) - global_scale = fp4_marlin_process_global_scale(global_scale) + global_scale = nvfp4_marlin_process_global_scale(global_scale) return weight_ref.T, marlin_qweight, marlin_scales, global_scale + + +def rand_marlin_weight_mxfp4_like(weight, group_size): + assert group_size > 0 + size_n, size_k = weight.shape + device = weight.device + + scales = torch.randint(100, + 125, (size_n, size_k // group_size), + dtype=torch.uint8, + device=weight.device) + scales = scales.view(torch.float8_e8m0fnu) + + fp4_weight = torch.randint(0, + 256, (size_n, size_k // 2), + dtype=torch.uint8, + device=weight.device) + fp4_weight_part_1 = ((fp4_weight & 0b10000000) | + ((fp4_weight & 0b01110000) >> 2)) + fp4_weight_part_1 = fp4_weight_part_1.view(torch.float8_e4m3fn) 
+ fp4_weight_part_1 = fp4_weight_part_1.to(weight.dtype) * (2**6) + + fp4_weight2 = fp4_weight << 4 + fp4_weight_part_2 = ((fp4_weight2 & 0b10000000) | + ((fp4_weight2 & 0b01110000) >> 2)) + fp4_weight_part_2 = fp4_weight_part_2.view(torch.float8_e4m3fn) + fp4_weight_part_2 = fp4_weight_part_2.to(weight.dtype) * (2**6) + + weight_ref = torch.cat( + [fp4_weight_part_2.unsqueeze(2), + fp4_weight_part_1.unsqueeze(2)], 2).view(size_n, size_k) + weight_ref = weight_ref * \ + scales.repeat_interleave(group_size, 1).to(weight.dtype) + + marlin_qweight = ops.gptq_marlin_repack( + b_q_weight=fp4_weight.view(torch.int32).T.contiguous(), + perm=torch.empty(0, dtype=torch.int, device=device), + size_k=size_k, + size_n=size_n, + num_bits=4, + ) + + marlin_scales = marlin_permute_scales(s=scales.T.to(weight.dtype), + size_k=size_k, + size_n=size_n, + group_size=group_size) + + marlin_scales = mxfp4_marlin_process_scales(marlin_scales) + + return weight_ref.T, marlin_qweight, marlin_scales.to(torch.float8_e8m0fnu) diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py index 5372c49d9838..511e19545d5a 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -8,8 +8,8 @@ import vllm._custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.marlin_utils import ( - USE_FP32_REDUCE_DEFAULT, marlin_make_workspace_new, marlin_permute_scales, - should_use_atomic_add_reduce) + USE_FP32_REDUCE_DEFAULT, marlin_make_workspace_new, marlin_permute_bias, + marlin_permute_scales, should_use_atomic_add_reduce) from vllm.platforms import current_platform from vllm.scalar_type import scalar_types @@ -58,6 +58,7 @@ def apply_fp8_marlin_linear( output = ops.gptq_marlin_gemm(a=reshaped_x, c=None, b_q_weight=weight, + b_bias=bias, b_scales=weight_scale, global_scale=None, b_zeros=None, @@ -71,9 +72,6 @@ def apply_fp8_marlin_linear( use_atomic_add=use_atomic_add, use_fp32_reduce=use_fp32_reduce) - if bias is not None: - output.add_(bias) # In-place add - return output.reshape(out_shape) @@ -160,6 +158,11 @@ def prepare_fp8_layer_for_marlin(layer: torch.nn.Module, marlin_scales = fp8_fused_exponent_bias_into_scales(marlin_scales) layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False) + if hasattr(layer, "bias") and layer.bias is not None: + assert layer.bias.shape == (part_size_n, ) + bias = marlin_permute_bias(layer.bias) + layer.bias = torch.nn.Parameter(bias, requires_grad=False) + def prepare_moe_fp8_layer_for_marlin(layer: torch.nn.Module, size_k_first: bool = True) -> None: @@ -274,6 +277,23 @@ def prepare_moe_fp8_layer_for_marlin(layer: torch.nn.Module, setattr(layer, name + "_weight_scale", scales) + # BIAS + # Permute bias + for name in ["w13_bias", "w2_bias"]: + if not hasattr(layer, name): + continue + bias = getattr(layer, name).to(layer.orig_dtype) + + tensor_list = [] + for i in range(e): + expert_bias = bias[i] + + tensor_list.append(marlin_permute_bias(expert_bias)) + + bias = torch.cat([x.unsqueeze(0) for x in tensor_list], 0) + bias = torch.nn.Parameter(bias, requires_grad=False) + setattr(layer, name, bias) + def pack_fp8_to_int32(fp8_tensor: torch.Tensor, size_k_first: bool = True) -> torch.Tensor: diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index 
95eabe149d89..deeb69bcad0e 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -61,7 +61,7 @@ def _can_support_mxfp4(use_grouped_topk: bool = False, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, scoring_func: str = "softmax", - activation: str = "silu", + activation: str = "swiglu_oai", expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None): diff --git a/vllm/scalar_type.py b/vllm/scalar_type.py index 9060b55c79b0..6f11ab8e0300 100644 --- a/vllm/scalar_type.py +++ b/vllm/scalar_type.py @@ -327,6 +327,8 @@ class scalar_types: uint8 = ScalarType.uint(8, None) float8_e4m3fn = ScalarType.float_(4, 3, True, NanRepr.EXTD_RANGE_MAX_MIN) float8_e5m2 = ScalarType.float_IEEE754(5, 2) + float8_e8m0fnu = ScalarType(8, 0, False, 0, True, + NanRepr.EXTD_RANGE_MAX_MIN) float16_e8m7 = ScalarType.float_IEEE754(8, 7) float16_e5m10 = ScalarType.float_IEEE754(5, 10) From adb7678a7a5bdf80921dda052e15a93a8aa93eb3 Mon Sep 17 00:00:00 2001 From: Nir Date: Thu, 14 Aug 2025 22:56:54 +0300 Subject: [PATCH 018/231] docs: update fastsafetensors usage instructions (#22891) Signed-off-by: Nir Levy Signed-off-by: Duncan Moss --- docs/models/extensions/fastsafetensor.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/models/extensions/fastsafetensor.md b/docs/models/extensions/fastsafetensor.md index 531d58690014..2a5a18102dc2 100644 --- a/docs/models/extensions/fastsafetensor.md +++ b/docs/models/extensions/fastsafetensor.md @@ -2,4 +2,5 @@ Loading Model weights with fastsafetensors =================================================================== Using fastsafetensors library enables loading model weights to GPU memory by leveraging GPU direct storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details. 
-For enabling this feature, set the environment variable ``USE_FASTSAFETENSOR`` to ``true`` + +To enable this feature, use the ``--load-format fastsafetensors`` command-line argument From d24c6bbe8459bc803e68f02a7e46031791c49eae Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 14 Aug 2025 15:59:16 -0400 Subject: [PATCH 019/231] [CI] Temporarily disable flaky test (#22930) Signed-off-by: Lucas Wilkinson Signed-off-by: Duncan Moss --- tests/v1/e2e/test_spec_decode.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index dde95fbe590b..7b3f45831279 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -162,6 +162,12 @@ def test_eagle_correctness( mm_enabled: bool, attn_backend: str, ): + if attn_backend == "TREE_ATTN": + # TODO: Fix this flaky test + pytest.skip( + "TREE_ATTN is flaky in the test disable for now until it can be " + "reolved (see https://github.com/vllm-project/vllm/issues/22922)") + # Generate test prompts inside the function instead of using fixture test_prompts = get_test_prompts(mm_enabled) ''' From 79e4f5a864d6d33c363d145772756fcfacc5de5a Mon Sep 17 00:00:00 2001 From: nvjullin Date: Fri, 15 Aug 2025 04:03:55 +0800 Subject: [PATCH 020/231] [Kernel] Add nvfp4 gemm flashinfer backends (#22346) Signed-off-by: Julien Lin Signed-off-by: mgoin Co-authored-by: mgoin Signed-off-by: Duncan Moss --- .buildkite/test-pipeline.yaml | 1 + .../test_flashinfer_nvfp4_scaled_mm.py | 139 ++++++++++++++++++ .../quantization/test_nvfp4_scaled_mm.py | 3 + vllm/envs.py | 7 + .../schemes/compressed_tensors_w4a4_nvfp4.py | 64 ++++++-- .../layers/quantization/modelopt.py | 84 ++++++++--- vllm/model_executor/warmup/kernel_warmup.py | 39 ++++- vllm/utils/flashinfer.py | 71 +++++++++ vllm/v1/worker/gpu_worker.py | 4 +- 9 files changed, 371 insertions(+), 41 deletions(-) create mode 100644 tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 740be2bc8770..942a8d3f9bfd 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -669,6 +669,7 @@ steps: - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py # Fusion - pytest -v -s tests/compile/test_fusion_all_reduce.py diff --git a/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py b/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py new file mode 100644 index 000000000000..131086a5f703 --- /dev/null +++ b/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py @@ -0,0 +1,139 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch +from nvfp4_utils import (FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, + convert_swizzled_to_linear, dequantize_nvfp4_to_dtype) + +from vllm import _custom_ops as ops +from vllm.platforms import current_platform +from vllm.utils.flashinfer import flashinfer_scaled_fp4_mm + +if not current_platform.has_device_capability(100): + pytest.skip( + reason="Nvfp4 Requires compute capability of 10 or above.", + allow_module_level=True, + ) + +DTYPES = [torch.float16, torch.bfloat16] +# m, n, k +SHAPES = [(128, 128, 64), (128, 128, 
128), (256, 128, 64), (128, 256, 128)] +PAD_SHAPES = [(150, 128, 64), (128, 128, 96)] +SHAPES.extend(PAD_SHAPES) + +SEEDS = [42] +CUDA_DEVICES = ["cuda:0"] + + +def get_ref_results( + a_fp4, + b_fp4, + a_sf, + b_sf, + a_global_scale, + b_global_scale, + m, + n, + dtype, + block_size, + device, +): + _, m_k = a_fp4.shape + _, n_k = b_fp4.shape + assert m_k == n_k + a_in_dtype = dequantize_nvfp4_to_dtype(a_fp4, + a_sf, + a_global_scale, + dtype=dtype, + device=device, + block_size=block_size) + b_in_dtype = dequantize_nvfp4_to_dtype(b_fp4, + b_sf, + b_global_scale, + dtype=dtype, + device=device, + block_size=block_size) + return torch.matmul(a_in_dtype, b_in_dtype.t()) + + +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("backend", ["cutlass", "trtllm"]) +@pytest.mark.parametrize("autotune", [False, True]) +@torch.inference_mode() +def test_flashinfer_nvfp4_gemm( + dtype: torch.dtype, + shape: tuple[int, int, int], + seed: int, + device: str, + backend: str, + autotune: bool, +) -> None: + if backend == "trtllm" and dtype == torch.float16: + pytest.skip( + "Only torch.bfloat16 is supported for TRTLLM FP4 GEMM operations") + + current_platform.seed_everything(seed) + m, n, packed_k = shape + k = packed_k * 2 + block_size = 16 + a_dtype = torch.randn((m, k), dtype=dtype, device=device) + b_dtype = torch.randn((n, k), dtype=dtype, device=device) + + a_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / + torch.amax(a_dtype.flatten(), dim=-1)).to(torch.float32) + b_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / + torch.amax(b_dtype.flatten(), dim=-1)).to(torch.float32) + alpha = 1.0 / (a_global_scale * b_global_scale) + # ops.scaled_fp4_quant returns swizzled scales, while weights + # from checkpoints are in linear scales. + # So instead of needing to swizzle for cutlass as in modelopt.py, + # we need to unswizzle for trtllm here. + a_fp4, a_scale_interleaved = ops.scaled_fp4_quant(a_dtype, a_global_scale) + b_fp4, b_scale_interleaved = ops.scaled_fp4_quant(b_dtype, b_global_scale) + + # get_ref_results unswizzles the scales internally. + expected_out = get_ref_results( + a_fp4, + b_fp4, + a_scale_interleaved, + b_scale_interleaved, + a_global_scale, + b_global_scale, + m, + n, + dtype, + block_size, + device, + ) + + import flashinfer + + if backend == "trtllm": + epilogue_tile_m = 128 + b_fp4 = flashinfer.shuffle_matrix_a(b_fp4.view(torch.uint8), + epilogue_tile_m) + + b_scale_interleaved = convert_swizzled_to_linear( + b_scale_interleaved, n, k, block_size) + b_scale_interleaved = (flashinfer.shuffle_matrix_sf_a( + b_scale_interleaved.view(torch.uint8), epilogue_tile_m).reshape( + b_scale_interleaved.shape).view(torch.float8_e4m3fn)) + + with flashinfer.autotune(autotune): + out = flashinfer_scaled_fp4_mm( + a_fp4, + b_fp4, + a_scale_interleaved, + b_scale_interleaved, + alpha, + dtype, + backend=backend, + ) + + torch.testing.assert_close(out, + expected_out.to(dtype=dtype), + atol=1e-1, + rtol=1e-1) diff --git a/tests/kernels/quantization/test_nvfp4_scaled_mm.py b/tests/kernels/quantization/test_nvfp4_scaled_mm.py index 0b45c2298175..67e041f2b71c 100644 --- a/tests/kernels/quantization/test_nvfp4_scaled_mm.py +++ b/tests/kernels/quantization/test_nvfp4_scaled_mm.py @@ -65,9 +65,12 @@ def test_nvfp4_gemm( b_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(b_dtype.flatten(), dim=-1)).to(torch.float32) alpha = 1. 
/ (a_global_scale * b_global_scale) + # ops.scaled_fp4_quant returns swizzled scales, while weights + # from checkpoints are in linear scales. a_fp4, a_scale_interleaved = ops.scaled_fp4_quant(a_dtype, a_global_scale) b_fp4, b_scale_interleaved = ops.scaled_fp4_quant(b_dtype, b_global_scale) + # get_ref_results unswizzles the scales internally. expected_out = get_ref_results(a_fp4, b_fp4, a_scale_interleaved, b_scale_interleaved, a_global_scale, b_global_scale, m, n, dtype, block_size, diff --git a/vllm/envs.py b/vllm/envs.py index 110bb542b120..2f0bafa01cc2 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1101,6 +1101,12 @@ def get_vllm_port() -> Optional[int]: "VLLM_USE_TRTLLM_ATTENTION": lambda: os.getenv("VLLM_USE_TRTLLM_ATTENTION", None), + # If set to 1, force the use of TRTLLM FP4 GEMM backend in flashinfer. + # Otherwise, uses the first available of: flashinfer cutlass GEMM, + # vllm cutlass GEMM, marlin GEMM. + "VLLM_USE_TRTLLM_FP4_GEMM": + lambda: bool(int(os.getenv("VLLM_USE_TRTLLM_FP4_GEMM", "0"))), + # Controls garbage collection during CUDA graph capture. # If set to 0 (default), enables GC freezing to speed up capture time. # If set to 1, allows GC to run during capture. @@ -1208,6 +1214,7 @@ def factorize(name: str): "VLLM_DP_SIZE", "VLLM_USE_STANDALONE_COMPILE", "VLLM_FUSED_MOE_CHUNK_SIZE", + "VLLM_USE_TRTLLM_FP4_GEMM", ] for key in environment_variables_to_hash: if key in environment_variables: diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py index 8ba72162921a..63bfe565b121 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py @@ -15,6 +15,7 @@ from vllm.model_executor.parameter import (GroupQuantScaleParameter, ModelWeightParameter, PerTensorScaleParameter) +from vllm.utils.flashinfer import flashinfer_scaled_fp4_mm, has_flashinfer logger = init_logger(__name__) @@ -24,6 +25,13 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme): def __init__(self): + if envs.VLLM_USE_TRTLLM_FP4_GEMM: + assert has_flashinfer(), "TRTLLM FP4 GEMM requires FlashInfer" + self.backend = "flashinfer-trtllm" + elif has_flashinfer(): + self.backend = "flashinfer-cutlass" + else: + self.backend = "cutlass" self.group_size = 16 @classmethod @@ -108,16 +116,36 @@ def process_weights_after_loading(self, layer) -> None: layer.weight_global_scale.max().to(torch.float32), requires_grad=False) - swizzled_weight_scale = self.swizzle_blockscale(layer.weight_scale) - layer.weight_scale_swizzled = Parameter(swizzled_weight_scale, - requires_grad=False) - - # required by cutlass kernel; need Parameter, not ModelWeightParameter - layer.weight = Parameter(layer.weight_packed.data, requires_grad=False) - - layer.alpha = Parameter(layer.input_global_scale * - layer.weight_global_scale, - requires_grad=False) + if self.backend == "flashinfer-trtllm": + # FlashInfer TRTLLM FP4 GEMM requires a different weight layout. + # FlashInfer provides nvfp4_quantize to quantize + shuffle the + # layout but we use our own quantization so we have to call + # shuffles ourselves. 
+ from flashinfer import shuffle_matrix_a, shuffle_matrix_sf_a + + weight = layer.weight_packed.data + weight_scale = layer.weight_scale.data + + epilogue_tile_m = 128 + weight = shuffle_matrix_a(weight.view(torch.uint8), + epilogue_tile_m) + weight_scale = (shuffle_matrix_sf_a(weight_scale.view( + torch.uint8), epilogue_tile_m).reshape( + weight_scale.shape).view(torch.float8_e4m3fn)) + + layer.weight_scale_swizzled = Parameter(weight_scale, + requires_grad=False) + layer.weight_packed = Parameter(weight, requires_grad=False) + else: + swizzled_weight_scale = self.swizzle_blockscale(layer.weight_scale) + layer.weight_scale_swizzled = Parameter(swizzled_weight_scale, + requires_grad=False) + layer.weight_packed = Parameter(layer.weight_packed.data, + requires_grad=False) + + layer.alpha = Parameter( + 1 / (layer.input_global_scale * layer.weight_global_scale), + requires_grad=False) def apply_weights(self, layer: torch.nn.Module, @@ -128,7 +156,7 @@ def apply_weights(self, out = run_nvfp4_emulations( x=x, input_global_scale=layer.input_global_scale, - weight=layer.weight, + weight=layer.weight_packed, weight_scale_swizzled=layer.weight_scale_swizzled, weight_global_scale=layer.weight_global_scale) if bias is not None: @@ -136,14 +164,20 @@ def apply_weights(self, return out output_dtype = x.dtype - output_shape = [x.shape[0], layer.weight.shape[0]] + output_shape = [x.shape[0], layer.weight_packed.shape[0]] # quantize BF16 or FP16 to (FP4 and interleaved block scale) x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_global_scale) - out = cutlass_scaled_fp4_mm(x_fp4, layer.weight, x_blockscale, - layer.weight_scale_swizzled, - 1 / layer.alpha, output_dtype) + mm_args = (x_fp4, layer.weight_packed, x_blockscale, + layer.weight_scale_swizzled, layer.alpha, output_dtype) + if self.backend == "flashinfer-trtllm": + out = flashinfer_scaled_fp4_mm(*mm_args, backend="trtllm") + elif self.backend == "flashinfer-cutlass": + out = flashinfer_scaled_fp4_mm(*mm_args, backend="cutlass") + else: + out = cutlass_scaled_fp4_mm(*mm_args) + if bias is not None: out = out + bias return out.view(*output_shape) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 8868c623796a..8f9ca73bc505 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -38,7 +38,8 @@ PerTensorScaleParameter) from vllm.scalar_type import scalar_types from vllm.utils import next_power_of_2 -from vllm.utils.flashinfer import has_flashinfer_moe +from vllm.utils.flashinfer import (flashinfer_scaled_fp4_mm, has_flashinfer, + has_flashinfer_moe) logger = init_logger(__name__) @@ -724,16 +725,20 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase): def __init__(self, quant_config: ModelOptNvFp4Config) -> None: self.quant_config = quant_config - self.cutlass_nvfp4_supported = cutlass_fp4_supported() - self.use_marlin = False - if not self.cutlass_nvfp4_supported: - if is_fp4_marlin_supported(): - self.use_marlin = True - else: - raise ValueError("Current platform does not support NVFP4" - " quantization. 
Please use Blackwell and" - " above.") + if envs.VLLM_USE_TRTLLM_FP4_GEMM: + assert has_flashinfer(), "TRTLLM FP4 GEMM requires FlashInfer" + self.backend = "flashinfer-trtllm" + elif has_flashinfer(): + self.backend = "flashinfer-cutlass" + elif cutlass_fp4_supported(): + self.backend = "cutlass" + elif is_fp4_marlin_supported(): + self.backend = "marlin" + else: + raise ValueError("Current platform does not support NVFP4" + " quantization. Please use Blackwell and" + " above.") def create_weights( self, @@ -815,17 +820,38 @@ def process_weights_after_loading(self, layer: Module) -> None: # block_size = 16; assert (layer.weight_scale.dtype == torch.float8_e4m3fn), ( "Weight Block scale must be represented as FP8-E4M3") - swizzled_weight_scale = swizzle_blockscale(layer.weight_scale) - layer.weight_scale_swizzled = Parameter(swizzled_weight_scale, - requires_grad=False) - layer.weight = Parameter(layer.weight.data, requires_grad=False) + if self.backend == "flashinfer-trtllm": + # FlashInfer TRTLLM FP4 GEMM requires a different weight layout. + # FlashInfer provides nvfp4_quantize to quantize + shuffle the + # layout but we use our own quantization so we have to call + # shuffles ourselves. + from flashinfer import shuffle_matrix_a, shuffle_matrix_sf_a + + weight = layer.weight.data + weight_scale = layer.weight_scale.data + + epilogue_tile_m = 128 + weight = shuffle_matrix_a(weight.view(torch.uint8), + epilogue_tile_m) + weight_scale = (shuffle_matrix_sf_a(weight_scale.view( + torch.uint8), epilogue_tile_m).reshape( + weight_scale.shape).view(torch.float8_e4m3fn)) + + layer.weight_scale_swizzled = Parameter(weight_scale, + requires_grad=False) + layer.weight = Parameter(weight, requires_grad=False) + else: + swizzled_weight_scale = swizzle_blockscale(layer.weight_scale) + layer.weight_scale_swizzled = Parameter(swizzled_weight_scale, + requires_grad=False) + layer.weight = Parameter(layer.weight.data, requires_grad=False) - if self.use_marlin: - prepare_fp4_layer_for_marlin(layer) - del layer.alpha - del layer.input_scale - del layer.weight_scale_swizzled + if self.backend == "marlin": + prepare_fp4_layer_for_marlin(layer) + del layer.alpha + del layer.input_scale + del layer.weight_scale_swizzled def apply( self, @@ -833,7 +859,7 @@ def apply( x: torch.Tensor, bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: - if self.use_marlin: + if self.backend == "marlin": return apply_fp4_marlin_linear( input=x, weight=layer.weight, @@ -859,9 +885,21 @@ def apply( assert (layer.weight_scale_swizzled.dtype == torch.float8_e4m3fn) assert (layer.alpha.dtype == torch.float32) - out = cutlass_scaled_fp4_mm(x_fp4, layer.weight, x_blockscale, - layer.weight_scale_swizzled, layer.alpha, - output_dtype) + mm_args = ( + x_fp4, + layer.weight, + x_blockscale, + layer.weight_scale_swizzled, + layer.alpha, + output_dtype, + ) + if self.backend == "flashinfer-trtllm": + out = flashinfer_scaled_fp4_mm(*mm_args, backend="trtllm") + elif self.backend == "flashinfer-cutlass": + out = flashinfer_scaled_fp4_mm(*mm_args, backend="cutlass") + else: + out = cutlass_scaled_fp4_mm(*mm_args) + if bias is not None: out = out + bias return out.view(*output_shape) diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py index 10f2dc0252a1..761172e4d361 100644 --- a/vllm/model_executor/warmup/kernel_warmup.py +++ b/vllm/model_executor/warmup/kernel_warmup.py @@ -5,16 +5,53 @@ This is useful specifically for JIT'ed kernels as we don't want JIT'ing to happen during model 
execution. """ +from typing import TYPE_CHECKING + import torch import vllm.envs as envs from vllm.model_executor.warmup.deep_gemm_warmup import deep_gemm_warmup +from vllm.platforms import current_platform from vllm.utils.deep_gemm import is_deep_gemm_supported +from vllm.utils.flashinfer import has_flashinfer + +if TYPE_CHECKING: + from vllm.v1.worker.gpu_model_runner import GPUModelRunner + from vllm.v1.worker.gpu_worker import Worker -def kernel_warmup(model: torch.nn.Module, max_tokens: int): +def kernel_warmup(worker: "Worker"): + # Deep GEMM warmup do_deep_gemm_warmup = (envs.VLLM_USE_DEEP_GEMM and is_deep_gemm_supported() and not envs.VLLM_SKIP_DEEP_GEMM_WARMUP) if do_deep_gemm_warmup: + model = worker.get_model() + max_tokens = worker.scheduler_config.max_num_batched_tokens deep_gemm_warmup(model, max_tokens) + + # FlashInfer autotune for Blackwell (SM 10.0) GPUs + if has_flashinfer() and current_platform.is_device_capability(100): + flashinfer_autotune(worker.model_runner) + + +def flashinfer_autotune(runner: "GPUModelRunner") -> None: + """ + Autotune FlashInfer operations. + FlashInfer have many implementations for the same operation, + autotuning runs benchmarks for each implementation and stores + the results. The results are cached transparently and + future calls to FlashInfer will use the best implementation. + Without autotuning, FlashInfer will rely on heuristics, which may + be significantly slower. + """ + from vllm.utils.flashinfer import autotune + + with torch.inference_mode(), autotune(): + # We skip EPLB here since we don't want to record dummy metrics + # When autotuning with number of tokens m, flashinfer will autotune + # operations for all number of tokens up to m. + # So we only need to run with the max number of tokens. 
+ runner._dummy_run(runner.scheduler_config.max_num_batched_tokens, + skip_eplb=True, + is_profile=True) diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 6b23ed426806..0d7d4b694f07 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -14,6 +14,7 @@ from typing import Any, Callable, NoReturn, Optional import requests +import torch import vllm.envs as envs from vllm.logger import init_logger @@ -193,6 +194,75 @@ def use_trtllm_attention( return use_trtllm +if has_flashinfer(): + + @torch.library.custom_op( + "vllm::flashinfer_mm_fp4", + mutates_args=[], + device_types="cuda", + ) + def flashinfer_mm_fp4( + A: torch.Tensor, + B: torch.Tensor, + A_scale: torch.Tensor, + B_scale: torch.Tensor, + g_scale: torch.Tensor, + dtype: torch.dtype, + backend: str, + ) -> torch.Tensor: + from flashinfer import mm_fp4 as flashinfer_mm_fp4_ + return flashinfer_mm_fp4_(A, + B, + A_scale, + B_scale, + g_scale, + dtype, + block_size=16, + backend=backend) + + @torch.library.register_fake("vllm::flashinfer_mm_fp4", ) + def flashinfer_mm_fp4_fake( + A: torch.Tensor, + B: torch.Tensor, + A_scale: torch.Tensor, + B_scale: torch.Tensor, + g_scale: torch.Tensor, + dtype: torch.dtype, + backend: str, + ) -> torch.Tensor: + return torch.empty(A.shape[0], + B.shape[1], + dtype=dtype, + device=A.device) + + +def flashinfer_scaled_fp4_mm(a: torch.Tensor, b: torch.Tensor, + block_scale_a: torch.Tensor, + block_scale_b: torch.Tensor, alpha: torch.Tensor, + out_dtype: torch.dtype, + backend: str) -> torch.Tensor: + assert a.ndim == 2 and b.ndim == 2 + assert block_scale_a.ndim == 2 and block_scale_b.ndim == 2 + assert a.stride(-1) == 1 and b.stride(-1) == 1 + assert a.shape[1] == b.shape[1] + assert block_scale_a.shape[1] == a.shape[1] // 8 + assert block_scale_b.shape[1] == b.shape[1] // 8 + + if backend == "cutlass": + block_scale_a = block_scale_a.view(torch.uint8) + block_scale_b = block_scale_b.view(torch.uint8) + + return flashinfer_mm_fp4( + a, + b.t(), + block_scale_a, + block_scale_b.t(), + alpha, + out_dtype, + backend=backend, + ) + + __all__ = [ "has_flashinfer", "flashinfer_trtllm_fp8_block_scale_moe", @@ -205,4 +275,5 @@ def use_trtllm_attention( "has_flashinfer_cutlass_fused_moe", "has_nvidia_artifactory", "use_trtllm_attention", + "flashinfer_scaled_fp4_mm", ] diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 0ea23921a080..84f065f25f2e 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -310,6 +310,7 @@ def compile_or_warm_up_model(self) -> None: for size in sorted(warmup_sizes, reverse=True): logger.info("Compile and warming up model for size %d", size) self.model_runner._dummy_run(size, skip_eplb=True) + if not self.model_config.enforce_eager: self.model_runner.capture_model() @@ -340,8 +341,7 @@ def compile_or_warm_up_model(self) -> None: hidden_states=last_hidden_states) # Warmup kernels used during model execution - kernel_warmup(self.get_model(), - max_tokens=self.scheduler_config.max_num_batched_tokens) + kernel_warmup(self) # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. 
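The patch above exposes a single helper, flashinfer_scaled_fp4_mm, that fronts FlashInfer's cutlass and trtllm NVFP4 GEMM backends. Below is a minimal usage sketch modeled on the new unit test, not a definitive recipe: it assumes a Blackwell (SM 10.0) GPU with FlashInfer installed, the shapes and global-scale arithmetic are illustrative, and the fp4/fp8 max constants (6.0 for e2m1, 448.0 for e4m3) are inlined here instead of imported from the test helpers. The trtllm backend additionally expects the weight/scale shuffling shown in the test.

    import torch
    from vllm import _custom_ops as ops
    from vllm.utils.flashinfer import flashinfer_scaled_fp4_mm

    m, n, k = 128, 256, 128          # illustrative sizes; k must be divisible by 16
    dtype, device = torch.bfloat16, "cuda"
    a = torch.randn((m, k), dtype=dtype, device=device)   # activations
    b = torch.randn((n, k), dtype=dtype, device=device)   # weights, k-contiguous

    # Per-tensor global scales, computed the same way as in the test above.
    FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX = 6.0, 448.0
    a_gs = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(a.flatten())).to(torch.float32)
    b_gs = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(b.flatten())).to(torch.float32)
    alpha = 1.0 / (a_gs * b_gs)

    # scaled_fp4_quant packs two fp4 values per byte and returns swizzled block
    # scales, which is the layout the cutlass backend consumes directly.
    a_fp4, a_sf = ops.scaled_fp4_quant(a, a_gs)
    b_fp4, b_sf = ops.scaled_fp4_quant(b, b_gs)

    out = flashinfer_scaled_fp4_mm(a_fp4, b_fp4, a_sf, b_sf, alpha, dtype,
                                   backend="cutlass")
    assert out.shape == (m, n) and out.dtype == dtype

Per the accompanying envs.py change, the flashinfer cutlass path is preferred when available, while setting VLLM_USE_TRTLLM_FP4_GEMM=1 forces the trtllm path; the new flashinfer_autotune warmup lets FlashInfer benchmark its candidate implementations at startup instead of relying on heuristics.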
From 3d2b6c37249250892fc1e3778de8e28a8837237a Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Thu, 14 Aug 2025 17:32:09 -0400 Subject: [PATCH 021/231] [Quantization]: Support compressed-tensors mixed-precision model loading (#22468) Signed-off-by: Dipika Sikka Signed-off-by: Duncan Moss --- .../compressed_tensors/compressed_tensors.py | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 69bced7c0b8e..637a84372990 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -192,7 +192,15 @@ def _quantization_scheme_map_from_config( quant_config.get("weights")) target_scheme_map[target]["input_activations"] = None - if is_activation_quantization_format(quant_format): + target_scheme_map[target]["format"] = quant_config.get( + "format") + format = target_scheme_map[target].get("format") + # If no per-config format defined, use global format in config + act_quant_format = is_activation_quantization_format( + format + ) if format is not None else is_activation_quantization_format( + quant_format) + if act_quant_format: input_activations = quant_config.get("input_activations") # The only case where we have activation quant supported # but no input_activations provided in the config @@ -389,8 +397,10 @@ def _is_wNa16_group_channel(self, weight_quant: BaseModel, return (is_channel_group and input_quant_none and is_static) def _get_scheme_from_parts( - self, weight_quant: BaseModel, - input_quant: BaseModel) -> "CompressedTensorsScheme": + self, + weight_quant: BaseModel, + input_quant: BaseModel, + format: Optional[str] = None) -> "CompressedTensorsScheme": # Detect If Mixed Precision if self._is_fp4a16_nvfp4(weight_quant, input_quant): return CompressedTensorsW4A16Fp4() @@ -412,7 +422,11 @@ def _get_scheme_from_parts( group_size=weight_quant.group_size, actorder=weight_quant.actorder) - if is_activation_quantization_format(self.quant_format): + act_quant_format = is_activation_quantization_format( + format + ) if format is not None else is_activation_quantization_format( + self.quant_format) + if act_quant_format: if self._is_fp4a4_nvfp4(weight_quant, input_quant): if cutlass_fp4_supported( ) or envs.VLLM_USE_NVFP4_CT_EMULATIONS: @@ -507,6 +521,7 @@ def get_scheme(self, scheme_dict = self.target_scheme_map[matched_target] weight_quant = scheme_dict.get("weights") input_quant = scheme_dict.get("input_activations") + format = scheme_dict.get("format") # Find the sparsity scheme of the layer # assume that fused layers inerhit first component's sparsity scheme @@ -547,7 +562,7 @@ def get_scheme(self, scheme = self._get_scheme_from_parts( # type: ignore weight_quant=weight_quant, input_quant=input_quant, - ) + format=format) # Raise error if device does not support the scheme # (e.g. 
fp8 needs ada lovelace) From ecf34ca83aaf9f98fc02831472e6922a593ef62d Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 14 Aug 2025 14:49:02 -0700 Subject: [PATCH 022/231] [Core] Return final response for aborted requests from `AsyncLLM.generate` (#22283) Signed-off-by: Nick Hill Signed-off-by: Duncan Moss --- tests/v1/engine/test_async_llm.py | 87 ++++++++++++++++++++++++++++++ vllm/v1/engine/output_processor.py | 33 +++++++----- 2 files changed, 107 insertions(+), 13 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 21694491dd73..484640233f52 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -13,6 +13,7 @@ from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.inputs import PromptType +from vllm.outputs import RequestOutput from vllm.platforms import current_platform from vllm.sampling_params import RequestOutputKind from vllm.utils import set_default_torch_num_threads @@ -398,3 +399,89 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch): # Test 3: Verify healthy engine still works after mock await engine.check_health() + + +@pytest.mark.parametrize( + "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) +@pytest.mark.asyncio +async def test_abort_final_output( + monkeypatch: pytest.MonkeyPatch, + output_kind: RequestOutputKind, +): + """Test that abort() returns a final output with correct information.""" + + with monkeypatch.context() as m, ExitStack() as after: + m.setenv("VLLM_USE_V1", "1") + + with set_default_torch_num_threads(1): + engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) + after.callback(engine.shutdown) + + request_id = "test-abort-final-output" + + # Start a long-running request + sampling_params = SamplingParams( + max_tokens=3000, # Long enough to allow abort + ignore_eos=True, + output_kind=output_kind, + temperature=0.5, + seed=42, + ) + + outputs: list[RequestOutput] = [] + generated = asyncio.create_task( + collect_outputs(engine, request_id, TEXT_PROMPT, sampling_params, + outputs)) + + # Let it generate some tokens + await asyncio.sleep(0.5) + + # Abort the request + await engine.abort(request_id) + + # Wait for generation to complete and return final output + final_output = await generated + + # Verify we got a final output + assert final_output is not None + assert final_output.finished + assert len(final_output.outputs) == 1 + + assert final_output.outputs[0].finish_reason == "abort" + assert final_output.outputs[0].stop_reason is None + + # Verify num_cached_tokens is set correctly + assert hasattr(final_output, 'num_cached_tokens') + assert final_output.num_cached_tokens >= 0 + + # If we got intermediate outputs, verify they are consistent + if output_kind == RequestOutputKind.DELTA: + # For DELTA, sum all intermediate tokens should <= final tokens + token_count = sum( + len(output.outputs[0].token_ids) for output in outputs) + assert token_count > 0 + assert len(final_output.outputs[0].token_ids) == 0 + else: + # For FINAL_ONLY, we should only get the final output + assert len(outputs) == 0 + assert len(final_output.outputs[0].token_ids) > 0 + + assert not engine.output_processor.has_unfinished_requests() + + +async def collect_outputs( + engine: AsyncLLM, + request_id: str, + prompt: PromptType, + sampling_params: SamplingParams, + outputs_list: list[RequestOutput], +) -> Optional[RequestOutput]: + """Helper to collect outputs and return the final one.""" + final_output: 
Optional[RequestOutput] = None + async for output in engine.generate(request_id=request_id, + prompt=prompt, + sampling_params=sampling_params): + if not output.finished: + outputs_list.append(output) + final_output = output + return final_output diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 3be6c4821214..2ee55b585da6 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -107,6 +107,7 @@ def __init__( self.max_tokens_param = max_tokens_param self.is_prefilling = True self.queue = queue + self.num_cached_tokens = 0 self.stats = RequestStateStats( arrival_time=arrival_time) if log_stats else None @@ -167,7 +168,6 @@ def make_request_output( finish_reason: Optional[FinishReason], stop_reason: Union[int, str, None], kv_transfer_params: Optional[dict[str, Any]] = None, - num_cached_tokens: int = 0, ) -> Optional[Union[RequestOutput, PoolingRequestOutput]]: finished = finish_reason is not None @@ -195,7 +195,7 @@ def make_request_output( return None return self._new_request_output(request_id, outputs, finished, - kv_transfer_params, num_cached_tokens) + kv_transfer_params) def _new_request_output( self, @@ -203,14 +203,14 @@ def _new_request_output( outputs: Union[list[CompletionOutput], list[PoolingOutput]], finished: bool, kv_transfer_params: Optional[dict[str, Any]] = None, - num_cached_tokens: int = 0, ) -> Union[RequestOutput, PoolingRequestOutput]: - if isinstance(outputs[0], PoolingOutput): + first_output = outputs[0] + if isinstance(first_output, PoolingOutput): assert len(outputs) == 1 return PoolingRequestOutput( request_id=request_id, - outputs=outputs[0], + outputs=first_output, prompt_token_ids=self.prompt_token_ids, finished=finished, ) @@ -229,7 +229,7 @@ def _new_request_output( outputs=cast(list[CompletionOutput], outputs), finished=finished, kv_transfer_params=kv_transfer_params, - num_cached_tokens=num_cached_tokens, + num_cached_tokens=self.num_cached_tokens, ) def _new_completion_output( @@ -308,11 +308,18 @@ def abort_requests( if req_state is not None: self.lora_states.abort_request(req_state) request_ids_to_abort.append(request_id) - else: - parent = self.parent_requests.pop(request_id, None) - if parent and parent.child_requests: - self.abort_requests(parent.child_requests) - request_ids_to_abort.extend(parent.child_requests) + # Produce final abort output. + if req_state.queue is not None and ( + request_output := req_state.make_request_output( + [], None, FinishReason.ABORT, None, None)): + req_state.queue.put(request_output) + elif parent := self.parent_requests.get(request_id): + # Abort children prior to removing the parent. + if parent.child_requests: + child_reqs = list(parent.child_requests) + child_reqs = self.abort_requests(child_reqs) + request_ids_to_abort.extend(child_reqs) + self.parent_requests.pop(request_id, None) return request_ids_to_abort def add_request( @@ -390,7 +397,7 @@ def process_outputs( finish_reason = engine_core_output.finish_reason stop_reason = engine_core_output.stop_reason kv_transfer_params = engine_core_output.kv_transfer_params - num_cached_tokens = engine_core_output.num_cached_tokens + req_state.num_cached_tokens = engine_core_output.num_cached_tokens req_state.is_prefilling = False if pooling_output is None: @@ -411,7 +418,7 @@ def process_outputs( # 4) Create and handle RequestOutput objects. 
if request_output := req_state.make_request_output( new_token_ids, pooling_output, finish_reason, stop_reason, - kv_transfer_params, num_cached_tokens): + kv_transfer_params): if req_state.queue is not None: # AsyncLLM: put into queue for handling by generate(). req_state.queue.put(request_output) From e27479b8a71295f5129ede9be46ef8abb962f731 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 14 Aug 2025 15:20:28 -0700 Subject: [PATCH 023/231] [BugFix] Fix initial DP request load imbalance (#22910) Signed-off-by: Nick Hill Signed-off-by: Duncan Moss --- vllm/v1/engine/core_client.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 5ffa555570a2..29ee0a9dfb1e 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -965,7 +965,7 @@ def __init__(self, # List of [waiting, running] pair per engine. # Used only by DPLBAsyncMPClient subclass. - self.lb_engines: list[list[int]] = [] + self.lb_engines: list[list[int]] = [[0, 0] for _ in self.core_engines] self.first_req_sock_addr = get_open_zmq_inproc_path() self.first_req_send_socket = self.resources.first_req_send_socket = ( @@ -1121,10 +1121,8 @@ def __init__(self, def get_core_engine_for_request( self, request: EngineCoreRequest) -> EngineIdentity: # Engines are in rank order. - current_counts = self.lb_engines if (eng_index := request.data_parallel_rank) is None: - if not current_counts: - return self.core_engine + current_counts = self.lb_engines # TODO use P2C alg for larger DP sizes num_engines = len(current_counts) min_score = sys.maxsize From b1ae1e2227105b88dcbcf17dae28d880040a3612 Mon Sep 17 00:00:00 2001 From: Yongye Zhu Date: Thu, 14 Aug 2025 19:37:22 -0400 Subject: [PATCH 024/231] [Bugfix] use flash attn on sm90 (#22933) Signed-off-by: Yongye Zhu Co-authored-by: Michael Goin Signed-off-by: Duncan Moss --- vllm/platforms/cuda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 63f6b373c322..483d5e1531a9 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -316,7 +316,7 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype, # FlashAttention is the default for SM 8.0+ GPUs if cls.has_device_capability(80): - if has_sink: + if has_sink and not cls.is_device_capability(90): logger.info_once("Using Triton backend on V1 engine.") return TRITON_ATTN_VLLM_V1 if is_default_backend_supported := is_attn_backend_supported( From b17cb00279b4530918d84312a7483aa1e38404a9 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 15 Aug 2025 08:21:29 +0800 Subject: [PATCH 025/231] [Kernel] Add cuda kernel for gpt_oss activation (#22538) Signed-off-by: Jee Jee Li Signed-off-by: Duncan Moss --- csrc/activation_kernels.cu | 59 +++++++++++++++++++ csrc/ops.h | 2 + csrc/torch_bindings.cpp | 5 ++ tests/kernels/core/test_activation.py | 45 ++++++++++++-- vllm/model_executor/layers/activation.py | 41 ++++++++++++- .../layers/fused_moe/fused_moe.py | 18 ++---- .../layers/quantization/utils/mxfp4_utils.py | 2 +- vllm/model_executor/models/gpt_oss.py | 2 +- 8 files changed, 150 insertions(+), 24 deletions(-) diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 55e659679701..a4a880f13cf7 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -128,6 +128,45 @@ __global__ void act_and_mul_kernel_with_param( } } +template +__device__ __forceinline__ T swigluoai_and_mul(const T& gate, const T& up, + float alpha, 
float limit) { + // clamp gate: min=None, max=limit + const float gate_f = (float)gate; + const float clamped_gate = gate_f > limit ? limit : gate_f; + + // clamp up: min=-limit, max=limit + const float up_f = (float)up; + const float clamped_up = + up_f > limit ? limit : (up_f < -limit ? -limit : up_f); + + // glu = gate * sigmoid(gate * alpha) + const float sigmoid_val = 1.0f / (1.0f + expf(-clamped_gate * alpha)); + const float glu = clamped_gate * sigmoid_val; + + // (up + 1) * glu + return (T)((clamped_up + 1.0f) * glu); +} + +template +__global__ void swigluoai_and_mul_kernel( + scalar_t* __restrict__ out, // [..., d] + const scalar_t* __restrict__ input, // [..., 2, d] + const int d, const float alpha, const float limit) { + const int64_t token_idx = blockIdx.x; + // TODO: Vectorize loads and stores. + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + // gate = x[..., ::2] (even indices) + const scalar_t gate = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx]); + // up = x[..., 1::2] (odd indices) + const scalar_t up = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx + 1]); + + out[token_idx * d + idx] = ACT_FN(gate, up, alpha, limit); + } +} + } // namespace vllm #define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \ @@ -145,11 +184,31 @@ __global__ void act_and_mul_kernel_with_param( PARAM); \ }); +#define LAUNCH_SIGLUOAI_AND_MUL(KERNEL, ALPHA, LIMIT) \ + int d = input.size(-1) / 2; \ + int64_t num_tokens = input.numel() / input.size(-1); \ + dim3 grid(num_tokens); \ + dim3 block(std::min(d, 1024)); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "clamp_swiglu_kernel_with_params", [&] { \ + vllm::swigluoai_and_mul_kernel> \ + <<>>(out.data_ptr(), \ + input.data_ptr(), d, ALPHA, \ + LIMIT); \ + }); + void fatrelu_and_mul(torch::Tensor& out, // [..., d], torch::Tensor& input, // [..., 2 * d] double threshold) { LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold); } +void swigluoai_and_mul(torch::Tensor& out, // [..., d] + torch::Tensor& input, // [..., 2 * d] + double alpha, double limit) { + LAUNCH_SIGLUOAI_AND_MUL(vllm::swigluoai_and_mul, alpha, limit); +} namespace vllm { // Element-wise activation kernel template. diff --git a/csrc/ops.h b/csrc/ops.h index 207291eceb16..8b41b95473a1 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -138,6 +138,8 @@ void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input); void fatrelu_and_mul(torch::Tensor& out, torch::Tensor& input, double threshold); +void swigluoai_and_mul(torch::Tensor& out, torch::Tensor& input, + double alpha = 1.702, double limit = 7.0); void gelu_new(torch::Tensor& out, torch::Tensor& input); diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 8c207be083d8..41e9bc8a5e01 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -130,6 +130,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("fatrelu_and_mul(Tensor! out, Tensor input, float threshold) -> ()"); ops.impl("fatrelu_and_mul", torch::kCUDA, &fatrelu_and_mul); + ops.def( + "swigluoai_and_mul(Tensor! out, Tensor input, float alpha, float limit) " + "-> ()"); + ops.impl("swigluoai_and_mul", torch::kCUDA, &swigluoai_and_mul); + // GELU implementation used in GPT-2. ops.def("gelu_new(Tensor! 
out, Tensor input) -> ()"); ops.impl("gelu_new", torch::kCUDA, &gelu_new); diff --git a/tests/kernels/core/test_activation.py b/tests/kernels/core/test_activation.py index 29c5e70a8ba8..ec5c60fd7b0e 100644 --- a/tests/kernels/core/test_activation.py +++ b/tests/kernels/core/test_activation.py @@ -11,7 +11,7 @@ from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul, GeluAndMul, MulAndSilu, NewGELU, QuickGELU, - SiluAndMul) + SiluAndMul, SwigluOAIAndMul) from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] @@ -25,7 +25,15 @@ @pytest.mark.parametrize( "activation", - ["silu_and_mul", "mul_and_silu", "gelu", "gelu_tanh", "fatrelu"]) + [ + "silu_and_mul", + "mul_and_silu", + "gelu", + "gelu_tanh", + "fatrelu", + "swigluoai_and_mul", + ], +) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @@ -59,18 +67,43 @@ def test_act_and_mul( threshold = random.uniform(0, 1) layer = FatreluAndMul(threshold) fn = torch.ops._C.fatrelu_and_mul + elif activation == "swigluoai_and_mul": + layer = SwigluOAIAndMul() + fn = torch.ops._C.swigluoai_and_mul out = layer(x) ref_out = layer.forward_native(x) - # The SiluAndMul, MulAndSilu, GELU and FatReLU implementations are - # equivalent to the native PyTorch implementations, so we can do exact - # comparison. - torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0) + if activation == "swigluoai_and_mul": + + rtol = { + #For fp16, change the relative tolerance from 1e-3 to 2e-3 + torch.float16: + 2e-3, + torch.bfloat16: + 2e-2, + torch.float: + 1.3e-6 + } + + def _get_rtol(output) -> float: + return rtol[output.dtype] + + torch.testing.assert_close(out, + ref_out, + atol=get_default_atol(out), + rtol=_get_rtol(out)) + else: + # The SiluAndMul, MulAndSilu, GELU and FatReLU implementations are + # equivalent to the native PyTorch implementations, so we can do exact + # comparison. 
+ torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0) d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) if activation == "fatrelu": opcheck(fn, (out, x, threshold)) + elif activation == "swigluoai_and_mul": + opcheck(fn, (out, x, layer.alpha, layer.limit)) else: opcheck(fn, (out, x)) diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 7ce44174ead6..5f89dadec8b8 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -239,6 +239,35 @@ def extra_repr(self) -> str: return f'approximate={repr(self.approximate)}' +@CustomOp.register("swigluoai_and_mul") +class SwigluOAIAndMul(CustomOp): + # https://github.com/huggingface/transformers/blob/v4.55.0/src/transformers/models/gpt_oss/modeling_gpt_oss.py#L106-L110 + def __init__(self, alpha: float = 1.702, limit: float = 7.0): + super().__init__() + self.alpha = alpha + self.limit = limit + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + + gate, up = x[..., ::2], x[..., 1::2] + gate = gate.clamp(min=None, max=self.limit) + up = up.clamp(min=-self.limit, max=self.limit) + glu = gate * torch.sigmoid(gate * self.alpha) + gated_output = (up + 1) * glu + return gated_output + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + torch.ops._C.swigluoai_and_mul(out, x, self.alpha, self.limit) + return out + + def extra_repr(self) -> str: + return f"alpha={repr(self.alpha)}, limit={repr(self.limit)}" + + @CustomOp.register("gelu_new") class NewGELU(CustomOp): @@ -330,6 +359,7 @@ def forward_native(self, x: torch.Tensor) -> torch.Tensor: return torch.square(F.relu(x)) def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + #TODO : implement cuda kenrels return self.forward_native(x) @@ -406,9 +436,14 @@ def get_act_fn(act_fn_name: str) -> nn.Module: _ACTIVATION_AND_MUL_REGISTRY = LazyDict({ - "gelu": lambda: GeluAndMul(), - "silu": lambda: SiluAndMul(), - "geglu": lambda: GeluAndMul(), + "gelu": + lambda: GeluAndMul(), + "silu": + lambda: SiluAndMul(), + "geglu": + lambda: GeluAndMul(), + "swigluoai_and_mul": + lambda *args, **kwargs: SwigluOAIAndMul(*args, **kwargs), }) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 98087a35e15c..23ebad36daf2 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1633,17 +1633,6 @@ def fused_experts_impl( block_shape=block_shape, B_bias=w1_bias) - # TODO fused kernel - def swiglu_oai(gate_up): - alpha = 1.702 - limit = 7.0 - gate, up = gate_up[..., ::2], gate_up[..., 1::2] - gate = gate.clamp(min=None, max=limit) - up = up.clamp(min=-limit, max=limit) - glu = gate * torch.sigmoid(gate * alpha) - gated_output = (up + 1) * glu - return gated_output - # Activation function with multiplication if activation == "silu" and is_act_and_mul: torch.ops._C.silu_and_mul(intermediate_cache2, @@ -1651,13 +1640,16 @@ def swiglu_oai(gate_up): elif activation == "gelu" and is_act_and_mul: torch.ops._C.gelu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) + elif activation == "swigluoai" and is_act_and_mul: + # alpha = 1.702, limit = 7.0 + torch.ops._C.swigluoai_and_mul(intermediate_cache2, + 
intermediate_cache1.view(-1, N)) # Activation function without multiplication elif activation == "silu": intermediate_cache2 = F.silu(intermediate_cache1.view(-1, N)) elif activation == "gelu": intermediate_cache2 = F.gelu(intermediate_cache1.view(-1, N)) - elif activation == "swiglu_oai": - intermediate_cache2 = swiglu_oai(intermediate_cache1.view(-1, N)) + else: raise ValueError(f"Unsupported FusedMoe activation: {activation}, " f"with is_act_and_mul={is_act_and_mul}.") diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index deeb69bcad0e..dca38a019e9b 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -68,7 +68,7 @@ def _can_support_mxfp4(use_grouped_topk: bool = False, return not (use_grouped_topk or topk_group or num_expert_group or expert_map or custom_routing_function or e_score_correction_bias or apply_router_weight_on_input - or scoring_func != "softmax" or activation != "swiglu_oai" + or scoring_func != "softmax" or activation != "swigluoai" or expert_load_view or logical_to_physical_map or logical_replica_count) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 7c7712dbe106..2f5d9ddd9054 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -159,7 +159,7 @@ def __init__( prefix=f"{prefix}.experts", apply_router_weight_on_input=False, has_bias=True, - activation="swiglu_oai") + activation="swigluoai") def forward(self, x: torch.Tensor) -> torch.Tensor: t = self.norm(x) From 3c2693d435ffb056eef21d302f115403b1743603 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Thu, 14 Aug 2025 17:38:10 -0700 Subject: [PATCH 026/231] Revert "[Kernel] Add cuda kernel for gpt_oss activation" (#22948) Signed-off-by: Duncan Moss --- csrc/activation_kernels.cu | 59 ------------------- csrc/ops.h | 2 - csrc/torch_bindings.cpp | 5 -- tests/kernels/core/test_activation.py | 45 ++------------ vllm/model_executor/layers/activation.py | 41 +------------ .../layers/fused_moe/fused_moe.py | 18 ++++-- .../layers/quantization/utils/mxfp4_utils.py | 2 +- vllm/model_executor/models/gpt_oss.py | 2 +- 8 files changed, 24 insertions(+), 150 deletions(-) diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index a4a880f13cf7..55e659679701 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -128,45 +128,6 @@ __global__ void act_and_mul_kernel_with_param( } } -template -__device__ __forceinline__ T swigluoai_and_mul(const T& gate, const T& up, - float alpha, float limit) { - // clamp gate: min=None, max=limit - const float gate_f = (float)gate; - const float clamped_gate = gate_f > limit ? limit : gate_f; - - // clamp up: min=-limit, max=limit - const float up_f = (float)up; - const float clamped_up = - up_f > limit ? limit : (up_f < -limit ? -limit : up_f); - - // glu = gate * sigmoid(gate * alpha) - const float sigmoid_val = 1.0f / (1.0f + expf(-clamped_gate * alpha)); - const float glu = clamped_gate * sigmoid_val; - - // (up + 1) * glu - return (T)((clamped_up + 1.0f) * glu); -} - -template -__global__ void swigluoai_and_mul_kernel( - scalar_t* __restrict__ out, // [..., d] - const scalar_t* __restrict__ input, // [..., 2, d] - const int d, const float alpha, const float limit) { - const int64_t token_idx = blockIdx.x; - // TODO: Vectorize loads and stores. 
- for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { - // gate = x[..., ::2] (even indices) - const scalar_t gate = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx]); - // up = x[..., 1::2] (odd indices) - const scalar_t up = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx + 1]); - - out[token_idx * d + idx] = ACT_FN(gate, up, alpha, limit); - } -} - } // namespace vllm #define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \ @@ -184,31 +145,11 @@ __global__ void swigluoai_and_mul_kernel( PARAM); \ }); -#define LAUNCH_SIGLUOAI_AND_MUL(KERNEL, ALPHA, LIMIT) \ - int d = input.size(-1) / 2; \ - int64_t num_tokens = input.numel() / input.size(-1); \ - dim3 grid(num_tokens); \ - dim3 block(std::min(d, 1024)); \ - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ - VLLM_DISPATCH_FLOATING_TYPES( \ - input.scalar_type(), "clamp_swiglu_kernel_with_params", [&] { \ - vllm::swigluoai_and_mul_kernel> \ - <<>>(out.data_ptr(), \ - input.data_ptr(), d, ALPHA, \ - LIMIT); \ - }); - void fatrelu_and_mul(torch::Tensor& out, // [..., d], torch::Tensor& input, // [..., 2 * d] double threshold) { LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold); } -void swigluoai_and_mul(torch::Tensor& out, // [..., d] - torch::Tensor& input, // [..., 2 * d] - double alpha, double limit) { - LAUNCH_SIGLUOAI_AND_MUL(vllm::swigluoai_and_mul, alpha, limit); -} namespace vllm { // Element-wise activation kernel template. diff --git a/csrc/ops.h b/csrc/ops.h index 8b41b95473a1..207291eceb16 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -138,8 +138,6 @@ void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input); void fatrelu_and_mul(torch::Tensor& out, torch::Tensor& input, double threshold); -void swigluoai_and_mul(torch::Tensor& out, torch::Tensor& input, - double alpha = 1.702, double limit = 7.0); void gelu_new(torch::Tensor& out, torch::Tensor& input); diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 41e9bc8a5e01..8c207be083d8 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -130,11 +130,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("fatrelu_and_mul(Tensor! out, Tensor input, float threshold) -> ()"); ops.impl("fatrelu_and_mul", torch::kCUDA, &fatrelu_and_mul); - ops.def( - "swigluoai_and_mul(Tensor! out, Tensor input, float alpha, float limit) " - "-> ()"); - ops.impl("swigluoai_and_mul", torch::kCUDA, &swigluoai_and_mul); - // GELU implementation used in GPT-2. ops.def("gelu_new(Tensor! 
out, Tensor input) -> ()"); ops.impl("gelu_new", torch::kCUDA, &gelu_new); diff --git a/tests/kernels/core/test_activation.py b/tests/kernels/core/test_activation.py index ec5c60fd7b0e..29c5e70a8ba8 100644 --- a/tests/kernels/core/test_activation.py +++ b/tests/kernels/core/test_activation.py @@ -11,7 +11,7 @@ from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul, GeluAndMul, MulAndSilu, NewGELU, QuickGELU, - SiluAndMul, SwigluOAIAndMul) + SiluAndMul) from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] @@ -25,15 +25,7 @@ @pytest.mark.parametrize( "activation", - [ - "silu_and_mul", - "mul_and_silu", - "gelu", - "gelu_tanh", - "fatrelu", - "swigluoai_and_mul", - ], -) + ["silu_and_mul", "mul_and_silu", "gelu", "gelu_tanh", "fatrelu"]) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @@ -67,43 +59,18 @@ def test_act_and_mul( threshold = random.uniform(0, 1) layer = FatreluAndMul(threshold) fn = torch.ops._C.fatrelu_and_mul - elif activation == "swigluoai_and_mul": - layer = SwigluOAIAndMul() - fn = torch.ops._C.swigluoai_and_mul out = layer(x) ref_out = layer.forward_native(x) - if activation == "swigluoai_and_mul": - - rtol = { - #For fp16, change the relative tolerance from 1e-3 to 2e-3 - torch.float16: - 2e-3, - torch.bfloat16: - 2e-2, - torch.float: - 1.3e-6 - } - - def _get_rtol(output) -> float: - return rtol[output.dtype] - - torch.testing.assert_close(out, - ref_out, - atol=get_default_atol(out), - rtol=_get_rtol(out)) - else: - # The SiluAndMul, MulAndSilu, GELU and FatReLU implementations are - # equivalent to the native PyTorch implementations, so we can do exact - # comparison. - torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0) + # The SiluAndMul, MulAndSilu, GELU and FatReLU implementations are + # equivalent to the native PyTorch implementations, so we can do exact + # comparison. 
+ torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0) d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) if activation == "fatrelu": opcheck(fn, (out, x, threshold)) - elif activation == "swigluoai_and_mul": - opcheck(fn, (out, x, layer.alpha, layer.limit)) else: opcheck(fn, (out, x)) diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 5f89dadec8b8..7ce44174ead6 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -239,35 +239,6 @@ def extra_repr(self) -> str: return f'approximate={repr(self.approximate)}' -@CustomOp.register("swigluoai_and_mul") -class SwigluOAIAndMul(CustomOp): - # https://github.com/huggingface/transformers/blob/v4.55.0/src/transformers/models/gpt_oss/modeling_gpt_oss.py#L106-L110 - def __init__(self, alpha: float = 1.702, limit: float = 7.0): - super().__init__() - self.alpha = alpha - self.limit = limit - - def forward_native(self, x: torch.Tensor) -> torch.Tensor: - """PyTorch-native implementation equivalent to forward().""" - - gate, up = x[..., ::2], x[..., 1::2] - gate = gate.clamp(min=None, max=self.limit) - up = up.clamp(min=-self.limit, max=self.limit) - glu = gate * torch.sigmoid(gate * self.alpha) - gated_output = (up + 1) * glu - return gated_output - - def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - d = x.shape[-1] // 2 - output_shape = (x.shape[:-1] + (d, )) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - torch.ops._C.swigluoai_and_mul(out, x, self.alpha, self.limit) - return out - - def extra_repr(self) -> str: - return f"alpha={repr(self.alpha)}, limit={repr(self.limit)}" - - @CustomOp.register("gelu_new") class NewGELU(CustomOp): @@ -359,7 +330,6 @@ def forward_native(self, x: torch.Tensor) -> torch.Tensor: return torch.square(F.relu(x)) def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - #TODO : implement cuda kenrels return self.forward_native(x) @@ -436,14 +406,9 @@ def get_act_fn(act_fn_name: str) -> nn.Module: _ACTIVATION_AND_MUL_REGISTRY = LazyDict({ - "gelu": - lambda: GeluAndMul(), - "silu": - lambda: SiluAndMul(), - "geglu": - lambda: GeluAndMul(), - "swigluoai_and_mul": - lambda *args, **kwargs: SwigluOAIAndMul(*args, **kwargs), + "gelu": lambda: GeluAndMul(), + "silu": lambda: SiluAndMul(), + "geglu": lambda: GeluAndMul(), }) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 23ebad36daf2..98087a35e15c 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1633,6 +1633,17 @@ def fused_experts_impl( block_shape=block_shape, B_bias=w1_bias) + # TODO fused kernel + def swiglu_oai(gate_up): + alpha = 1.702 + limit = 7.0 + gate, up = gate_up[..., ::2], gate_up[..., 1::2] + gate = gate.clamp(min=None, max=limit) + up = up.clamp(min=-limit, max=limit) + glu = gate * torch.sigmoid(gate * alpha) + gated_output = (up + 1) * glu + return gated_output + # Activation function with multiplication if activation == "silu" and is_act_and_mul: torch.ops._C.silu_and_mul(intermediate_cache2, @@ -1640,16 +1651,13 @@ def fused_experts_impl( elif activation == "gelu" and is_act_and_mul: torch.ops._C.gelu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) - elif activation == "swigluoai" and is_act_and_mul: - # alpha = 1.702, limit = 7.0 - torch.ops._C.swigluoai_and_mul(intermediate_cache2, - 
intermediate_cache1.view(-1, N)) # Activation function without multiplication elif activation == "silu": intermediate_cache2 = F.silu(intermediate_cache1.view(-1, N)) elif activation == "gelu": intermediate_cache2 = F.gelu(intermediate_cache1.view(-1, N)) - + elif activation == "swiglu_oai": + intermediate_cache2 = swiglu_oai(intermediate_cache1.view(-1, N)) else: raise ValueError(f"Unsupported FusedMoe activation: {activation}, " f"with is_act_and_mul={is_act_and_mul}.") diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index dca38a019e9b..deeb69bcad0e 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -68,7 +68,7 @@ def _can_support_mxfp4(use_grouped_topk: bool = False, return not (use_grouped_topk or topk_group or num_expert_group or expert_map or custom_routing_function or e_score_correction_bias or apply_router_weight_on_input - or scoring_func != "softmax" or activation != "swigluoai" + or scoring_func != "softmax" or activation != "swiglu_oai" or expert_load_view or logical_to_physical_map or logical_replica_count) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 2f5d9ddd9054..7c7712dbe106 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -159,7 +159,7 @@ def __init__( prefix=f"{prefix}.experts", apply_router_weight_on_input=False, has_bias=True, - activation="swigluoai") + activation="swiglu_oai") def forward(self, x: torch.Tensor) -> torch.Tensor: t = self.norm(x) From 496e3feb641aec9e53adbf7b1f1bbf1f179f7393 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 14 Aug 2025 18:39:43 -0700 Subject: [PATCH 027/231] [BugFix][KVConn] Fix use of `get_required_kvcache_layout` (#22734) Signed-off-by: Nick Hill Signed-off-by: Duncan Moss --- vllm/distributed/kv_transfer/kv_connector/v1/base.py | 4 ++++ .../kv_transfer/kv_connector/v1/multi_connector.py | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index b72104397822..07fcdecac627 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -325,4 +325,8 @@ def get_required_kvcache_layout( str: the required KV cache layout. e.g. HND, or NHD. None if the connector does not require a specific layout. 
""" + + if cls is KVConnectorBase_V1: + raise TypeError("get_required_kvcache_layout should not be called " + "on the abstract base class") return None diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index 7d67c76e2f05..d3f6a226dc72 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -228,9 +228,10 @@ def get_required_kvcache_layout( for ktc in ktcs: kv_transfer_config = KVTransferConfig(**ktc) temp_vllm_config.kv_transfer_config = kv_transfer_config + connector_cls = KVConnectorFactory.get_connector_class( + kv_transfer_config) required_kvcache_layout = ( - KVConnectorBase_V1.get_required_kvcache_layout( - temp_vllm_config)) + connector_cls.get_required_kvcache_layout(temp_vllm_config)) if required_kvcache_layout is not None: layouts.add(required_kvcache_layout) From 465686cdb6edb5f4c30836c943c9b99f2094c3ea Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 14 Aug 2025 20:17:11 -0700 Subject: [PATCH 028/231] [BugFix] Fix port lookup in internal DP LB tests (#22252) Signed-off-by: Nick Hill Signed-off-by: Duncan Moss --- tests/v1/test_internal_lb_dp.py | 54 +++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/tests/v1/test_internal_lb_dp.py b/tests/v1/test_internal_lb_dp.py index ca80d3a4949d..2b031865cad7 100644 --- a/tests/v1/test_internal_lb_dp.py +++ b/tests/v1/test_internal_lb_dp.py @@ -4,6 +4,8 @@ import os import threading import time +import traceback +from typing import Optional, cast import openai # use the official client for correctness check import pytest @@ -41,12 +43,15 @@ def __init__(self, self.tp_size = tp_size self.api_server_count = api_server_count self.base_server_args = base_server_args - self.servers: list[tuple[RemoteOpenAIServer, list[str]]] = [] + self.servers: list[Optional[tuple[RemoteOpenAIServer, + list[str]]]] = [None] * (dp_size // + dp_per_node) self.server_threads: list[threading.Thread] = [] def __enter__(self) -> list[tuple[RemoteOpenAIServer, list[str]]]: """Start all server instances for multi-node internal LB mode.""" - for rank in range(0, self.dp_size, self.dp_per_node): + for server_idx, rank in enumerate( + range(0, self.dp_size, self.dp_per_node)): # Create server args for this specific rank server_args = self.base_server_args.copy() @@ -87,7 +92,7 @@ def __enter__(self) -> list[tuple[RemoteOpenAIServer, list[str]]]: ]) # Use a thread to start each server to allow parallel initialization - def start_server(r: int, sargs: list[str]): + def start_server(sidx: int, r: int, sargs: list[str]): gpus_per_node = self.tp_size * self.dp_per_node try: # Start the server @@ -110,13 +115,14 @@ def start_server(r: int, sargs: list[str]): f"{self.api_server_count} API servers") else: print(f"Headless node (rank {r}) started successfully") - self.servers.append((server, sargs)) + self.servers[sidx] = (server, sargs) except Exception as e: print(f"Failed to start server rank {r}: {e}") + traceback.print_exc() raise thread = threading.Thread(target=start_server, - args=(rank, server_args)) + args=(server_idx, rank, server_args)) thread.start() self.server_threads.append(thread) @@ -128,18 +134,20 @@ def start_server(r: int, sargs: list[str]): # Give servers additional time to fully initialize and coordinate time.sleep(3) - if len(self.servers) != self.dp_size // self.dp_per_node: + if not all(self.servers): raise 
Exception("Servers failed to start") - return self.servers + return cast(list[tuple[RemoteOpenAIServer, list[str]]], self.servers) def __exit__(self, exc_type, exc_val, exc_tb): """Stop all server instances.""" while self.servers: - try: - self.servers.pop()[0].__exit__(exc_type, exc_val, exc_tb) - except Exception as e: - print(f"Error stopping server: {e}") + if server := self.servers.pop(): + try: + server[0].__exit__(exc_type, exc_val, exc_tb) + except Exception as e: + print(f"Error stopping server: {e}") + traceback.print_exc() class APIOnlyServerManager: @@ -157,7 +165,8 @@ def __init__(self, self.tp_size = tp_size self.api_server_count = api_server_count self.base_server_args = base_server_args - self.servers: list[tuple[RemoteOpenAIServer, list[str]]] = [] + self.servers: list[Optional[tuple[RemoteOpenAIServer, + list[str]]]] = [None] * 2 self.server_threads: list[threading.Thread] = [] def __enter__(self) -> list[tuple[RemoteOpenAIServer, list[str]]]: @@ -209,7 +218,7 @@ def start_api_server(): server.__enter__() print(f"API-only server started successfully with " f"{self.api_server_count} API servers") - self.servers.append((server, api_server_args)) + self.servers[0] = (server, api_server_args) except Exception as e: print(f"Failed to start API-only server: {e}") raise @@ -231,7 +240,7 @@ def start_engines_server(): server.__enter__() print(f"Headless engines server started successfully with " f"{self.dp_size} engines") - self.servers.append((server, engines_server_args)) + self.servers[1] = (server, engines_server_args) except Exception as e: print(f"Failed to start headless engines server: {e}") raise @@ -253,18 +262,20 @@ def start_engines_server(): # Give servers additional time to fully initialize and coordinate time.sleep(3) - if len(self.servers) != 2: + if not all(self.servers): raise Exception("Both servers failed to start") - return self.servers + return cast(list[tuple[RemoteOpenAIServer, list[str]]], self.servers) def __exit__(self, exc_type, exc_val, exc_tb): """Stop both server instances.""" while self.servers: - try: - self.servers.pop()[0].__exit__(exc_type, exc_val, exc_tb) - except Exception as e: - print(f"Error stopping server: {e}") + if server := self.servers.pop(): + try: + server[0].__exit__(exc_type, exc_val, exc_tb) + except Exception as e: + print(f"Error stopping server: {e}") + traceback.print_exc() @pytest.fixture(scope="module") @@ -560,7 +571,7 @@ async def make_request(): assert len(results) == num_requests assert all(completion is not None for completion in results) - _, api_server_args = api_only_servers[0] + api_server, api_server_args = api_only_servers[0] api_server_count = ( api_server_args.count('--api-server-count') and api_server_args[api_server_args.index('--api-server-count') + 1] @@ -569,7 +580,6 @@ async def make_request(): f"engines on headless server (API server count: {api_server_count})") # Check request balancing via Prometheus metrics - api_server = api_only_servers[0][0] check_request_balancing(api_server, DP_SIZE) From 6caa9f2a2a40d19dafc2f6df4f2eaba6f871b465 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 14 Aug 2025 23:25:34 -0400 Subject: [PATCH 029/231] [CI Perf] Prune tests in `tests/kernels/quantization/` (#22942) Signed-off-by: mgoin Signed-off-by: Duncan Moss --- tests/kernels/quantization/test_fp8_quant.py | 8 +-- tests/kernels/quantization/test_int8_quant.py | 7 +-- tests/kernels/quantization/test_machete_mm.py | 4 -- .../kernels/quantization/test_marlin_gemm.py | 4 -- 
.../quantization/test_rocm_skinny_gemms.py | 60 +++++++++++++++---- .../quantization/test_triton_scaled_mm.py | 16 +++-- 6 files changed, 66 insertions(+), 33 deletions(-) diff --git a/tests/kernels/quantization/test_fp8_quant.py b/tests/kernels/quantization/test_fp8_quant.py index 0a3edd4ddc16..c2e70ffb8d34 100644 --- a/tests/kernels/quantization/test_fp8_quant.py +++ b/tests/kernels/quantization/test_fp8_quant.py @@ -11,11 +11,9 @@ from tests.kernels.utils import opcheck from vllm.platforms import current_platform -DTYPES = [torch.half, torch.bfloat16, torch.float] -HIDDEN_SIZES = [1, 2, 3, 4, 16, 67, 768, 2048, 5120, 5137, 8192, - 8193] # Arbitrary values for testing -HIDDEN_SIZES += list(range(1024, 1033)) # vectorized conversion edge cases -NUM_TOKENS = [1, 7, 83, 4096] # Arbitrary values for testing +DTYPES = [torch.bfloat16, torch.float] +HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193] +NUM_TOKENS = [1, 7, 4096] SCALE_UBS = [True, False] SEEDS = [0] diff --git a/tests/kernels/quantization/test_int8_quant.py b/tests/kernels/quantization/test_int8_quant.py index 5a37b976db9e..c1c9bf191d5b 100644 --- a/tests/kernels/quantization/test_int8_quant.py +++ b/tests/kernels/quantization/test_int8_quant.py @@ -9,10 +9,9 @@ from vllm._custom_ops import scaled_int8_quant from vllm.platforms import current_platform -DTYPES = [torch.half, torch.bfloat16, torch.float] -HIDDEN_SIZES = [16, 67, 768, 5137, 8193] # Arbitrary values for testing -HIDDEN_SIZES += list(range(1024, 1033)) # vectorized conversion edge cases -NUM_TOKENS = [1, 7, 83, 4096] # Arbitrary values for testing +DTYPES = [torch.bfloat16, torch.float] +HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193] +NUM_TOKENS = [1, 7, 4096] SEEDS = [0] SCALE = [0.1, 2.1] diff --git a/tests/kernels/quantization/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py index a7cb2a4e7f21..a842d2f1cbe8 100644 --- a/tests/kernels/quantization/test_machete_mm.py +++ b/tests/kernels/quantization/test_machete_mm.py @@ -34,8 +34,6 @@ MNK_SHAPES = [ (1, 128, 128), - (1, 512, 1024), - (1, 4096, 4096), (1, 8192, 28672), (13, 8192, 4096), (26, 4096, 8192), @@ -43,8 +41,6 @@ (64, 8192, 28672), (257, 128, 4096), (257, 4224, 4160), - (257, 4096, 4096), - (1024, 4096, 8192), (1024, 8192, 4096), ] diff --git a/tests/kernels/quantization/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py index 1bd6713ce7fb..cea7700ac329 100644 --- a/tests/kernels/quantization/test_marlin_gemm.py +++ b/tests/kernels/quantization/test_marlin_gemm.py @@ -53,12 +53,8 @@ MNK_FACTORS = [ (1, 1, 1), (1, 4, 8), - (1, 7, 5), - (13, 17, 67), (26, 37, 13), - (67, 13, 11), (257, 13, 11), - (658, 13, 11), ] DTYPES = [torch.float16, torch.bfloat16] diff --git a/tests/kernels/quantization/test_rocm_skinny_gemms.py b/tests/kernels/quantization/test_rocm_skinny_gemms.py index 533a4fe59677..03d5d98739c5 100644 --- a/tests/kernels/quantization/test_rocm_skinny_gemms.py +++ b/tests/kernels/quantization/test_rocm_skinny_gemms.py @@ -8,15 +8,55 @@ from vllm.platforms import current_platform DTYPES = [torch.bfloat16, torch.float16] -M = [16, 32, 64, 128, 256, 512, 1024, 4096, 8192] -K = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 6144, 8192] # k % 8 == 0 -N = [1, 2, 3, 4] +# Specific (N, K, M) combinations for targeted testing +NKM_FACTORS_LLMM1 = [ + # Small, medium, large cases + (1, 8, 16), + (1, 32, 64), + (1, 128, 256), + (1, 512, 1024), + (1, 2048, 4096), + # Edge cases with specific K sizes + (1, 6144, 1024), + (1, 8192, 2048), + # Very large case + (1, 4096, 
8192), +] + +NKM_FACTORS_WVSPLITK = [ + # Different batch sizes with key dimensions + (1, 16, 16), + (1, 64, 64), + (2, 256, 256), + (3, 1024, 1024), + (4, 4096, 4096), + # Extended K values + (1, 9216, 512), + (2, 10240, 1024), + (4, 16384, 8192), + # Minimum M constraint validation (m >= 8) + (1, 64, 8), + (2, 128, 8), + (4, 256, 8), +] + +NKM_FACTORS_WVSPLITK_FP8 = [ + # FP8-specific cases with K % 16 == 0 + (1, 16, 16), + (1, 64, 64), + (2, 512, 512), + (3, 2048, 2048), + (4, 4096, 4096), + # Extended FP8 dimensions not covered by WVSPLITK + (1, 14336, 1024), + (2, 24576, 2048), + (4, 32768, 28672), +] + SEEDS = [0] -@pytest.mark.parametrize("n", [1]) # only test for batch size 1 -@pytest.mark.parametrize("k", K) -@pytest.mark.parametrize("m", M) +@pytest.mark.parametrize("n,k,m", NKM_FACTORS_LLMM1) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("rows_per_block", [2, 4, 8, 16]) @pytest.mark.parametrize("seed", SEEDS) @@ -34,9 +74,7 @@ def test_rocm_llmm1_kernel(n, k, m, dtype, rows_per_block, seed): assert torch.allclose(out, ref_out, rtol=0.01) -@pytest.mark.parametrize("n", N) # only test for batch size <= 4 -@pytest.mark.parametrize("k", K + [9216, 10240, 16384]) -@pytest.mark.parametrize("m", [8] + M) # m >= 8 +@pytest.mark.parametrize("n,k,m", NKM_FACTORS_WVSPLITK) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.skipif(not current_platform.is_rocm(), @@ -54,9 +92,7 @@ def test_rocm_wvsplitk_kernel(n, k, m, dtype, seed): assert torch.allclose(out, ref_out, rtol=0.01) -@pytest.mark.parametrize("n", N) # only test for batch size <= 4 -@pytest.mark.parametrize("k", K[1:] + [14336, 24576, 32768]) # k % 16 == 0 -@pytest.mark.parametrize("m", M + [28672]) # m >= 16 +@pytest.mark.parametrize("n,k,m", NKM_FACTORS_WVSPLITK_FP8) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.skipif( diff --git a/tests/kernels/quantization/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py index 8a2cc3baced2..24245663fb1d 100644 --- a/tests/kernels/quantization/test_triton_scaled_mm.py +++ b/tests/kernels/quantization/test_triton_scaled_mm.py @@ -60,10 +60,18 @@ def test_rocm_compressed_tensors_w8a8(vllm_runner, example_prompts, model_path, num_logprobs) -@pytest.mark.parametrize("M", [1, 33, 64, 512]) -@pytest.mark.parametrize("N", [256, 971, 20486]) -@pytest.mark.parametrize("K", [128, 496, 1024]) -@pytest.mark.parametrize("out_dtype", [torch.float16, torch.bfloat16]) +MNK_FACTORS = [ + (1, 256, 128), + (33, 256, 496), + (64, 971, 1024), + (64, 20486, 128), + (512, 256, 496), + (512, 20486, 1024), +] + + +@pytest.mark.parametrize("M,N,K", MNK_FACTORS) +@pytest.mark.parametrize("out_dtype", [torch.bfloat16]) @pytest.mark.parametrize("in_dtype", get_8bit_types()) @pytest.mark.parametrize("use_scalar_scale_a", [True, False]) @pytest.mark.parametrize("use_scalar_scale_b", [True, False]) From 5fd03f5423c84ea25239a762a45471718115c2ca Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 14 Aug 2025 23:33:42 -0400 Subject: [PATCH 030/231] [CI Perf] Prune tests in `tests/kernels/moe/` (#22939) Signed-off-by: mgoin Signed-off-by: Duncan Moss --- tests/kernels/moe/test_batched_moe.py | 13 +++----- .../moe/test_count_expert_num_tokens.py | 5 ++- tests/kernels/moe/test_moe.py | 33 +++++++++++++------ .../kernels/moe/test_moe_align_block_size.py | 6 ++-- .../kernels/moe/test_moe_permute_unpermute.py | 8 ++--- tests/kernels/moe/test_pplx_moe.py | 12 +++++-- 6 files changed, 46 
insertions(+), 31 deletions(-) diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py index 69317405d48b..edf3e6189243 100644 --- a/tests/kernels/moe/test_batched_moe.py +++ b/tests/kernels/moe/test_batched_moe.py @@ -89,14 +89,11 @@ def make_tensors(config: BatchedMMConfig): return BatchedMMTensors(A, B, C, num_expert_tokens) -@pytest.mark.parametrize("num_experts", [8, 16, 32]) -@pytest.mark.parametrize("max_tokens_per_expert", - [32, 64, 128, 192, 224, 256, 512]) -@pytest.mark.parametrize("K", [128, 256, 1024]) -@pytest.mark.parametrize("N", [128, 256, 1024]) -@pytest.mark.parametrize( - "dtype", - [torch.float8_e4m3fn, torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("num_experts", [8, 32]) +@pytest.mark.parametrize("max_tokens_per_expert", [32, 224, 512]) +@pytest.mark.parametrize("K", [128, 1024]) +@pytest.mark.parametrize("N", [128, 1024]) +@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16]) @pytest.mark.parametrize("block_shape", [None, [128, 128]]) @pytest.mark.parametrize("per_act_token_quant", [False, True]) def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int, diff --git a/tests/kernels/moe/test_count_expert_num_tokens.py b/tests/kernels/moe/test_count_expert_num_tokens.py index 0872836b6064..1768baaf1ca7 100644 --- a/tests/kernels/moe/test_count_expert_num_tokens.py +++ b/tests/kernels/moe/test_count_expert_num_tokens.py @@ -113,8 +113,7 @@ def do_test_compute_expert_num_tokens(num_tokens: int, num_topk: int, rtol=0) -@pytest.mark.parametrize( - "num_tokens", [1, 4, 8, 11, 19, 128, 127, 405, 1024, 3333, 6666, 7317]) +@pytest.mark.parametrize("num_tokens", [1, 4, 8, 11, 127, 128, 3333, 7317]) @pytest.mark.parametrize("num_topk", [2, 6, 8]) @pytest.mark.parametrize("num_experts", [64]) @pytest.mark.parametrize("ep_size", [1, 2, 4]) @@ -126,7 +125,7 @@ def test_compute_expert_num_tokens(num_tokens: int, num_topk: int, ep_size, topk_ids_dtype) -@pytest.mark.parametrize("numel", list(range(1, 8192, 11))) +@pytest.mark.parametrize("numel", list(range(1, 8192, 111))) @pytest.mark.parametrize("num_experts", [32]) @pytest.mark.parametrize("ep_size", [2]) @pytest.mark.parametrize("topk_ids_dtype", [torch.int64]) diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index b82c74a42ab3..1951eb0c6180 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -42,6 +42,24 @@ EP_SIZE = [1, 4] TOP_KS = [2, 6] +FUSED_MOE_MNK_FACTORS = [ + (1, 128, 128), + (1, 2048, 128), + (33, 2048, 128), + (222, 1024, 1024), + (32768, 128, 128), + (32768, 2048, 511), + (40000, 1024, 1024), +] + +FUSED_MOE_WN16_MNK_FACTORS = [ + (1, 128, 128), + (1, 1024, 1024), + (32, 2048, 128), + (32, 1024, 1024), + (222, 2048, 1024), +] + vllm_config = VllmConfig() vllm_config.scheduler_config.max_num_seqs = 128 vllm_config.scheduler_config.max_model_len = 8192 @@ -116,13 +134,11 @@ def run_moe_test( return baseline_output -@pytest.mark.parametrize("m", [1, 33, 64, 222, 32768, 40000]) -@pytest.mark.parametrize("n", [128, 1024, 2048]) -@pytest.mark.parametrize("k", [128, 511, 1024]) +@pytest.mark.parametrize("m,n,k", FUSED_MOE_MNK_FACTORS) @pytest.mark.parametrize("e", NUM_EXPERTS) @pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("ep_size", EP_SIZE) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) @pytest.mark.parametrize("padding", [True, False]) @pytest.mark.parametrize("chunk_size", [8192]) 
def test_fused_moe( @@ -235,13 +251,11 @@ def m_fused_moe( use_cudagraph=use_cudagraph) -@pytest.mark.parametrize("m", [1, 32, 222]) -@pytest.mark.parametrize("n", [128, 1024, 2048]) -@pytest.mark.parametrize("k", [128, 1024]) +@pytest.mark.parametrize("m,n,k", FUSED_MOE_WN16_MNK_FACTORS) @pytest.mark.parametrize("e", NUM_EXPERTS) @pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("ep_size", EP_SIZE) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) @pytest.mark.parametrize("group_size", [64, 128]) @pytest.mark.parametrize("has_zp", [True, False]) @pytest.mark.parametrize("weight_bits", [4, 8]) @@ -352,8 +366,7 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int, torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0) -@pytest.mark.parametrize("dtype", - [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) @pytest.mark.parametrize("padding", [True, False]) @pytest.mark.parametrize( "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) diff --git a/tests/kernels/moe/test_moe_align_block_size.py b/tests/kernels/moe/test_moe_align_block_size.py index 12ef9e776c3a..5dfc8d9fab32 100644 --- a/tests/kernels/moe/test_moe_align_block_size.py +++ b/tests/kernels/moe/test_moe_align_block_size.py @@ -15,10 +15,10 @@ from vllm.platforms import current_platform from vllm.utils import round_up -NUM_TOKENS = [1, 3, 7, 16, 256, 2256, 4096] -NUM_EXPERTS = [32, 160, 256, 257, 512] +NUM_TOKENS = [1, 3, 256, 2256, 4096] +NUM_EXPERTS = [32, 160, 256, 257] TOP_KS = [1, 2, 16, 32] -BLOCK_SIZES = [32, 64, 128, 256] +BLOCK_SIZES = [32, 128] current_platform.seed_everything(0) diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py index 8d215a0cbeed..6ca01f9271bb 100644 --- a/tests/kernels/moe/test_moe_permute_unpermute.py +++ b/tests/kernels/moe/test_moe_permute_unpermute.py @@ -18,7 +18,7 @@ from vllm.platforms import current_platform NUM_EXPERTS = [16, 64, 256] -TOP_KS = [2, 4, 6, 8] +TOP_KS = [2, 6, 8] EP_SIZE = [1, 4, 16] current_platform.seed_everything(0) @@ -177,11 +177,11 @@ def torch_unpermute(permuted_hidden_states: torch.Tensor, return output -@pytest.mark.parametrize("n_token", [1, 33, 64, 222, 1024, 2048, 3000, 5000]) -@pytest.mark.parametrize("n_hidden", [2048, 4096, 7168]) +@pytest.mark.parametrize("n_token", [1, 33, 1024, 5000]) +@pytest.mark.parametrize("n_hidden", [2048, 7168]) @pytest.mark.parametrize("n_expert", NUM_EXPERTS) @pytest.mark.parametrize("topk", TOP_KS) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) @pytest.mark.parametrize("ep_size", EP_SIZE) @pytest.mark.parametrize("align_block_size", [None, 128]) def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int, diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index f7a661b4bc7b..fbef6706beaf 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -44,6 +44,14 @@ reason="Requires PPLX kernels", ) +BATCHED_MOE_MNK_FACTORS = [ + (1, 128, 128), + (33, 2048, 128), + (64, 128, 2048), + (222, 128, 128), + (222, 2048, 1024), +] + PPLX_COMBOS = [ # TODO: figure out why this fails, seems to be test problem #(1, 128, 128), @@ -152,9 +160,7 @@ def torch_batched_moe( return torch_finalize(out, topk_weight, topk_ids) -@pytest.mark.parametrize("m", [1, 
33, 64, 222]) -@pytest.mark.parametrize("n", [128, 1024, 2048]) -@pytest.mark.parametrize("k", [128, 512, 1024]) +@pytest.mark.parametrize("m,n,k", BATCHED_MOE_MNK_FACTORS) @pytest.mark.parametrize("e", NUM_EXPERTS) @pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("dtype", [torch.bfloat16]) From b725016bd5e9ec3197dcc67845dc9d3a42d46c7c Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 14 Aug 2025 23:34:53 -0400 Subject: [PATCH 031/231] [CI Perf] Prune tests in `tests/kernels/attention/` (#22936) Signed-off-by: mgoin Signed-off-by: Duncan Moss --- .../attention/test_aiter_flash_attn.py | 6 ++--- tests/kernels/attention/test_attention.py | 7 ++---- tests/kernels/attention/test_cache.py | 4 ++-- tests/kernels/attention/test_flash_attn.py | 16 +++++++------ tests/kernels/attention/test_flashinfer.py | 24 ++++++++++--------- .../test_flashinfer_trtllm_attention.py | 6 ++--- .../kernels/attention/test_prefix_prefill.py | 6 ++--- .../test_triton_unified_attention.py | 8 +++---- 8 files changed, 39 insertions(+), 38 deletions(-) diff --git a/tests/kernels/attention/test_aiter_flash_attn.py b/tests/kernels/attention/test_aiter_flash_attn.py index d0687c62b113..2d882bdf4066 100644 --- a/tests/kernels/attention/test_aiter_flash_attn.py +++ b/tests/kernels/attention/test_aiter_flash_attn.py @@ -9,10 +9,10 @@ import vllm.v1.attention.backends.rocm_aiter_fa # noqa: F401 from vllm.platforms import current_platform -NUM_HEADS = [(4, 4), (8, 2), (16, 2)] +NUM_HEADS = [(4, 4), (8, 2)] HEAD_SIZES = [128, 256] -BLOCK_SIZES = [16, 32] -DTYPES = [torch.float16, torch.bfloat16] +BLOCK_SIZES = [16] +DTYPES = [torch.bfloat16] QDTYPES = [None] # one value large enough to test overflow in index calculation. # one value small enough to test the schema op check diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index 2e0b4efebfdb..7083661575ef 100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -29,17 +29,14 @@ NUM_BLOCKS = 4321 # Arbitrary values for testing PARTITION_SIZE = 512 PARTITION_SIZE_ROCM = 256 -# flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16} -DTYPES = [ - torch.half, torch.bfloat16, torch.float -] if not current_platform.is_rocm() else [torch.half, torch.bfloat16] +DTYPES = [torch.bfloat16] NUM_GEN_SEQS = [7] # Arbitrary values for testing NUM_PREFILL_SEQS = [3] # Arbitrary values for testing NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing # This should be sync with get_supported_head_sizes() in # vllm.attention.ops.paged_attn.PagedAttention -HEAD_SIZES = [32, 64, 80, 96, 112, 120, 128, 192, 256] +HEAD_SIZES = [32, 80, 128, 256] BLOCK_SIZES = [16, 32] USE_ALIBI = [False, True] diff --git a/tests/kernels/attention/test_cache.py b/tests/kernels/attention/test_cache.py index 789507615580..8c3cc8cba9d9 100644 --- a/tests/kernels/attention/test_cache.py +++ b/tests/kernels/attention/test_cache.py @@ -11,11 +11,11 @@ from vllm.platforms import current_platform COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] -DTYPES = [torch.half, torch.bfloat16, torch.float] +DTYPES = [torch.bfloat16, torch.float] NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing NUM_HEADS = [8] # Arbitrary values for testing -HEAD_SIZES = [64, 80, 120, 256] +HEAD_SIZES = [64, 80, 256] BLOCK_SIZES = [8, 16, 32] CACHE_LAYOUTS = ["NHD", "HND"] diff --git a/tests/kernels/attention/test_flash_attn.py 
b/tests/kernels/attention/test_flash_attn.py index bd3190d09b0f..2544703f8bf9 100644 --- a/tests/kernels/attention/test_flash_attn.py +++ b/tests/kernels/attention/test_flash_attn.py @@ -12,14 +12,16 @@ flash_attn_with_kvcache, is_fa_version_supported) -NUM_HEADS = [(4, 4), (8, 2), (16, 2)] +NUM_HEADS = [(4, 4), (8, 2)] HEAD_SIZES = [128, 256] -BLOCK_SIZES = [16, 32] -DTYPES = [torch.float16, torch.bfloat16] +BLOCK_SIZES = [16] +DTYPES = [torch.bfloat16] QDTYPES = [None, torch.float8_e4m3fn] # one value large enough to test overflow in index calculation. # one value small enough to test the schema op check NUM_BLOCKS = [32768, 2048] +SOFT_CAPS = [None, 50.0] +SLIDING_WINDOWS = [None, 256] def ref_paged_attn( @@ -83,9 +85,9 @@ def ref_paged_attn( @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("block_size", BLOCK_SIZES) @pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0]) +@pytest.mark.parametrize("soft_cap", SOFT_CAPS) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) -@pytest.mark.parametrize("sliding_window", [None, 256]) +@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS) @pytest.mark.parametrize("fa_version", [2, 3]) @pytest.mark.parametrize("q_dtype", QDTYPES) @torch.inference_mode() @@ -198,9 +200,9 @@ def test_flash_attn_with_paged_kv( @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("block_size", BLOCK_SIZES) -@pytest.mark.parametrize("sliding_window", [None, 256]) +@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS) @pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0]) +@pytest.mark.parametrize("soft_cap", SOFT_CAPS) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("fa_version", [2, 3]) @pytest.mark.parametrize("q_dtype", QDTYPES) diff --git a/tests/kernels/attention/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py index 8f9b4eceaa72..be78f0e4fcc6 100644 --- a/tests/kernels/attention/test_flashinfer.py +++ b/tests/kernels/attention/test_flashinfer.py @@ -9,11 +9,13 @@ from vllm.platforms import current_platform -NUM_HEADS = [(16, 16), (32, 8), (64, 8), (6, 1)] +NUM_HEADS = [(32, 8), (6, 1)] HEAD_SIZES = [128, 256] BLOCK_SIZES = [16, 32] -DTYPES = [torch.float16, torch.bfloat16] +DTYPES = [torch.bfloat16] NUM_BLOCKS = 32768 # Large enough to test overflow in index calculation. 
+SOFT_CAPS = [None, 30.0] +SLIDING_WINDOWS = [None, 64] def ref_paged_attn( @@ -76,8 +78,8 @@ def ref_paged_attn( @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("block_size", BLOCK_SIZES) @pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0]) -@pytest.mark.parametrize("sliding_window", [None, 64]) +@pytest.mark.parametrize("soft_cap", SOFT_CAPS) +@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS) @torch.inference_mode def test_flashinfer_decode_with_paged_kv( kv_lens: list[int], @@ -173,8 +175,8 @@ def test_flashinfer_decode_with_paged_kv( @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("block_size", BLOCK_SIZES) @pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0]) -@pytest.mark.parametrize("sliding_window", [None, 64]) +@pytest.mark.parametrize("soft_cap", SOFT_CAPS) +@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS) @torch.inference_mode def test_flashinfer_prefill_with_paged_kv( seq_lens: list[tuple[int, int]], @@ -278,11 +280,11 @@ def test_flashinfer_prefill_with_paged_kv( @pytest.mark.parametrize("seq_lens", [[(1, 132), (5, 18)]]) -@pytest.mark.parametrize("num_heads", [(32, 8), (6, 1)]) +@pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("block_size", BLOCK_SIZES) @pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0]) +@pytest.mark.parametrize("soft_cap", SOFT_CAPS) def test_flashinfer_prefill_with_paged_fp8_kv( seq_lens: list[tuple[int, int]], num_heads: tuple[int, int], head_size: int, dtype: torch.dtype, block_size: int, @@ -385,11 +387,12 @@ def test_flashinfer_prefill_with_paged_fp8_kv( @pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]]) -@pytest.mark.parametrize("num_heads", [(32, 8), (64, 8), (6, 1)]) +@pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("block_size", BLOCK_SIZES) @pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0]) +@pytest.mark.parametrize("soft_cap", SOFT_CAPS) +@pytest.mark.skip(reason="TODO: fix the accuracy issue") @torch.inference_mode def test_flashinfer_decode_with_paged_fp8_kv( kv_lens: list[int], @@ -399,7 +402,6 @@ def test_flashinfer_decode_with_paged_fp8_kv( block_size: int, soft_cap: Optional[float], ) -> None: - pytest.skip("TODO: fix the accuracy issue") # test doesn't work for num_heads = (16,16) torch.set_default_device("cuda") current_platform.seed_everything(0) diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py index e87ce520bc66..53e225ea3ea6 100644 --- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py +++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py @@ -20,11 +20,11 @@ MAX_Q_LEN = 1024 MAX_KV_LEN = 4096 BATCH_SIZES = [4, 12] -NUM_HEADS = [(64, 8), (16, 16), (40, 8), (32, 8)] +NUM_HEADS = [(16, 16), (40, 8)] HEAD_SIZES = [128] -BLOCK_SIZES = [16, 32] +BLOCK_SIZES = [16] KV_LAYOUTS = ["HND"] -DTYPES = [torch.float16, torch.bfloat16] +DTYPES = [torch.bfloat16] KV_CACHE_DTYPES = [None, current_platform.fp8_dtype()] NUM_BLOCKS = 32768 # Large enough to test overflow in index calculation. 
SOFT_CAPS = [None, 50.0] diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py index b09e1bbc4279..8544eab3accc 100644 --- a/tests/kernels/attention/test_prefix_prefill.py +++ b/tests/kernels/attention/test_prefix_prefill.py @@ -19,13 +19,13 @@ from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE NUM_HEADS = [64] -NUM_QUERIES_PER_KV = [1, 8, 64] -HEAD_SIZES = [128, 96, 24] +NUM_QUERIES_PER_KV = [1, 64] +HEAD_SIZES = [24, 128] DTYPES = [torch.float16] CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] -SLIDING_WINDOW = [0, 16, 64, 128, 256, 512, 2048] +SLIDING_WINDOW = [0, 16, 2048] KV_CACHE_DTYPES = ["auto", "fp8", "fp8_e5m2"] OPS = [chunked_prefill_paged_decode, context_attention_fwd] diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py index 0cb7f5963c79..4b97d51e6ed2 100644 --- a/tests/kernels/attention/test_triton_unified_attention.py +++ b/tests/kernels/attention/test_triton_unified_attention.py @@ -9,11 +9,11 @@ from vllm.attention.ops.triton_unified_attention import unified_attention from vllm.platforms import current_platform -NUM_HEADS = [(4, 4), (8, 2), (16, 2)] +NUM_HEADS = [(4, 4), (8, 2)] HEAD_SIZES = [128, 256] -BLOCK_SIZES = [16, 32] +BLOCK_SIZES = [16] -DTYPES = [torch.float16, torch.bfloat16] +DTYPES = [torch.bfloat16] QDTYPES = [None, torch.float8_e4m3fn] if not current_platform.is_rocm() else [ None, torch.float8_e4m3fnuz ] @@ -85,7 +85,7 @@ def ref_paged_attn( @pytest.mark.parametrize("block_size", BLOCK_SIZES) @pytest.mark.parametrize("sliding_window", [None, 256]) @pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0]) +@pytest.mark.parametrize("soft_cap", [None, 50.0]) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("q_dtype", QDTYPES) @torch.inference_mode() From 04e9109796973ddb99c42b8c8b40283ebf34a8f4 Mon Sep 17 00:00:00 2001 From: amirkl94 <203507526+amirkl94@users.noreply.github.com> Date: Fri, 15 Aug 2025 09:19:31 +0300 Subject: [PATCH 032/231] refactor: Change scaling factors calculation for flashinfer FusedMoE (#22812) Signed-off-by: Amir Klein <203507526+amirkl94@users.noreply.github.com> Co-authored-by: Michael Goin Signed-off-by: Duncan Moss --- .../layers/fused_moe/fused_moe.py | 29 +++++------- .../model_executor/layers/quantization/fp8.py | 5 +- .../layers/quantization/modelopt.py | 5 +- .../quantization/utils/flashinfer_utils.py | 46 +++++++++++++++++-- 4 files changed, 60 insertions(+), 25 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 98087a35e15c..1c497fa5521b 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1189,10 +1189,10 @@ def flashinfer_fused_moe_per_tensor_scale_fp8( hidden_states: torch.Tensor, input_scale: torch.Tensor, gemm1_weights: torch.Tensor, - gemm1_weights_scale: torch.Tensor, - activation_scale: torch.Tensor, gemm2_weights: torch.Tensor, - gemm2_weights_scale: torch.Tensor, + output1_scales_scalar: torch.Tensor, + output1_scales_gate_scalar: torch.Tensor, + output2_scales_scalar: torch.Tensor, num_experts: int, top_k: int, num_expert_group: Optional[int], @@ -1206,17 +1206,12 @@ def flashinfer_fused_moe_per_tensor_scale_fp8( num_expert_group = num_expert_group if num_expert_group is not None else 0 topk_group = topk_group if topk_group is 
not None else 0 - quant_hidden_states, input_scale = moe_kernel_quantize_input( + quant_hidden_states, _ = moe_kernel_quantize_input( hidden_states, input_scale, quant_dtype=torch.float8_e4m3fn, per_act_token_quant=False) - output1_scales_scalar = gemm1_weights_scale * input_scale * ( - 1.0 / activation_scale) - output1_scales_gate_scalar = gemm1_weights_scale * input_scale - output2_scales_scalar = activation_scale * gemm2_weights_scale - from vllm.utils.flashinfer import ( flashinfer_trtllm_fp8_per_tensor_scale_moe) return flashinfer_trtllm_fp8_per_tensor_scale_moe( @@ -1244,24 +1239,24 @@ def flashinfer_fused_moe_per_tensor_scale_fp8( def flashinfer_fused_moe_per_tensor_scale_fp8_fake( routing_logits: torch.Tensor, - routing_bias: torch.Tensor, + routing_bias: Optional[torch.Tensor], hidden_states: torch.Tensor, + input_scale: torch.Tensor, gemm1_weights: torch.Tensor, + gemm2_weights: torch.Tensor, output1_scales_scalar: torch.Tensor, output1_scales_gate_scalar: torch.Tensor, - gemm2_weights: torch.Tensor, output2_scales_scalar: torch.Tensor, num_experts: int, top_k: int, - num_expert_group: int, - topk_group: int, + num_expert_group: Optional[int], + topk_group: Optional[int], intermediate_size: int, local_expert_offset: int, local_num_experts: int, - routed_scaling_factor: float = 1.0, - use_routing_scales_on_input: bool = False, - tile_tokens_dim: int = 8, - routing_method_type: int = 0) -> torch.Tensor: + use_routing_scales_on_input: bool, + routing_method_type: int, + routed_scaling_factor: float = 1.0) -> torch.Tensor: pass diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 5e107c799b9f..dbd523428695 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -24,8 +24,8 @@ QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( - apply_flashinfer_per_tensor_scale_fp8, rotate_flashinfer_fp8_moe_weights, - swap_w13_to_w31) + apply_flashinfer_per_tensor_scale_fp8, register_moe_scaling_factors, + rotate_flashinfer_fp8_moe_weights, swap_w13_to_w31) from vllm.model_executor.layers.quantization.utils.fp8_utils import ( get_col_major_tma_aligned_tensor, requant_weight_ue8m0_inplace) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( @@ -694,6 +694,7 @@ def process_weights_after_loading(self, layer: Module) -> None: w2_weight = layer.w2_weight.data w2_weight_scale_inv = layer.w2_weight_scale_inv.data if not self.block_quant: + register_moe_scaling_factors(layer) rotate_flashinfer_fp8_moe_weights(w13_weight, w2_weight) else: w13_weight = layer.w13_weight.data diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 8f9ca73bc505..22fbbab00e91 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -25,8 +25,8 @@ build_flashinfer_fp4_cutlass_moe_kernel, flashinfer_fp4_cutlass_moe_forward, reorder_w1w3_to_w3w1) from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( - apply_flashinfer_per_tensor_scale_fp8, rotate_flashinfer_fp8_moe_weights, - swap_w13_to_w31) + apply_flashinfer_per_tensor_scale_fp8, register_moe_scaling_factors, + rotate_flashinfer_fp8_moe_weights, swap_w13_to_w31) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import 
( apply_fp4_marlin_linear, is_fp4_marlin_supported, prepare_fp4_layer_for_marlin, prepare_moe_fp4_layer_for_marlin) @@ -430,6 +430,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w13_weight.data = swap_w13_to_w31(layer.w13_weight.data) rotate_flashinfer_fp8_moe_weights(layer.w13_weight, layer.w2_weight) + register_moe_scaling_factors(layer) def apply( self, diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py index 9fb194767e4a..278ee5232f47 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -82,6 +82,12 @@ def apply_flashinfer_per_tensor_scale_fp8( apply_router_weight_on_input: bool, ) -> torch.Tensor: from flashinfer.fused_moe import RoutingMethodType + assert layer.output1_scales_scalar is not None, ( + "Expected output1_scales_scalar to be initialized") + assert layer.output1_scales_scalar is not None, ( + "Expected output1_scales_gate_scalar to be initialized") + assert layer.output1_scales_scalar is not None, ( + "Expected output2_scales_scalar to be initialized") from vllm.model_executor.models.llama4 import Llama4MoE assert layer.custom_routing_function == Llama4MoE.custom_routing_function, \ @@ -92,10 +98,10 @@ def apply_flashinfer_per_tensor_scale_fp8( hidden_states=hidden_states, input_scale=layer.w13_input_scale, gemm1_weights=layer.w13_weight, - gemm1_weights_scale=layer.w13_weight_scale, gemm2_weights=layer.w2_weight, - gemm2_weights_scale=layer.w2_weight_scale, - activation_scale=layer.w2_input_scale, + output1_scales_scalar=layer.output1_scales_scalar, + output1_scales_gate_scalar=layer.output1_scales_gate_scalar, + output2_scales_scalar=layer.output2_scales_scalar, num_experts=global_num_experts, top_k=top_k, num_expert_group=num_expert_group, @@ -105,4 +111,36 @@ def apply_flashinfer_per_tensor_scale_fp8( local_num_experts=layer.local_num_experts, use_routing_scales_on_input=apply_router_weight_on_input, routing_method_type=RoutingMethodType.Llama4, - ) \ No newline at end of file + ) + + +def get_moe_scaling_factors( + input_scale: torch.Tensor, + gemm1_weights_scale: torch.Tensor, + activation_scale: torch.Tensor, + gemm2_weights_scale: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + output1_scales_scalar = gemm1_weights_scale * input_scale * ( + 1.0 / activation_scale) + output1_scales_gate_scalar = gemm1_weights_scale * input_scale + output2_scales_scalar = activation_scale * gemm2_weights_scale + + return output1_scales_scalar, output1_scales_gate_scalar, \ + output2_scales_scalar + + +def register_moe_scaling_factors(layer: torch.nn.Module) -> None: + output1_scales, output1_gate_scales, output2_scales = \ + get_moe_scaling_factors( + layer.w13_input_scale, layer.w13_weight_scale, + layer.w2_input_scale, layer.w2_weight_scale + ) + layer.register_parameter( + 'output1_scales_scalar', + torch.nn.Parameter(output1_scales, requires_grad=False)) + layer.register_parameter( + 'output1_scales_gate_scalar', + torch.nn.Parameter(output1_gate_scales, requires_grad=False)) + layer.register_parameter( + 'output2_scales_scalar', + torch.nn.Parameter(output2_scales, requires_grad=False)) From 2e7268772b5daf472028e03ed3910894fd90195b Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 15 Aug 2025 02:27:30 -0400 Subject: [PATCH 033/231] [Feature] Full Cuda Graph Support for Cutlass 
MLA and 6% E2E Throughput Improvement (#22763) Signed-off-by: yewentao256 Signed-off-by: Duncan Moss --- .../compile/piecewise/test_full_cudagraph.py | 74 +++++++++++++++++++ vllm/v1/attention/backends/mla/cutlass_mla.py | 16 +++- 2 files changed, 88 insertions(+), 2 deletions(-) diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py index efe9c843f144..cc1a95b820a4 100644 --- a/tests/compile/piecewise/test_full_cudagraph.py +++ b/tests/compile/piecewise/test_full_cudagraph.py @@ -66,6 +66,80 @@ def llm_pair(request): ) +@pytest.fixture(scope="class") +def cutlass_mla_llm_pair(request): + model = request.param + + # force V1 engine and Cutlass MLA backend + with temporary_environ({ + "VLLM_USE_V1": "1", + "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA", + "FORCE_NUM_KV_SPLITS": + "1", # TODO: remove this when hang issue is fixed + }): + full = LLM( + model=model, + gpu_memory_utilization=0.45, + trust_remote_code=True, + max_model_len=1024, + compilation_config=CompilationConfig( + full_cuda_graph=True, + cudagraph_capture_sizes=[16, 32, 64, 128, 256, 512], + ), + ) + piecewise = LLM( + model=model, + gpu_memory_utilization=0.45, + trust_remote_code=True, + max_model_len=1024, + compilation_config=CompilationConfig(), + ) + + yield weakref.proxy(full), weakref.proxy(piecewise) + del full + del piecewise + + wait_for_gpu_memory_to_clear( + devices=[0], + threshold_ratio=0.1, + ) + + +@pytest.mark.parametrize( + "cutlass_mla_llm_pair", + [ + # use an MLA model + "deepseek-ai/DeepSeek-V2-Lite", + ], + indirect=True) +@pytest.mark.skipif(current_platform.get_device_capability() != (10, 0), + reason="Only Blackwell GPUs support Cutlass MLA") +class TestFullCUDAGraphCutlassMLA: + """ + Validate full CUDA Graph with Cutlass MLA (decode-only capture). 
+ """ + + @pytest.mark.parametrize(("batch_size", "max_tokens"), [ + (8, 8), + ]) + def test_full_cudagraph_sm100_cutlass_mla( + self, batch_size, max_tokens, cutlass_mla_llm_pair: tuple[LLM, + LLM]): + piecewise_llm, full_cudagraph_llm = cutlass_mla_llm_pair + + prompts = ["Hello, my name is"] * batch_size + sampling_params = SamplingParams(temperature=0.0, + max_tokens=max_tokens, + top_p=0.95) + + piecewise_responses = piecewise_llm.generate(prompts, sampling_params) + full_responses = full_cudagraph_llm.generate(prompts, sampling_params) + + for piecewise_res, full_res in zip(piecewise_responses, + full_responses): + assert piecewise_res.outputs[0].text == full_res.outputs[0].text + + @pytest.mark.parametrize( "llm_pair", [ diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py index b23a8f0a5e87..b076613c8645 100644 --- a/vllm/v1/attention/backends/mla/cutlass_mla.py +++ b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os -from typing import Optional +from typing import ClassVar, Optional import torch @@ -12,11 +12,19 @@ from vllm.logger import init_logger from vllm.v1.attention.backends.mla.common import (MLACommonBackend, MLACommonImpl, - MLACommonMetadata) + MLACommonMetadata, + MLACommonMetadataBuilder) +from vllm.v1.attention.backends.utils import AttentionCGSupport logger = init_logger(__name__) +class CutlassMLAMetadataBuilder(MLACommonMetadataBuilder[MLACommonMetadata]): + # enable full CUDA Graph support for decode-only capture + attn_cudagraph_support: ClassVar[ + AttentionCGSupport] = AttentionCGSupport.PURE_DECODE_ONLY + + class CutlassMLABackend(MLACommonBackend): @staticmethod @@ -27,6 +35,10 @@ def get_name() -> str: def get_impl_cls() -> type["CutlassMLAImpl"]: return CutlassMLAImpl + @staticmethod + def get_builder_cls() -> type["CutlassMLAMetadataBuilder"]: + return CutlassMLAMetadataBuilder + class SM100Workspace: From 93ff0c383f3373ebd8778ef260644fa86d3e9f40 Mon Sep 17 00:00:00 2001 From: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com> Date: Fri, 15 Aug 2025 09:38:05 +0300 Subject: [PATCH 034/231] [Mamba] - refactor: Renamed mamba_attn to mamba2_attn (#22818) Signed-off-by: asafg Co-authored-by: asafg Signed-off-by: Duncan Moss --- tests/kernels/mamba/test_mamba_ssm_ssd.py | 2 +- tests/v1/attention/test_mamba_selectors.py | 2 +- vllm/model_executor/layers/mamba/mamba2_metadata.py | 2 +- vllm/model_executor/layers/mamba/mamba_mixer2.py | 2 +- vllm/v1/attention/backends/{mamba_attn.py => mamba2_attn.py} | 0 vllm/v1/attention/backends/mamba_selectors.py | 2 +- 6 files changed, 5 insertions(+), 5 deletions(-) rename vllm/v1/attention/backends/{mamba_attn.py => mamba2_attn.py} (100%) diff --git a/tests/kernels/mamba/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py index d2b893ffff7c..2c554baaff76 100644 --- a/tests/kernels/mamba/test_mamba_ssm_ssd.py +++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py @@ -9,7 +9,7 @@ from vllm.model_executor.layers.mamba.ops.ssd_combined import ( mamba_chunk_scan_combined) from vllm.platforms import current_platform -from vllm.v1.attention.backends.mamba_attn import ( +from vllm.v1.attention.backends.mamba2_attn import ( _query_start_loc_to_chunk_indices_offsets) # Added by the IBM Team, 2024 diff --git a/tests/v1/attention/test_mamba_selectors.py b/tests/v1/attention/test_mamba_selectors.py index 8eaafc5e1681..4245b50c7131 100644 --- 
a/tests/v1/attention/test_mamba_selectors.py +++ b/tests/v1/attention/test_mamba_selectors.py @@ -4,7 +4,7 @@ import pytest -from vllm.v1.attention.backends.mamba_attn import Mamba2AttentionBackend +from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionBackend from vllm.v1.attention.backends.mamba_selectors import get_mamba_attn_backend diff --git a/vllm/model_executor/layers/mamba/mamba2_metadata.py b/vllm/model_executor/layers/mamba/mamba2_metadata.py index 0a836fd17533..3256ac034aa1 100644 --- a/vllm/model_executor/layers/mamba/mamba2_metadata.py +++ b/vllm/model_executor/layers/mamba/mamba2_metadata.py @@ -11,7 +11,7 @@ PlaceholderAttentionMetadata) from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.platforms import current_platform -from vllm.v1.attention.backends.mamba_attn import ( +from vllm.v1.attention.backends.mamba2_attn import ( Mamba2AttentionMetadata, _query_start_loc_to_chunk_indices_offsets) diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index 10a5618c227e..6bf0c18ebdb4 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -36,7 +36,7 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op -from vllm.v1.attention.backends.mamba_attn import Mamba2AttentionMetadata +from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata # Added by the IBM Team, 2024 diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba2_attn.py similarity index 100% rename from vllm/v1/attention/backends/mamba_attn.py rename to vllm/v1/attention/backends/mamba2_attn.py diff --git a/vllm/v1/attention/backends/mamba_selectors.py b/vllm/v1/attention/backends/mamba_selectors.py index 852e0dfe1b31..d3a0c63c5e96 100644 --- a/vllm/v1/attention/backends/mamba_selectors.py +++ b/vllm/v1/attention/backends/mamba_selectors.py @@ -3,7 +3,7 @@ from vllm.attention.backends.abstract import AttentionBackend from vllm.v1.attention.backends.linear_attn import LinearAttentionBackend from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionBackend -from vllm.v1.attention.backends.mamba_attn import Mamba2AttentionBackend +from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionBackend def get_mamba_attn_backend(mamba_type: str) -> type[AttentionBackend]: From 0bfdd802d6319a1deefd14ec5ee7a55d5fbf2816 Mon Sep 17 00:00:00 2001 From: TJian Date: Thu, 14 Aug 2025 23:39:19 -0700 Subject: [PATCH 035/231] Revert "[ROCm][AITER] Support AITER Rope ops in RotaryEmbedding Module." 
(#22956) Signed-off-by: vllmellm Co-authored-by: vllmellm Signed-off-by: Duncan Moss --- .../layers/rotary_embedding/base.py | 71 ---------- .../layers/rotary_embedding/common.py | 4 +- .../rotary_embedding/deepseek_scaling_rope.py | 12 +- .../rotary_embedding/rocm_aiter_rope_ops.py | 127 ------------------ 4 files changed, 10 insertions(+), 204 deletions(-) delete mode 100644 vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py index 6dfc28be7da1..10fce857a8ae 100644 --- a/vllm/model_executor/layers/rotary_embedding/base.py +++ b/vllm/model_executor/layers/rotary_embedding/base.py @@ -8,7 +8,6 @@ from vllm.model_executor.custom_op import CustomOp from .common import apply_rotary_emb_dispatch, apply_rotary_emb_torch -from .rocm_aiter_rope_ops import is_rocm_rotary_embedding_enabled @CustomOp.register("rotary_embedding") @@ -36,7 +35,6 @@ def __init__( cache = cache.to(dtype) self.cos_sin_cache: torch.Tensor self.register_buffer("cos_sin_cache", cache, persistent=False) - self.is_rocm_aiter_enabled = is_rocm_rotary_embedding_enabled() def _compute_inv_freq(self, base: float) -> torch.Tensor: """Compute the inverse frequency.""" @@ -121,75 +119,6 @@ def forward_cuda( self.cos_sin_cache, self.is_neox_style) return query, key - def forward_hip( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: Optional[torch.Tensor] = None, - offsets: Optional[torch.Tensor] = None, - is_nope_first=False, - ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - # currently only rotary embedding ops from AITER package are - # supported for HiP forward. - if self.is_rocm_aiter_enabled: - return self.forward_hip_rocm_aiter(positions, query, key, offsets, - is_nope_first) - return self.forward_native(positions, query, key, offsets) - - def forward_hip_rocm_aiter( - self, - positions: torch.Tensor, - # if is_nope_first - # [[batch_size, seq_len, num_heads, nope_size+rope_size] - # if NOT is_nope_first - # [[batch_size, seq_len, num_heads, rope_size+nope_size], - query: torch.Tensor, - key: Optional[torch.Tensor] = None, - offsets: Optional[torch.Tensor] = None, - is_nope_first: bool = False, - ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - if self.cos_sin_cache.device != query.device or \ - self.cos_sin_cache.dtype != query.dtype: - self.cos_sin_cache = self.cos_sin_cache.to(query.device, - dtype=query.dtype) - cos, sin = self.cos_sin_cache.chunk(2, dim=-1) - - cos = cos.unsqueeze(-2).unsqueeze(-2) - sin = sin.unsqueeze(-2).unsqueeze(-2) - - rotate_style = 0 if self.is_neox_style else 1 - - num_tokens = positions.numel() - - query_shape = query.shape - query = query.view(1, num_tokens, -1, self.head_size) - if key is not None: - key_shape = key.shape - key = key.view(1, num_tokens, -1, self.head_size) - - positions = positions.view(*query.shape[:2]) - if offsets is not None: - offsets = offsets.view(*query.shape[:2]) - - if not is_nope_first: - query_ = query[..., :self.rotary_dim] - key_ = key[..., :self.rotary_dim] if key is not None else None - else: - query_ = query[..., -self.rotary_dim:] - key_ = key[..., -self.rotary_dim:] if key is not None else None - - if key_ is None: - torch.ops.vllm.rocm_aiter_rotary_emb_without_key_forward_hip( - positions, sin, cos, query_, offsets, rotate_style, - is_nope_first) - return query.view(query_shape), None - - torch.ops.vllm.rocm_aiter_rotary_emb_with_key_forward_hip( - positions, sin, cos, query_, key_, offsets, 
rotate_style, - is_nope_first) - - return query.view(query_shape), key.view(key_shape) - def forward_xpu( self, positions: torch.Tensor, diff --git a/vllm/model_executor/layers/rotary_embedding/common.py b/vllm/model_executor/layers/rotary_embedding/common.py index 99b6bb212033..8d821bea19e3 100644 --- a/vllm/model_executor/layers/rotary_embedding/common.py +++ b/vllm/model_executor/layers/rotary_embedding/common.py @@ -99,7 +99,7 @@ def yarn_linear_ramp_mask(low: float, high: float, dim: int, return ramp_func -def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float: +def yarn_get_mscale(scale: float = 1) -> float: if scale <= 1: return 1.0 - return 0.1 * mscale * math.log(scale) + 1.0 + return 0.1 * math.log(scale) + 1.0 diff --git a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py index 5af671703a3f..cd888b733426 100644 --- a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import math from typing import Optional import torch @@ -9,7 +10,13 @@ from .base import RotaryEmbedding from .common import (rotate_gptj, rotate_neox, yarn_find_correction_range, - yarn_get_mscale, yarn_linear_ramp_mask) + yarn_linear_ramp_mask) + + +def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float: + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 class DeepseekScalingRotaryEmbedding(RotaryEmbedding): @@ -89,9 +96,6 @@ def forward( offsets: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """PyTorch-native implementation equivalent to forward().""" - if self.is_rocm_aiter_enabled: - return self.forward_hip_rocm_aiter(positions, query, key, offsets) - assert key is not None query_rot = query[..., :self.rotary_dim] key_rot = key[..., :self.rotary_dim] diff --git a/vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py b/vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py deleted file mode 100644 index 91a2318badb4..000000000000 --- a/vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py +++ /dev/null @@ -1,127 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Optional - -import torch - -import vllm.envs as envs -from vllm.platforms import current_platform -from vllm.utils import direct_register_custom_op - - -def is_rocm_rotary_embedding_enabled() -> bool: - return (current_platform.is_rocm() and envs.VLLM_ROCM_USE_AITER) - - -def rocm_aiter_rotary_emb_without_key_forward_hip_impl( - positions: torch.Tensor, - sin: torch.Tensor, - cos: torch.Tensor, - query: torch.Tensor, - offsets: Optional[torch.Tensor] = None, - rotate_style: int = 0, - is_nope_first: bool = False, -) -> None: - import aiter as ops - if offsets is None: - ops.rope_cached_positions_fwd_inplace( - query, - cos, - sin, - positions, - rotate_style, - reuse_freqs_front_part=True, - nope_first=is_nope_first, - ) - else: - ops.rope_cached_positions_offsets_fwd_inplace( - query, - cos, - sin, - positions, - offsets, - rotate_style, - reuse_freqs_front_part=True, - nope_first=is_nope_first, - ) - - -def rocm_aiter_rotary_emb_with_key_forward_hip_impl( - positions: torch.Tensor, - sin: torch.Tensor, - cos: torch.Tensor, - query: torch.Tensor, - 
key: torch.Tensor, - offsets: Optional[torch.Tensor] = None, - rotate_style: int = 0, - is_nope_first: bool = False, -) -> None: - import aiter as ops - if offsets is None: - ops.rope_cached_positions_2c_fwd_inplace( - query, - key, - cos, - sin, - positions, - rotate_style, - reuse_freqs_front_part=True, - nope_first=is_nope_first, - ) - else: - ops.rope_cached_positions_offsets_2c_fwd_inplace( - query, - key, - cos, - sin, - positions, - offsets, - rotate_style, - reuse_freqs_front_part=True, - nope_first=is_nope_first, - ) - - -def rocm_aiter_rotary_emb_with_key_forward_hip_fake( - positions: torch.Tensor, - sin: torch.Tensor, - cos: torch.Tensor, - query: torch.Tensor, - key: torch.Tensor, - offsets: Optional[torch.Tensor] = None, - rotate_style: int = 0, - is_nope_first: bool = False, -) -> None: - pass - - -def rocm_aiter_rotary_emb_without_key_forward_hip_fake( - positions: torch.Tensor, - sin: torch.Tensor, - cos: torch.Tensor, - query: torch.Tensor, - offsets: Optional[torch.Tensor] = None, - rotate_style: int = 0, - is_nope_first: bool = False, -) -> None: - pass - - -if is_rocm_rotary_embedding_enabled(): - - direct_register_custom_op( - op_name="rocm_aiter_rotary_emb_with_key_forward_hip", - op_func=rocm_aiter_rotary_emb_with_key_forward_hip_impl, - mutates_args=["key", "query"], - fake_impl=rocm_aiter_rotary_emb_with_key_forward_hip_fake, - dispatch_key=current_platform.dispatch_key, - ) - - direct_register_custom_op( - op_name="rocm_aiter_rotary_emb_without_key_forward_hip", - op_func=rocm_aiter_rotary_emb_without_key_forward_hip_impl, - mutates_args=["query"], - fake_impl=rocm_aiter_rotary_emb_without_key_forward_hip_fake, - dispatch_key=current_platform.dispatch_key, - ) \ No newline at end of file From 581e0c0db929abacf8fc5c37894f47dbe65e759a Mon Sep 17 00:00:00 2001 From: frankie Date: Fri, 15 Aug 2025 15:01:48 +0800 Subject: [PATCH 036/231] [P/D]Provide bucket algorithm rate limiter for proxy_server (#22643) Signed-off-by: frankie-ys Signed-off-by: frankie Co-authored-by: Cyrus Leung Co-authored-by: Kuntai Du Signed-off-by: Duncan Moss --- .../disagg_prefill_proxy_server.py | 240 ++++++++++++++---- benchmarks/disagg_benchmarks/rate_limiter.py | 45 ++++ benchmarks/disagg_benchmarks/request_queue.py | 39 +++ 3 files changed, 272 insertions(+), 52 deletions(-) create mode 100644 benchmarks/disagg_benchmarks/rate_limiter.py create mode 100644 benchmarks/disagg_benchmarks/request_queue.py diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index f62d8102e2d9..904f80534914 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -1,63 +1,199 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import asyncio +import logging import os import aiohttp -from quart import Quart, make_response, request - -AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) - -app = Quart(__name__) - - -async def forward_request(url, data): - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: +from quart import Quart, Response, make_response, request +from rate_limiter import RateLimiter +from request_queue import RequestQueue + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def parse_args(): + """parse command line arguments""" + parser = 
argparse.ArgumentParser(description="vLLM P/D disaggregation proxy server") + + # Add args + parser.add_argument( + "--timeout", + type=float, + default=300, + help="Timeout for backend service requests in seconds (default: 300)", + ) + parser.add_argument( + "--max-concurrent", + type=int, + default=100, + help="Maximum concurrent requests to backend services (default: 100)", + ) + parser.add_argument( + "--queue-size", + type=int, + default=500, + help="Maximum number of requests in the queue (default: 500)", + ) + parser.add_argument( + "--rate-limit", + type=int, + default=40, + help="Maximum requests per second (default: 40)", + ) + parser.add_argument( + "--port", + type=int, + default=8000, + help="Port to run the server on (default: 8000)", + ) + parser.add_argument( + "--prefill-url", + type=str, + default="http://localhost:8100/v1/completions", + help="Prefill service endpoint URL", + ) + parser.add_argument( + "--decode-url", + type=str, + default="http://localhost:8200/v1/completions", + help="Decode service endpoint URL", + ) + + return parser.parse_args() + + +def main(): + """parse command line arguments""" + args = parse_args() + + # Initialize configuration using command line parameters + AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout) + MAX_CONCURRENT_REQUESTS = args.max_concurrent + REQUEST_QUEUE_SIZE = args.queue_size + RATE_LIMIT = args.rate_limit + PREFILL_SERVICE_URL = args.prefill_url + DECODE_SERVICE_URL = args.decode_url + PORT = args.port + + app = Quart(__name__) + + # Initialize the rate limiter and request queue + rate_limiter = RateLimiter(RATE_LIMIT) + request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE) + + # Attach the configuration object to the application instance + app.config.update( + { + "AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT, + "rate_limiter": rate_limiter, + "request_queue": request_queue, + "PREFILL_SERVICE_URL": PREFILL_SERVICE_URL, + "DECODE_SERVICE_URL": DECODE_SERVICE_URL, + } + ) + + # Start queue processing on app startup + @app.before_serving + async def startup(): + """Start request processing task when app starts serving""" + asyncio.create_task(request_queue.process()) + + async def forward_request(url, data): + """Forward request to backend service with rate limiting and error handling""" headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} - async with session.post(url=url, json=data, headers=headers) as response: - if response.status == 200: - # if response.headers.get('Transfer-Encoding') == 'chunked': - if True: - async for chunk_bytes in response.content.iter_chunked(1024): - yield chunk_bytes - else: - content = await response.read() - yield content - - -@app.route("/v1/completions", methods=["POST"]) -async def handle_request(): - try: - original_request_data = await request.get_json() - - prefill_request = original_request_data.copy() - # change max_tokens = 1 to let it only do prefill - prefill_request["max_tokens"] = 1 - - # finish prefill - async for _ in forward_request( - "http://localhost:8100/v1/completions", prefill_request - ): - continue - # return decode - generator = forward_request( - "http://localhost:8200/v1/completions", original_request_data - ) - response = await make_response(generator) - response.timeout = None - - return response - - except Exception as e: - import sys - import traceback - - exc_info = sys.exc_info() - print("Error occurred in disagg prefill proxy server") - print(e) - print("".join(traceback.format_exception(*exc_info))) + # Use rate limiter 
as context manager + async with ( + rate_limiter, + aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, + ): + try: + async with session.post( + url=url, json=data, headers=headers + ) as response: + if response.status == 200: + # Stream response chunks + async for chunk_bytes in response.content.iter_chunked(1024): + yield chunk_bytes + else: + # Handle backend service errors + error_text = await response.text() + logger.error( + "Backend service error: %s - %s", + response.status, + error_text, + ) + yield b'{"error": "Backend service error"}' + except aiohttp.ClientError as e: + # Handle connection errors + logger.error("Connection error to %s: %s", url, str(e)) + yield b'{"error": "Service unavailable"}' + except asyncio.TimeoutError: + # Handle timeout errors + logger.error("Timeout connecting to %s", url) + yield b'{"error": "Service timeout"}' + + async def process_request(): + """Process a single request through prefill and decode stages""" + try: + original_request_data = await request.get_json() + + # Create prefill request (max_tokens=1) + prefill_request = original_request_data.copy() + prefill_request["max_tokens"] = 1 + + # Execute prefill stage + async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request): + continue + + # Execute decode stage and stream response + generator = forward_request(DECODE_SERVICE_URL, original_request_data) + response = await make_response(generator) + response.timeout = None # Disable timeout for streaming response + return response + + except Exception: + logger.exception("Error processing request") + return Response( + response=b'{"error": "Internal server error"}', + status=500, + content_type="application/json", + ) + + @app.route("/v1/completions", methods=["POST"]) + async def handle_request(): + """Handle incoming API requests with concurrency and rate limiting""" + # Create task for request processing + task = asyncio.create_task(process_request()) + + # Enqueue request or reject if queue is full + if not await request_queue.enqueue(task): + return Response( + response=b'{"error": "Server busy, try again later"}', + status=503, + content_type="application/json", + ) + + try: + # Return the response from the processing task + return await task + except asyncio.CancelledError: + # Handle task cancellation (timeout or queue full) + logger.warning("Request cancelled due to timeout or queue full") + return Response( + response=b'{"error": "Request cancelled"}', + status=503, + content_type="application/json", + ) + + # Start the Quart server with host can be set to 0.0.0.0 + app.run(port=PORT) if __name__ == "__main__": - app.run(port=8000) + main() diff --git a/benchmarks/disagg_benchmarks/rate_limiter.py b/benchmarks/disagg_benchmarks/rate_limiter.py new file mode 100644 index 000000000000..87ac8cb6ab1a --- /dev/null +++ b/benchmarks/disagg_benchmarks/rate_limiter.py @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import asyncio +import time + + +class RateLimiter: + """Token bucket rate limiter implementation""" + + def __init__(self, rate_limit): + self.rate_limit = rate_limit # Requests per second + self.num_available_tokens = rate_limit # Available tokens + self.last_refill = time.monotonic() # Last token refill time + self.lock = asyncio.Lock() # Synchronization lock + + async def acquire(self): + """Acquire a token from the rate limiter""" + while True: + async with self.lock: + current_time = time.monotonic() + elapsed = current_time - 
self.last_refill + + # Refill num_available_tokens if more than 1 second has passed + if elapsed > 1.0: + self.num_available_tokens = self.rate_limit + self.last_refill = current_time + + # Check if num_available_tokens are available + if self.num_available_tokens > 0: + self.num_available_tokens -= 1 + return True + + # Calculate wait time if no num_available_tokens available + wait_time = 1.0 - elapsed + await asyncio.sleep(wait_time) + + async def __aenter__(self): + """Enter async context manager - acquire token""" + await self.acquire() + return self + + async def __aexit__(self, exc_type, exc_value, traceback): + """Exit async context manager - no cleanup needed""" + pass diff --git a/benchmarks/disagg_benchmarks/request_queue.py b/benchmarks/disagg_benchmarks/request_queue.py new file mode 100644 index 000000000000..410bcb956050 --- /dev/null +++ b/benchmarks/disagg_benchmarks/request_queue.py @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import asyncio +from collections import deque + + +class RequestQueue: + """Request queue manager with concurrency control""" + + def __init__(self, max_concurrent, max_queue_size): + # Maximum concurrent requests + self.max_concurrent = max_concurrent + self.max_queue_size = max_queue_size # Maximum queue size + # Concurrency control + self.semaphore = asyncio.Semaphore(max_concurrent) + self.queue = deque() # Request queue + self.queue_size = 0 # Current queue size + self.lock = asyncio.Lock() # Sync queue Lock + + async def enqueue(self, task): + """Add a request task to the queue""" + async with self.lock: + if self.queue_size >= self.max_queue_size: + return False + + self.queue.append(task) + self.queue_size += 1 + return True + + async def process(self): + """Process queued requests using semaphore for concurrency control""" + while True: + if self.queue: + async with self.semaphore, self.lock: + task = self.queue.popleft() + self.queue_size -= 1 + await task + await asyncio.sleep(0.01) # Yield control to event loop From 041fa232d63ccb3357e78cc43c65736475c34ef8 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Fri, 15 Aug 2025 16:16:15 +0800 Subject: [PATCH 037/231] [CI] Pooling models mteb test uses enforce_eager (#22878) Signed-off-by: wang.yuqi Signed-off-by: Duncan Moss --- tests/models/language/pooling/mteb_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index d024c76dddfd..4a1f8a53d024 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -18,7 +18,7 @@ # - Different model results in differences more than 1e-3 # 1e-4 is a good tolerance threshold MTEB_EMBED_TASKS = ["STS12"] -MTEB_EMBED_TOL = 1e-4 +MTEB_EMBED_TOL = 0.02 # See #19344 MTEB_RERANK_TASKS = ["NFCorpus"] @@ -175,6 +175,7 @@ def mteb_test_embed_models(hf_runner, with vllm_runner(model_info.name, runner="pooling", max_model_len=None, + enforce_eager=True, **vllm_extra_kwargs) as vllm_model: model_config = vllm_model.llm.llm_engine.model_config @@ -198,6 +199,7 @@ def mteb_test_embed_models(hf_runner, st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS) st_dtype = next(hf_model.model.parameters()).dtype + print("Model:", model_info.name) print("VLLM:", vllm_dtype, vllm_main_score) print("SentenceTransformers:", st_dtype, st_main_score) print("Difference:", st_main_score - vllm_main_score) @@ -286,6 +288,7 @@ def 
mteb_test_rerank_models(hf_runner, runner="pooling", max_model_len=None, max_num_seqs=8, + enforce_eager=True, **vllm_extra_kwargs) as vllm_model: model_config = vllm_model.llm.llm_engine.model_config @@ -304,6 +307,7 @@ def mteb_test_rerank_models(hf_runner, st_main_score, st_dtype = mteb_test_rerank_models_hf( hf_runner, model_info.name, hf_model_callback) + print("Model:", model_info.name) print("VLLM:", vllm_dtype, vllm_main_score) print("SentenceTransformers:", st_dtype, st_main_score) print("Difference:", st_main_score - vllm_main_score) From aa2eb6a5d2a36d575799c1c4a514969cbdf946d2 Mon Sep 17 00:00:00 2001 From: amirai21 <89905406+amirai21@users.noreply.github.com> Date: Fri, 15 Aug 2025 11:59:52 +0300 Subject: [PATCH 038/231] [V1] - Split Prefill and Decode for Mamba1 models (#22653) Signed-off-by: amirk Signed-off-by: asafg Co-authored-by: asafg Co-authored-by: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com> Signed-off-by: Duncan Moss --- .../models/language/generation/test_hybrid.py | 13 + .../layers/mamba/mamba_mixer.py | 309 +++++++++++++----- vllm/v1/attention/backends/mamba1_attn.py | 26 +- 3 files changed, 253 insertions(+), 95 deletions(-) diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 19fcbf561640..e75677347f03 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -57,6 +57,13 @@ # Avoid OOM MAX_NUM_SEQS = 4 +# Once we add support for FCG in Mamba1, this list will be removed and tests +# all test cases will use enforce_eager=False +ENFORCE_EAGER_MODELS_V1 = [ + "state-spaces/mamba-130m-hf", + "ai21labs/Jamba-tiny-dev", +] + @pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS) @pytest.mark.parametrize("max_tokens", [64]) @@ -94,13 +101,19 @@ def test_models( example_prompts, max_tokens, num_logprobs) if model in V1_SUPPORTED_MODELS: + enforce_eager = False with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") if model in HYBRID_MODELS: # required due to reorder_batch behaviour m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") + + if model in ENFORCE_EAGER_MODELS_V1: + enforce_eager = True + with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS, + enforce_eager=enforce_eager, enable_prefix_caching=False) as vllm_model: vllm_v1_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index 17b7f84a933f..3b17fb0ca8c7 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -1,13 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional +from typing import NamedTuple, Optional import torch from torch import nn from torch.nn.parameter import Parameter from vllm import envs +from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import get_current_vllm_config from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) @@ -154,13 +155,38 @@ def A_weight_loader(param: Parameter, loaded_weight: torch.Tensor): self.prefix = prefix + def _ssm_transform( + self, x: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + if self.is_lora_enabled: + # Lora kernel requires contiguous tensor. 
+ ssm_params = self.x_proj(x.contiguous())[0] + else: + ssm_params = self.x_proj(x)[0] + time_step, B, C = torch.split( + ssm_params, + [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], + dim=-1) + if self.use_rms_norm: + assert self.dt_layernorm is not None + assert self.b_layernorm is not None + assert self.c_layernorm is not None + time_step = self.dt_layernorm(time_step.contiguous()) + B = self.b_layernorm(B.contiguous()) + C = self.c_layernorm(C.contiguous()) + discrete_time_step = self.dt_proj(time_step)[0].transpose(-2, -1) + return discrete_time_step, B, C + def forward(self, hidden_states: torch.Tensor, mamba_cache_params: Optional[MambaCacheParams] = None): if not envs.VLLM_USE_V1: return CustomOp.forward(self, hidden_states, mamba_cache_params) else: - return self.forward_cuda(hidden_states, mamba_cache_params) + return self.forward_cuda( + hidden_states, + mamba_cache_params, + ) def forward_native(self, hidden_states: torch.Tensor, @@ -170,6 +196,27 @@ def forward_native(self, def forward_cuda(self, hidden_states: torch.Tensor, mamba_cache_params: Optional[MambaCacheParams] = None): + """ + Run the Mamba-1 SSM pipeline. + + Steps + ----- + 1. Apply the gated-MLP linear projection to the raw input. + 2. Pass the projected sequence through the convolutional mixing layer. + 3. Feed the result into the State-Space Model (SSM) blocks. + 4. Perform the recurrence y ← SSM(A, B, C, Δ)(x) + to produce contextual representations. + 5. Project the contextualised sequence back + to the output embedding dimension. + + Batch handling + -------------- + Prefill and decode tokens are processed by dedicated CUDA + kernels for both the convolutional (conv1d) and SSM stages. + In the case of a mixed batch (containing both prefill and + decode tokens), both sets of kernels are executed independently + and their outputs are concatenated before the final output projection. + """ forward_context: ForwardContext = get_forward_context() attn_metadata = forward_context.attn_metadata @@ -185,126 +232,142 @@ def forward_cuda(self, self_kv_cache = self.kv_cache[forward_context.virtual_engine] conv_state = self_kv_cache[0].transpose(-1, -2) ssm_state = self_kv_cache[1] - has_initial_state = mamba1_metadata.has_initial_states - context_lens_tensor = mamba1_metadata.context_lens_tensor + has_initial_states = mamba1_metadata.has_initial_states else: + assert isinstance(attn_metadata, AttentionMetadata) assert mamba_cache_params is not None conv_state = mamba_cache_params.conv_state ssm_state = mamba_cache_params.ssm_state state_indices_tensor = mamba_cache_params.state_indices_tensor query_start_loc = attn_metadata.query_start_loc context_lens_tensor = attn_metadata.context_lens_tensor - + has_initial_states = None if context_lens_tensor is not None: - has_initial_state = context_lens_tensor > 0 + has_initial_states = context_lens_tensor > 0 # 1. Gated MLP's linear projection projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1) - hidden_states, gate = projected_states.chunk(2, dim=-2) + hidden_states_BC, gate = projected_states.chunk(2, dim=-2) - # 2. 
Convolution sequence transformation conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2)) if envs.VLLM_USE_V1 and attn_metadata is None: # V1 profile run - hidden_states = hidden_states.contiguous() - return self.out_proj(hidden_states.transpose(-2, -1))[0] - - if query_start_loc is not None and context_lens_tensor is not None: - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ---------------------| - # |-- query_len ---| - hidden_states = causal_conv1d_fn( - hidden_states, + hidden_states_BC = hidden_states_BC.contiguous() + return self.out_proj(hidden_states_BC.transpose(-2, -1))[0] + + num_prefill_tokens = attn_metadata.num_prefill_tokens # token count + num_decode_tokens = attn_metadata.num_decode_tokens + num_prefills = attn_metadata.num_prefills # request count + num_decodes = attn_metadata.num_decode_tokens # token count (=request) + has_prefill = num_prefill_tokens > 0 + has_decode = num_decode_tokens > 0 + + prefill_decode_split = split_batch_to_prefill_and_decode( + hidden_states_BC, + gate, + state_indices_tensor, + query_start_loc, + has_initial_states, + num_prefill_tokens, + num_decode_tokens, + num_prefills, + num_decodes, + ) + hidden_states_BC_p = prefill_decode_split.hidden_states_BC_p + hidden_states_BC_d = prefill_decode_split.hidden_states_BC_d + gate_p = prefill_decode_split.gate_p + gate_d = prefill_decode_split.gate_d + state_indices_tensor_p = prefill_decode_split.state_indices_tensor_p + state_indices_tensor_d = prefill_decode_split.state_indices_tensor_d + query_start_loc_p = prefill_decode_split.query_start_loc_p + has_initial_states_p = prefill_decode_split.has_initial_states_p + + ssm_outputs = [] + + if has_prefill: + # 2. Convolution sequence transformation + conv_out_p = causal_conv1d_fn( + hidden_states_BC_p, conv_weights, - bias=self.conv1d.bias, + self.conv1d.bias, activation=self.activation, conv_states=conv_state, - has_initial_state=has_initial_state, - cache_indices=state_indices_tensor, - query_start_loc=query_start_loc) - else: - hidden_states = causal_conv1d_update( - hidden_states.transpose(0, 1), + has_initial_state=has_initial_states_p, + cache_indices=state_indices_tensor_p, + query_start_loc=query_start_loc_p) + # 3. State Space Model sequence transformations. + discrete_time_step_p, B_p, C_p = self._ssm_transform( + conv_out_p.transpose(-2, -1)) + time_proj_bias = self._time_proj_bias() + + # 4. Perform the recurrence y ← SSM(A, B, C, Δ)(x) + scan_out_p = selective_scan_fn( + conv_out_p, + ssm_state, + discrete_time_step_p, + self.A, + B_p.transpose(-2, -1), + C_p.transpose(-2, -1), + self.D.float(), + gate_p, + time_proj_bias, + delta_softplus=True, + cache_indices=state_indices_tensor_p, + has_initial_state=has_initial_states_p, + query_start_loc=query_start_loc_p) + ssm_outputs.append(scan_out_p) + + if has_decode: + # 2. Convolution sequence transformation + conv_out_d = causal_conv1d_update( + hidden_states_BC_d.transpose(0, 1), conv_state, conv_weights, self.conv1d.bias, self.activation, - conv_state_indices=state_indices_tensor) - hidden_states = hidden_states.transpose(0, 1) + conv_state_indices=state_indices_tensor_d).transpose(0, 1) - # 3. State Space Model sequence transformation - # 3.a. input varying initialization of time_step, B and C + # 3. State Space Model sequence transformation. 
+ discrete_time_step_d, B_d, C_d = self._ssm_transform( + conv_out_d.transpose(-2, -1)) + time_proj_bias = self._time_proj_bias() - if self.is_lora_enabled: - # lora kernel requires contiguous tensor - ssm_parameters = self.x_proj( - hidden_states.transpose(-2, -1).contiguous())[0] - else: - ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0] - - time_step, B, C = torch.split( - ssm_parameters, - [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], - dim=-1, - ) - if self.use_rms_norm: - assert self.dt_layernorm is not None - assert self.b_layernorm is not None - assert self.c_layernorm is not None - time_step = self.dt_layernorm(time_step.contiguous()) - B = self.b_layernorm(B.contiguous()) - C = self.c_layernorm(C.contiguous()) - - discrete_time_step = self.dt_proj(time_step)[0].transpose(-2, -1) - # 3.c perform the recurrence y ← SSM(A, B, C)(x) - time_proj_bias = (self.dt_proj.bias.float() if hasattr( - self.dt_proj, "bias") else None) - - if query_start_loc is not None and context_lens_tensor is not None: - scan_outputs = selective_scan_fn( - hidden_states, - ssm_state, - discrete_time_step, - self.A, - B.transpose(-2, -1), - C.transpose(-2, -1), - self.D.float(), - gate, - time_proj_bias, - delta_softplus=True, - cache_indices=state_indices_tensor, - has_initial_state=has_initial_state, - query_start_loc=query_start_loc) - else: - scan_outputs = torch.empty_like(hidden_states.transpose(0, 1)) + # 4. Perform the recurrence y ← SSM(A, B, C, Δ)(x) + scan_outputs_d = torch.empty_like( + hidden_states_BC_d.transpose(0, 1)) selective_state_update(ssm_state, - hidden_states.transpose(0, 1), - discrete_time_step.transpose(0, 1), + conv_out_d.transpose(0, 1), + discrete_time_step_d.transpose(0, 1), self.A, - B, - C, + B_d, + C_d, self.D, - gate.transpose(0, 1), + gate_d.transpose(0, 1), time_proj_bias, dt_softplus=True, - state_batch_indices=state_indices_tensor, - out=scan_outputs) - scan_outputs = scan_outputs.transpose(0, 1) - - # 4. Final linear projection - if self.is_lora_enabled: - # lora kernel requires contiguous tensor - contextualized_states = self.out_proj( - scan_outputs.transpose(-2, -1).contiguous())[0] + state_batch_indices=state_indices_tensor_d, + out=scan_outputs_d) + scan_outputs_d = scan_outputs_d.transpose(0, 1) + + if envs.VLLM_USE_V1: + ssm_outputs.insert(0, scan_outputs_d) + else: + ssm_outputs.append(scan_outputs_d) + + scan_outputs_combined = ssm_outputs[0] if len( + ssm_outputs) == 1 else torch.cat(ssm_outputs, dim=-1) + + # 5. Final output projection + if self.is_lora_enabled: # Lora kernel requires contiguous tensor. 
+ scan_outputs_combined = scan_outputs_combined.transpose( + -2, -1).contiguous() + out = self.out_proj(scan_outputs_combined)[0] else: - contextualized_states = self.out_proj( - scan_outputs.transpose(-2, -1))[0] - return contextualized_states + out = self.out_proj(scan_outputs_combined.transpose(-2, -1))[0] + + return out def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: return MambaStateShapeCalculator.mamba1_state_shape( @@ -317,3 +380,69 @@ def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: @property def mamba_type(self) -> str: return "mamba1" + + def _time_proj_bias(self) -> Optional[torch.Tensor]: + if hasattr(self.dt_proj, "bias") and self.dt_proj.bias is not None: + return self.dt_proj.bias.float() + return None + + +class PrefillDecodeSplit(NamedTuple): + hidden_states_BC_p: torch.Tensor + hidden_states_BC_d: torch.Tensor + gate_p: torch.Tensor + gate_d: torch.Tensor + state_indices_tensor_p: torch.Tensor + state_indices_tensor_d: torch.Tensor + query_start_loc_p: Optional[torch.Tensor] + has_initial_states_p: Optional[torch.Tensor] + + +def split_batch_to_prefill_and_decode( + hidden_states_BC: torch.Tensor, + gate: torch.Tensor, + state_indices_tensor: torch.Tensor, + query_start_loc: torch.Tensor, + has_initial_states: Optional[torch.Tensor], + num_prefill_tokens: int, + num_decode_tokens: int, + num_prefills: int, + num_decodes: int, +) -> PrefillDecodeSplit: + if envs.VLLM_USE_V1: + # In v1, decode tokens come first, then prefill tokens. + hidden_states_BC_d, hidden_states_BC_p = torch.split( + hidden_states_BC, [num_decode_tokens, num_prefill_tokens], dim=-1) + gate_d, gate_p = torch.split(gate, + [num_decode_tokens, num_prefill_tokens], + dim=-1) + state_indices_tensor_d, state_indices_tensor_p = torch.split( + state_indices_tensor, [num_decodes, num_prefills], dim=0) + query_start_loc_p = (query_start_loc[-num_prefills - 1:] - + num_decodes if num_prefills > 0 else None) + has_initial_states_p = has_initial_states[-num_prefills:] if ( + has_initial_states is not None and num_prefills > 0) else None + else: + # In v0, prefill tokens come first, then decode tokens. 
+ hidden_states_BC_p, hidden_states_BC_d = torch.split( + hidden_states_BC, [num_prefill_tokens, num_decode_tokens], dim=-1) + gate_p, gate_d = torch.split(gate, + [num_prefill_tokens, num_decode_tokens], + dim=-1) + state_indices_tensor_p, state_indices_tensor_d = torch.split( + state_indices_tensor, [num_prefills, num_decodes], dim=0) + query_start_loc_p = (query_start_loc[:num_prefills + + 1] if num_prefills > 0 else None) + has_initial_states_p = has_initial_states[:num_prefills] if ( + has_initial_states is not None and num_prefills > 0) else None + + return PrefillDecodeSplit( + hidden_states_BC_p=hidden_states_BC_p, + hidden_states_BC_d=hidden_states_BC_d, + gate_p=gate_p, + gate_d=gate_d, + state_indices_tensor_p=state_indices_tensor_p, + state_indices_tensor_d=state_indices_tensor_d, + query_start_loc_p=query_start_loc_p, + has_initial_states_p=has_initial_states_p, + ) diff --git a/vllm/v1/attention/backends/mamba1_attn.py b/vllm/v1/attention/backends/mamba1_attn.py index f0e4636fdb52..6cdc509083ae 100644 --- a/vllm/v1/attention/backends/mamba1_attn.py +++ b/vllm/v1/attention/backends/mamba1_attn.py @@ -2,14 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import ClassVar +from typing import ClassVar, Optional import torch from vllm.attention.backends.abstract import AttentionBackend from vllm.config import VllmConfig from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, - CommonAttentionMetadata) + CommonAttentionMetadata, + split_decodes_and_prefills) from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec @@ -25,12 +26,15 @@ class Mamba1AttentionMetadata: query_start_loc: torch.Tensor context_lens_tensor: torch.Tensor state_indices_tensor: torch.Tensor - has_initial_states: torch.Tensor + has_initial_states: Optional[torch.Tensor] + num_prefills: int + num_prefill_tokens: int + num_decodes: int + num_decode_tokens: int class Mamba1AttentionMetadataBuilder( AttentionMetadataBuilder[Mamba1AttentionMetadata]): - reorder_batch_threshold: ClassVar[int] = 1 def __init__( @@ -57,11 +61,23 @@ def build( state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] context_lens_tensor = common_attn_metadata.num_computed_tokens_cpu.to( query_start_loc.device) - has_initial_states = (context_lens_tensor > 0) + + num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( + split_decodes_and_prefills(common_attn_metadata, + decode_threshold=1)) + + has_initial_states = None + + if num_prefills > 0: + has_initial_states = context_lens_tensor > 0 return Mamba1AttentionMetadata( query_start_loc=query_start_loc, context_lens_tensor=context_lens_tensor, has_initial_states=has_initial_states, state_indices_tensor=state_indices_tensor, + num_prefills=num_prefills, + num_prefill_tokens=num_prefill_tokens, + num_decodes=num_decodes, + num_decode_tokens=num_decode_tokens, ) From 38c8f87335dbaaa5957f875f3f415ab900a80817 Mon Sep 17 00:00:00 2001 From: Sayandip Dutta Date: Fri, 15 Aug 2025 14:58:00 +0530 Subject: [PATCH 039/231] [Bugfix] Unquote file uri before reading image (#22912) Signed-off-by: Sayandip Dutta Co-authored-by: Cyrus Leung Signed-off-by: Duncan Moss --- tests/multimodal/test_utils.py | 26 ++++++++++++++++++++++++++ vllm/multimodal/utils.py | 3 ++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 41f4773a11c8..ea964a54383c 100644 --- a/tests/multimodal/test_utils.py +++ 
b/tests/multimodal/test_utils.py @@ -148,6 +148,32 @@ async def test_fetch_image_local_files(image_url: str): f"file://{temp_dir}/../{os.path.basename(image_url)}") +@pytest.mark.asyncio +async def test_fetch_image_local_files_with_space_in_name(): + image_url = TEST_IMAGE_URLS[0] + connector = MediaConnector() + + with TemporaryDirectory() as temp_dir: + local_connector = MediaConnector(allowed_local_media_path=temp_dir) + + origin_image = connector.fetch_image(image_url) + filename = "file name with space.jpg" + origin_image.save(os.path.join(temp_dir, filename), + quality=100, + icc_profile=origin_image.info.get('icc_profile')) + + try: + image_async = await local_connector.fetch_image_async( + f"file://{temp_dir}/{filename}") + image_sync = local_connector.fetch_image( + f"file://{temp_dir}/{filename}") + except FileNotFoundError as e: + pytest.fail( + "Failed to fetch image with space in name: {}".format(e)) + # Check that the images are equal + assert not ImageChops.difference(image_sync, image_async).getbbox() + + @pytest.mark.asyncio async def test_fetch_image_error_conversion(): connector = MediaConnector() diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 3b01ee7ad4a4..f914d0dc6c5e 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -9,6 +9,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union from urllib.parse import ParseResult, urlparse +from urllib.request import url2pathname import numpy as np import numpy.typing as npt @@ -108,7 +109,7 @@ def _load_file_url( raise RuntimeError("Cannot load local files without " "`--allowed-local-media-path`.") - filepath = Path(url_spec.path) + filepath = Path(url2pathname(url_spec.path)) if allowed_local_media_path not in filepath.resolve().parents: raise ValueError( f"The file path {filepath} must be a subpath " From 0ce467310def45f4c712f2149fcf56505dc71eb3 Mon Sep 17 00:00:00 2001 From: Jinzhen Lin Date: Fri, 15 Aug 2025 18:10:22 +0800 Subject: [PATCH 040/231] [Bugfix] fix cuda 12.6 and 11.8 build (#22952) Signed-off-by: Jinzhen Lin Signed-off-by: Duncan Moss --- CMakeLists.txt | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5c1a200d1899..dcec854a0872 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -351,8 +351,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") set_gencode_flags_for_srcs( SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}" CUDA_ARCHS "${MARLIN_ARCHS}") - set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC} - PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) + set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC} + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + endif() list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC}) @@ -366,8 +368,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") set_gencode_flags_for_srcs( SRCS "${MARLIN_SRCS}" CUDA_ARCHS "${MARLIN_ARCHS}") - set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu" - PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) + set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu" + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + endif() list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}") message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}") @@ -859,8 +863,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") 
set_gencode_flags_for_srcs( SRCS "${MOE_WNAA16_MARLIN_SRC}" CUDA_ARCHS "${MARLIN_MOE_ARCHS}") - set_source_files_properties(${MOE_WNAA16_MARLIN_SRC} - PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) + set_source_files_properties(${MOE_WNAA16_MARLIN_SRC} + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + endif() list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC}) From 8120bd73760ac43b79c2c1f641c08719fbdd5803 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 15 Aug 2025 04:41:38 -0700 Subject: [PATCH 041/231] [MM] Allow skipping memory profiling for multimodal models. (#22950) Signed-off-by: Roger Wang Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Signed-off-by: Duncan Moss --- vllm/config/__init__.py | 17 ++++- vllm/engine/arg_utils.py | 4 ++ vllm/v1/worker/gpu_model_runner.py | 84 ++++++++++++----------- vllm/v1/worker/tpu_model_runner.py | 106 +++++++++++++++-------------- 4 files changed, 121 insertions(+), 90 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index b4ea15ef5a0f..a2e93c344b3f 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -388,6 +388,10 @@ class ModelConfig: interleave_mm_strings: bool = False """Enable fully interleaved support for multimodal prompts, while using --chat-template-content-format=string. Defaults to False.""" + skip_mm_profiling: bool = False + """When enabled, skips multimodal memory profiling and only profiles with + language backbone model during engine initialization. + """ media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) """Additional args passed to process media inputs, keyed by modalities. For example, to set num_frames for video, set @@ -837,7 +841,8 @@ def _init_multimodal_config(self) -> Optional["MultiModalConfig"]: media_io_kwargs=self.media_io_kwargs, mm_processor_kwargs=self.mm_processor_kwargs, mm_processor_cache_gb=self.mm_processor_cache_gb, - interleave_mm_strings=self.interleave_mm_strings) + interleave_mm_strings=self.interleave_mm_strings, + skip_mm_profiling=self.skip_mm_profiling) return None @@ -2511,6 +2516,16 @@ class MultiModalConfig: Enable fully interleaved support for multimodal prompts. """ + skip_mm_profiling: bool = False + """ + When enabled, skips multimodal memory profiling and only profiles with + language backbone model during engine initialization. + + This reduces engine startup time but shifts the responsibility to users for + estimating the peak memory usage of the activation of multimodal encoder and + embedding cache. 
+ """ + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index dd1072da0844..31de2ede7a38 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -350,6 +350,7 @@ class EngineArgs: MultiModalConfig.mm_processor_kwargs disable_mm_preprocessor_cache: bool = False # DEPRECATED mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb + skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling # LoRA fields enable_lora: bool = False enable_lora_bias: bool = LoRAConfig.bias_enabled @@ -716,6 +717,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: multimodal_group.add_argument( "--interleave-mm-strings", **multimodal_kwargs["interleave_mm_strings"]) + multimodal_group.add_argument("--skip-mm-profiling", + **multimodal_kwargs["skip_mm_profiling"]) # LoRA related configs lora_kwargs = get_kwargs(LoRAConfig) @@ -918,6 +921,7 @@ def create_model_config(self) -> ModelConfig: limit_mm_per_prompt=self.limit_mm_per_prompt, interleave_mm_strings=self.interleave_mm_strings, media_io_kwargs=self.media_io_kwargs, + skip_mm_profiling=self.skip_mm_profiling, use_async_output_proc=not self.disable_async_output_proc, config_format=self.config_format, mm_processor_kwargs=self.mm_processor_kwargs, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8fb9641844fb..703092ca9fee 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2479,50 +2479,56 @@ def _dummy_pooler_run( def profile_run(self) -> None: # Profile with multimodal encoder & encoder cache. if self.supports_mm_inputs: - mm_budget = self.mm_budget - assert mm_budget is not None - - # TODO: handle encoder-decoder models once we support them. - if (encoder_budget := mm_budget.get_encoder_budget()) > 0: - # NOTE: Currently model is profiled with a single non-text - # modality with the max possible input tokens even when - # it supports multiple. - ( - dummy_modality, - max_tokens, - ) = mm_budget.get_modality_with_max_tokens() - ( - max_mm_items_per_prompt, - max_mm_items_per_batch, - ) = mm_budget.get_max_items(dummy_modality, max_tokens) - + if self.model_config.multimodal_config.skip_mm_profiling: logger.info( - "Encoder cache will be initialized with a budget of " - "%s tokens, and profiled with %s %s items of the maximum " - "feature size.", - encoder_budget, - max_mm_items_per_batch, - dummy_modality, - ) + "Skipping memory profiling for multimodal encoder and " + "encoder cache.") + else: + mm_budget = self.mm_budget + assert mm_budget is not None + + # TODO: handle encoder-decoder models once we support them. + if (encoder_budget := mm_budget.get_encoder_budget()) > 0: + # NOTE: Currently model is profiled with a single non-text + # modality with the max possible input tokens even when + # it supports multiple. + ( + dummy_modality, + max_tokens, + ) = mm_budget.get_modality_with_max_tokens() + ( + max_mm_items_per_prompt, + max_mm_items_per_batch, + ) = mm_budget.get_max_items(dummy_modality, max_tokens) + + logger.info( + "Encoder cache will be initialized with a budget of " + "%s tokens, and profiled with %s %s items of the " + "maximum feature size.", + encoder_budget, + max_mm_items_per_batch, + dummy_modality, + ) - # Create dummy batch of multimodal inputs. - batched_dummy_mm_inputs = self._get_mm_dummy_batch( - dummy_modality, - max_mm_items_per_batch, - ) + # Create dummy batch of multimodal inputs. 
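For reference, the option wired through ModelConfig and EngineArgs above can be enabled with the new --skip-mm-profiling CLI flag or programmatically. A minimal sketch, assuming the usual EngineArgs keyword passthrough of the LLM entrypoint (the model id is a placeholder, not taken from this patch):

    from vllm import LLM

    # Skip the dummy multimodal-encoder run during memory profiling.
    # The user then becomes responsible for leaving enough headroom for the
    # encoder activations and the encoder cache.
    llm = LLM(
        model="org/some-multimodal-model",  # placeholder model id
        skip_mm_profiling=True,             # same effect as --skip-mm-profiling
    )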
+ batched_dummy_mm_inputs = self._get_mm_dummy_batch( + dummy_modality, + max_mm_items_per_batch, + ) - # Run multimodal encoder. - dummy_encoder_outputs = self.model.get_multimodal_embeddings( - **batched_dummy_mm_inputs) + # Run multimodal encoder. + dummy_encoder_outputs = \ + self.model.get_multimodal_embeddings( + **batched_dummy_mm_inputs) - sanity_check_mm_encoder_outputs( - dummy_encoder_outputs, - expected_num_items=max_mm_items_per_batch, - ) + sanity_check_mm_encoder_outputs( + dummy_encoder_outputs, + expected_num_items=max_mm_items_per_batch, + ) - # Cache the dummy encoder outputs. - self.encoder_cache["tmp"] = dict( - enumerate(dummy_encoder_outputs)) + # Cache the dummy encoder outputs. + self.encoder_cache["tmp"] = dict( + enumerate(dummy_encoder_outputs)) # Add `is_profile` here to pre-allocate communication buffers hidden_states, last_hidden_states \ diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 46262284e333..f7e68edba3a1 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1529,60 +1529,66 @@ def profile_run( ) -> None: # Profile with multimodal encoder & encoder cache. if self.supports_mm_inputs: - mm_budget = self.mm_budget - assert mm_budget is not None - - # TODO: handle encoder-decoder models once we support them. - if (encoder_budget := mm_budget.get_encoder_budget()) > 0: - # NOTE: Currently model is profiled with a single non-text - # modality with the max possible input tokens even when - # it supports multiple. - ( - dummy_modality, - max_tokens, - ) = mm_budget.get_modality_with_max_tokens() - ( - max_mm_items_per_prompt, - max_mm_items_per_batch, - ) = mm_budget.get_max_items(dummy_modality, max_tokens) - + if self.model_config.multimodal_config.skip_mm_profiling: logger.info( - "Encoder cache will be initialized with a budget of " - "%s tokens, and profiled with %s %s items of the maximum " - "feature size.", - encoder_budget, - max_mm_items_per_batch, - dummy_modality, - ) - - # Create dummy batch of multimodal inputs. - batched_dummy_mm_inputs = self._get_mm_dummy_batch( - dummy_modality, - max_mm_items_per_batch, - ) + "Skipping memory profiling for multimodal encoder and " + "encoder cache.") + else: + mm_budget = self.mm_budget + assert mm_budget is not None + + # TODO: handle encoder-decoder models once we support them. + if (encoder_budget := mm_budget.get_encoder_budget()) > 0: + # NOTE: Currently model is profiled with a single non-text + # modality with the max possible input tokens even when + # it supports multiple. + ( + dummy_modality, + max_tokens, + ) = mm_budget.get_modality_with_max_tokens() + ( + max_mm_items_per_prompt, + max_mm_items_per_batch, + ) = mm_budget.get_max_items(dummy_modality, max_tokens) + + logger.info( + "Encoder cache will be initialized with a budget of " + "%s tokens, and profiled with %s %s items of the " + "maximum feature size.", + encoder_budget, + max_mm_items_per_batch, + dummy_modality, + ) - # Run multimodal encoder. - # Isolate encoder graph from post-processing to minimize - # impact of recompilation until it's fixed. - start = time.perf_counter() - xm.mark_step() - dummy_encoder_outputs = self.model.get_multimodal_embeddings( - **batched_dummy_mm_inputs) - xm.mark_step() - xm.wait_device_ops() - end = time.perf_counter() - logger.info( - "Multimodal Encoder profiling finished in in %.2f [secs].", - end - start) + # Create dummy batch of multimodal inputs. 
+ batched_dummy_mm_inputs = self._get_mm_dummy_batch( + dummy_modality, + max_mm_items_per_batch, + ) - sanity_check_mm_encoder_outputs( - dummy_encoder_outputs, - expected_num_items=max_mm_items_per_batch, - ) + # Run multimodal encoder. + # Isolate encoder graph from post-processing to minimize + # impact of recompilation until it's fixed. + start = time.perf_counter() + xm.mark_step() + dummy_encoder_outputs = \ + self.model.get_multimodal_embeddings( + **batched_dummy_mm_inputs) + xm.mark_step() + xm.wait_device_ops() + end = time.perf_counter() + logger.info( + "Multimodal Encoder profiling finished in %.2f [secs].", + end - start) + + sanity_check_mm_encoder_outputs( + dummy_encoder_outputs, + expected_num_items=max_mm_items_per_batch, + ) - # Cache the dummy encoder outputs. - self.encoder_cache["tmp"] = dict( - enumerate(dummy_encoder_outputs)) + # Cache the dummy encoder outputs. + self.encoder_cache["tmp"] = dict( + enumerate(dummy_encoder_outputs)) # Trigger compilation for general shape. self._dummy_run(num_tokens, self.num_reqs_max_model_len, From 11652f2d61995a9e185cc2f6c2b94f6733e10cbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Staszek=20Pa=C5=9Bko?= Date: Fri, 15 Aug 2025 14:32:56 +0200 Subject: [PATCH 042/231] Improve multimodal hasher performance for re-used Image prompts (#22825) Signed-off-by: Staszek Pasko Signed-off-by: Duncan Moss --- tests/multimodal/test_hasher.py | 20 ++++++++++++++++++++ vllm/multimodal/hasher.py | 6 ++++++ 2 files changed, 26 insertions(+) diff --git a/tests/multimodal/test_hasher.py b/tests/multimodal/test_hasher.py index 42cb40739dcc..75a233c2567c 100644 --- a/tests/multimodal/test_hasher.py +++ b/tests/multimodal/test_hasher.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import uuid from pathlib import Path import numpy as np @@ -72,3 +73,22 @@ def test_hash_non_contiguous_array(): hasher = MultiModalHasher # Both should be hashable and produce the same hashes assert hasher.hash_kwargs(data=arr) == hasher.hash_kwargs(data=arr_c) + + +def test_hash_image_exif_id(): + # Test that EXIF ImageId tag can be used to store UUID + # and the hasher will use that instead of the image data. 
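Restating the behaviour exercised by this test outside of pytest: a caller that reuses the same image across many prompts can stamp a UUID into the image's EXIF ImageID tag, and the hasher then keys the image by those 16 UUID bytes instead of re-hashing the full pixel data. A minimal Pillow-only sketch (the file path is illustrative):

    import uuid
    from PIL import Image

    img = Image.open("photo.jpg")  # illustrative path, any reusable image
    img.getexif()[Image.ExifTags.Base.ImageID] = uuid.uuid4()

    # With the hasher change in this patch, MultiModalHasher.serialize_item()
    # returns the UUID's 16 bytes for this image, so the (potentially large)
    # pixel buffer does not need to be converted and hashed at all.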
+ image1 = image2 = Image.new("1", size=(10, 20)) + id = uuid.uuid4() + image1.getexif()[Image.ExifTags.Base.ImageID] = id + image2 = Image.open(ASSETS_DIR / "image1.png") + image2.getexif()[Image.ExifTags.Base.ImageID] = "Not a UUID" + image2a = Image.open(ASSETS_DIR / "image1.png") + + hasher = MultiModalHasher + # first image has UUID in ImageID, so it should hash to that UUID + assert hasher.hash_kwargs(image=image1) == hasher.hash_kwargs( + image=id.bytes) + # second image has non-UUID in ImageID, so it should hash to the image data + assert hasher.hash_kwargs(image=image2) == hasher.hash_kwargs( + image=image2a) diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index ac27bb66f7b5..c9ce1f0be5f8 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pickle +import uuid from collections.abc import Iterable, Mapping from typing import Union @@ -34,6 +35,11 @@ def serialize_item(cls, obj: object) -> Union[bytes, memoryview]: return np.array(obj).tobytes() if isinstance(obj, Image.Image): + exif = obj.getexif() + if Image.ExifTags.Base.ImageID in exif and isinstance( + exif[Image.ExifTags.Base.ImageID], uuid.UUID): + # If the image has exif ImageID tag, use that + return exif[Image.ExifTags.Base.ImageID].bytes return cls.item_to_bytes( "image", np.asarray(convert_image_mode(obj, "RGBA"))) if isinstance(obj, torch.Tensor): From a857d8d3e41c7d519fb90a4e7d555884b7d0d487 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Fri, 15 Aug 2025 14:57:06 +0200 Subject: [PATCH 043/231] [V1] [Hybrid] Support using float32 for state in Hybrid Models (Mamba2, Mamba1, Minimax) (#22928) Signed-off-by: Daniel Afrimi Signed-off-by: Thomas Parnell Signed-off-by: Chen Zhang Co-authored-by: Daniel Afrimi Co-authored-by: Burkhard Ringlein Co-authored-by: Chen Zhang Signed-off-by: Duncan Moss --- .../models/language/generation/test_hybrid.py | 62 +++++++++++++++++++ tests/v1/worker/test_gpu_model_runner.py | 2 + vllm/config/__init__.py | 2 +- vllm/config/cache.py | 12 ++++ vllm/engine/arg_utils.py | 20 ++++-- .../layers/mamba/mamba_mixer.py | 17 ++++- .../layers/mamba/mamba_mixer2.py | 51 +++++++++------ .../layers/mamba/mamba_utils.py | 52 ++++++++++++++++ .../layers/mamba/ops/ssd_combined.py | 10 ++- vllm/model_executor/models/bamba.py | 29 +++++++-- vllm/model_executor/models/config.py | 2 +- vllm/model_executor/models/falcon_h1.py | 29 +++++++-- .../model_executor/models/granitemoehybrid.py | 30 +++++++-- vllm/model_executor/models/jamba.py | 28 +++++++-- vllm/model_executor/models/mamba.py | 27 ++++++-- vllm/model_executor/models/mamba2.py | 36 +++++++++-- vllm/model_executor/models/mamba_cache.py | 15 +++-- vllm/model_executor/models/minimax_text_01.py | 34 +++++++++- vllm/model_executor/models/nemotron_h.py | 32 ++++++++-- vllm/model_executor/models/zamba2.py | 38 ++++++++++-- vllm/utils/__init__.py | 1 + vllm/v1/kv_cache_interface.py | 7 ++- vllm/v1/worker/gpu_model_runner.py | 18 +++--- 23 files changed, 467 insertions(+), 87 deletions(-) diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index e75677347f03..aee0a50336c0 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -431,3 +431,65 @@ def test_full_cuda_graph( name_0="hf" if hf_outputs is not None else "vllm-v0", name_1="vllm-v1", ) + + +@pytest.mark.parametrize("model", ["Zyphra/Zamba2-1.2B-instruct"]) 
+@pytest.mark.parametrize("max_tokens", [64]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_fp32_state( + hf_runner, + vllm_runner, + example_prompts, + monkeypatch, + model: str, + max_tokens: int, + num_logprobs: int, +) -> None: + + try: + model_info = HF_EXAMPLE_MODELS.find_hf_info(model) + model_info.check_available_online(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") + except ValueError: + pass + + with hf_runner(model) as hf_model: + if model not in HF_UNSUPPORTED_MODELS: + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, num_logprobs) + else: + hf_outputs = None + + with vllm_runner(model, + max_num_seqs=MAX_NUM_SEQS, + mamba_ssm_cache_dtype="float32") as vllm_model: + vllm_v0_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + if model in HYBRID_MODELS: + # required due to reorder_batch behaviour + m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") + with vllm_runner(model, + max_num_seqs=MAX_NUM_SEQS, + mamba_ssm_cache_dtype="float32", + enable_prefix_caching=False) as vllm_model: + vllm_v1_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + + if hf_outputs is not None: + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_v0_outputs, + name_0="hf", + name_1="vllm-v0", + ) + + ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs + check_logprobs_close( + outputs_0_lst=ref_outputs, + outputs_1_lst=vllm_v1_outputs, + name_0="hf" if hf_outputs is not None else "vllm-v0", + name_1="vllm-v1", + ) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index e97cdf482710..4bcc63f293e0 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -772,6 +772,8 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch): head_dim=hf_config.mamba_d_head, rms_norm_eps=hf_config.rms_norm_eps, activation=hf_config.hidden_act, + cache_config=cache_config, + model_config=model_config, prefix=key, ) # suppress var not used error diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index a2e93c344b3f..82ef8db673fe 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -29,7 +29,7 @@ import vllm.envs as envs from vllm import version -from vllm.config.cache import (BlockSize, CacheConfig, CacheDType, +from vllm.config.cache import (BlockSize, CacheConfig, CacheDType, MambaDType, PrefixCachingHashAlgo) from vllm.config.compilation import (CompilationConfig, CompilationLevel, PassConfig) diff --git a/vllm/config/cache.py b/vllm/config/cache.py index 69cb0d9732fa..ae11dec3ca5e 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -23,6 +23,7 @@ BlockSize = Literal[1, 8, 16, 32, 64, 128] CacheDType = Literal["auto", "fp8", "fp8_e4m3", "fp8_e5m2", "fp8_inc"] +MambaDType = Literal["auto", "float32"] PrefixCachingHashAlgo = Literal["builtin", "sha256", "sha256_cbor_64bit"] @@ -93,6 +94,15 @@ class CacheConfig: """ Optional override for mamba page size; used by hybrid mamba/attention models to ensure exact alignment with attention page size.""" + mamba_cache_dtype: MambaDType = "auto" + """The data type to use for the Mamba cache (both the conv as well as the + ssm state). 
If set to 'auto', the data type will be inferred from the model + config.""" + mamba_ssm_cache_dtype: MambaDType = "auto" + """The data type to use for the Mamba cache (ssm state only, conv state will + still be controlled by mamba_cache_dtype). If set to 'auto', the data type + for the ssm state will be determined by mamba_cache_dtype.""" + # Will be set after profiling. num_gpu_blocks: Optional[int] = field(default=None, init=False) """The number of blocks to allocate for GPU memory.""" @@ -123,6 +133,8 @@ def compute_hash(self) -> str: """ factors: list[Any] = [] factors.append(self.cache_dtype) + factors.append(self.mamba_cache_dtype) + factors.append(self.mamba_ssm_cache_dtype) # `cpu_offload_gb` does not use `torch.compile` yet. hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 31de2ede7a38..f8af6d36e0c0 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -27,12 +27,12 @@ DeviceConfig, DistributedExecutorBackend, GuidedDecodingBackend, HfOverrides, KVEventsConfig, KVTransferConfig, LoadConfig, LogprobsMode, - LoRAConfig, ModelConfig, ModelDType, ModelImpl, - MultiModalConfig, ObservabilityConfig, ParallelConfig, - PoolerConfig, PrefixCachingHashAlgo, RunnerOption, - SchedulerConfig, SchedulerPolicy, SpeculativeConfig, - TaskOption, TokenizerMode, VllmConfig, get_attr_docs, - get_field) + LoRAConfig, MambaDType, ModelConfig, ModelDType, + ModelImpl, MultiModalConfig, ObservabilityConfig, + ParallelConfig, PoolerConfig, PrefixCachingHashAlgo, + RunnerOption, SchedulerConfig, SchedulerPolicy, + SpeculativeConfig, TaskOption, TokenizerMode, + VllmConfig, get_attr_docs, get_field) from vllm.logger import init_logger from vllm.platforms import CpuArchEnum, current_platform from vllm.plugins import load_general_plugins @@ -422,6 +422,8 @@ class EngineArgs: override_attention_dtype: str = ModelConfig.override_attention_dtype calculate_kv_scales: bool = CacheConfig.calculate_kv_scales + mamba_cache_dtype: MambaDType = CacheConfig.mamba_cache_dtype + mamba_ssm_cache_dtype: MambaDType = CacheConfig.mamba_ssm_cache_dtype additional_config: dict[str, Any] = \ get_field(VllmConfig, "additional_config") @@ -694,6 +696,10 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: **cache_kwargs["calculate_kv_scales"]) cache_group.add_argument("--kv-sharing-fast-prefill", **cache_kwargs["kv_sharing_fast_prefill"]) + cache_group.add_argument("--mamba-cache-dtype", + **cache_kwargs["mamba_cache_dtype"]) + cache_group.add_argument("--mamba-ssm-cache-dtype", + **cache_kwargs["mamba_ssm_cache_dtype"]) # Multimodal related configs multimodal_kwargs = get_kwargs(MultiModalConfig) @@ -1105,6 +1111,8 @@ def create_engine_config( cpu_offload_gb=self.cpu_offload_gb, calculate_kv_scales=self.calculate_kv_scales, kv_sharing_fast_prefill=self.kv_sharing_fast_prefill, + mamba_cache_dtype=self.mamba_cache_dtype, + mamba_ssm_cache_dtype=self.mamba_ssm_cache_dtype, ) ray_runtime_env = None diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index 3b17fb0ca8c7..3c7322260df4 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -9,7 +9,7 @@ from vllm import envs from vllm.attention.backends.abstract import AttentionMetadata -from vllm.config import get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config from 
vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.forward_context import ForwardContext, get_forward_context @@ -20,7 +20,7 @@ RowParallelLinear) from vllm.model_executor.layers.mamba.abstract import MambaBase from vllm.model_executor.layers.mamba.mamba_utils import ( - MambaStateShapeCalculator) + MambaStateDtypeCalculator, MambaStateShapeCalculator) from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( causal_conv1d_fn, causal_conv1d_update) from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( @@ -56,6 +56,8 @@ def __init__(self, rms_norm_eps: float = 1e-5, activation="silu", is_lora_enabled: bool = False, + model_config: Optional[ModelConfig] = None, + cache_config: Optional[CacheConfig] = None, prefix: str = ""): super().__init__() self.time_step_rank = time_step_rank @@ -153,6 +155,8 @@ def A_weight_loader(param: Parameter, loaded_weight: torch.Tensor): # The inner tuple is (conv_state, ssm_state) self.kv_cache = [(torch.tensor([]), torch.tensor([]))] + self.model_config = model_config + self.cache_config = cache_config self.prefix = prefix def _ssm_transform( @@ -369,6 +373,15 @@ def forward_cuda(self, return out + def get_state_dtype(self) -> tuple[torch.dtype]: + assert self.model_config is not None + assert self.cache_config is not None + return MambaStateDtypeCalculator.mamba1_state_dtype( + self.model_config.dtype, + self.cache_config.mamba_cache_dtype, + self.cache_config.mamba_ssm_cache_dtype, + ) + def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: return MambaStateShapeCalculator.mamba1_state_shape( tp_world_size=get_tensor_model_parallel_world_size(), diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index 6bf0c18ebdb4..743e520ec8ee 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -8,7 +8,7 @@ from vllm import envs from vllm.attention.backends.abstract import AttentionMetadata -from vllm.config import get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_gather, @@ -21,7 +21,7 @@ from vllm.model_executor.layers.mamba.mamba2_metadata import (Mamba2Metadata, update_metadata) from vllm.model_executor.layers.mamba.mamba_utils import ( - MambaStateShapeCalculator) + MambaStateDtypeCalculator, MambaStateShapeCalculator) from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( causal_conv1d_fn, causal_conv1d_update) from vllm.model_executor.layers.mamba.ops.layernorm_gated import rms_norm_gated @@ -218,23 +218,23 @@ class MambaMixer2(MambaBase, CustomOp): **selective** state spaces) """ - def __init__( - self, - hidden_size: int, - ssm_state_size: int, - conv_kernel_size: int, - intermediate_size: int, - use_conv_bias: bool, - use_bias: bool, - n_groups: int = 1, - num_heads: int = 128, - head_dim: int = 64, - rms_norm_eps: float = 1e-5, - activation: str = "silu", - use_rms_norm: bool = True, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, + hidden_size: int, + ssm_state_size: int, + conv_kernel_size: int, + intermediate_size: int, + use_conv_bias: bool, + use_bias: bool, + n_groups: int = 1, + num_heads: int = 128, + head_dim: int = 64, + rms_norm_eps: float = 1e-5, + activation: str = "silu", + 
use_rms_norm: bool = True, + model_config: Optional[ModelConfig] = None, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): super().__init__() # For TP, the sharding plan is as follows: @@ -417,6 +417,8 @@ def __init__( # The inner tuple is (conv_state, ssm_state) self.kv_cache = [(torch.tensor([]), torch.tensor([]))] + self.model_config = model_config + self.cache_config = cache_config self.prefix = prefix def forward_native( @@ -670,7 +672,7 @@ def forward_cuda( dt_limit=(0.0, float("inf")), out=preallocated_ssm_out_p.view(1, num_prefill_tokens, -1, self.head_dim), - ) + state_dtype=ssm_state.dtype) # update ssm states # - varlen state is a (num_prefills, nheads, headdim, dstate) tensor @@ -732,6 +734,15 @@ def forward_cuda( # 5. Final linear projection output[:num_actual_tokens], _ = self.out_proj(hidden_states) + def get_state_dtype(self) -> tuple[torch.dtype, torch.dtype]: + assert self.model_config is not None + assert self.cache_config is not None + return MambaStateDtypeCalculator.mamba2_state_dtype( + self.model_config.dtype, + self.cache_config.mamba_cache_dtype, + self.cache_config.mamba_ssm_cache_dtype, + ) + def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: return MambaStateShapeCalculator.mamba2_state_shape( intermediate_size=self.intermediate_size, diff --git a/vllm/model_executor/layers/mamba/mamba_utils.py b/vllm/model_executor/layers/mamba/mamba_utils.py index ad1401791238..66674d1a6f25 100644 --- a/vllm/model_executor/layers/mamba/mamba_utils.py +++ b/vllm/model_executor/layers/mamba/mamba_utils.py @@ -1,6 +1,58 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Union + +import torch + +from vllm.config import MambaDType, ModelDType from vllm.distributed import divide +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, get_kv_cache_torch_dtype + + +class MambaStateDtypeCalculator: + + @classmethod + def linear_attention_state_dtype( + cls, + model_dtype: Union[ModelDType, torch.dtype], + mamba_cache_dtype: MambaDType, + ) -> tuple[torch.dtype, ...]: + # TODO (tdoublep) requires testing + if mamba_cache_dtype == "float32": + raise ValueError("fp32 state for minimax is not yet supported") + state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype) + return (state_dtype, ) + + @classmethod + def mamba1_state_dtype( + cls, + model_dtype: Union[ModelDType, torch.dtype], + mamba_cache_dtype: MambaDType, + mamba_ssm_cache_dtype: MambaDType, + ) -> tuple[torch.dtype, ...]: + # TODO (tdoublep) requires kernel changes + if mamba_cache_dtype == "float32" or mamba_ssm_cache_dtype == "float32": + raise ValueError("fp32 state for mamba1 is not yet supported") + else: + return MambaStateDtypeCalculator.mamba2_state_dtype( + model_dtype, mamba_cache_dtype, mamba_ssm_cache_dtype) + + @classmethod + def mamba2_state_dtype( + cls, + model_dtype: Union[ModelDType, torch.dtype], + mamba_cache_dtype: MambaDType, + mamba_ssm_cache_dtype: MambaDType, + ) -> tuple[torch.dtype, ...]: + conv_state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, + model_dtype) + if mamba_ssm_cache_dtype == "auto": + temporal_state_dtype = conv_state_dtype + else: + temporal_state_dtype = ( + STR_DTYPE_TO_TORCH_DTYPE[mamba_ssm_cache_dtype]) + + return (conv_state_dtype, temporal_state_dtype) class MambaStateShapeCalculator: diff --git a/vllm/model_executor/layers/mamba/ops/ssd_combined.py 
b/vllm/model_executor/layers/mamba/ops/ssd_combined.py index fd74cb837290..d0b3e9e5235b 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_combined.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_combined.py @@ -41,6 +41,7 @@ def _mamba_chunk_scan_combined_fwd(x, cu_seqlens=None, dt_softplus=False, dt_limit=(0.0, float("inf")), + state_dtype=None, out=None): assert is_int_pow_2(chunk_size), "chunk_size must be integer power of 2" batch, seqlen, nheads, headdim = x.shape @@ -118,7 +119,7 @@ def _mamba_chunk_scan_combined_fwd(x, if initial_states is not None else None, seq_idx=seq_idx, chunk_size=chunk_size, - out_dtype=C.dtype, + out_dtype=state_dtype if state_dtype is not None else C.dtype, is_cont_batched=cu_seqlens is not None) states, final_states = (rearrange(t, "... (p n) -> ... p n", n=dstate) for t in [states, final_states]) @@ -189,7 +190,8 @@ def mamba_chunk_scan_combined(x, dt_limit=(0.0, float("inf")), out=None, return_final_states=False, - return_varlen_states=False): + return_varlen_states=False, + state_dtype=None): """ Argument: x: (batch, seqlen, nheads, headdim) @@ -206,6 +208,7 @@ def mamba_chunk_scan_combined(x, cu_seqlens: (num_sequences + 1) or None, only used if return_varlen_states is True dt_softplus: Whether to apply softplus to dt out: Preallocated output tensor + state_dtype: The data type of the ssm state """ if not return_varlen_states: @@ -229,7 +232,8 @@ def mamba_chunk_scan_combined(x, cu_seqlens=cu_seqlens, dt_softplus=dt_softplus, dt_limit=dt_limit, - out=out) + out=out, + state_dtype=state_dtype) if not return_varlen_states: if not return_final_states: return diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index 4a2ae07581f3..e2cd31af5390 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -12,7 +12,7 @@ from vllm import envs from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group from vllm.forward_context import get_forward_context @@ -26,7 +26,7 @@ Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 from vllm.model_executor.layers.mamba.mamba_utils import ( - MambaStateShapeCalculator) + MambaStateDtypeCalculator, MambaStateShapeCalculator) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -83,6 +83,7 @@ class BambaMixerDecoderLayer(nn.Module): def __init__(self, config: BambaConfig, layer_idx: int, + model_config: Optional[ModelConfig] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "") -> None: @@ -100,6 +101,8 @@ def __init__(self, head_dim=config.mamba_d_head, rms_norm_eps=config.rms_norm_eps, activation=config.hidden_act, + model_config=model_config, + cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.mixer") @@ -138,6 +141,7 @@ def __init__( self, config: BambaConfig, layer_idx: int, + model_config: Optional[ModelConfig] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -266,6 +270,7 @@ def __init__(self, *, 
vllm_config: VllmConfig, prefix: str = ""): super().__init__() config: BambaConfig = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config @@ -289,6 +294,7 @@ def get_layer(prefix: str): return layer_class( config, layer_idx, + model_config, cache_config, quant_config=quant_config, prefix=prefix, @@ -437,6 +443,18 @@ class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, } embedding_padding_modules = ["lm_head"] + @classmethod + def get_mamba_state_dtype_from_config( + cls, + vllm_config: "VllmConfig", + ) -> tuple[torch.dtype, torch.dtype]: + + return MambaStateDtypeCalculator.mamba2_state_dtype( + vllm_config.model_config.dtype, + vllm_config.cache_config.mamba_cache_dtype, + vllm_config.cache_config.mamba_ssm_cache_dtype, + ) + @classmethod def get_mamba_state_shape_from_config( cls, @@ -528,10 +546,13 @@ def forward(self, mamba_state_shape = \ self.get_mamba_state_shape_from_config( self.vllm_config, use_v1=False) + mamba_state_dtype = \ + self.get_mamba_state_dtype_from_config( + self.vllm_config) self.mamba_cache = MambaCacheManager(self.vllm_config, - self.lm_head.weight.dtype, num_mamba_layers, - *mamba_state_shape) + *mamba_state_shape, + *mamba_state_dtype) mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 6f21cd267b0e..882df7e8162c 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -318,7 +318,7 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None: # get mamba page size mamba_page_size = MambaSpec( shapes=model_cls.get_mamba_state_shape_from_config(vllm_config), - dtype=kv_cache_dtype, + dtypes=model_cls.get_mamba_state_dtype_from_config(vllm_config), block_size=model_config.max_model_len, ).page_size_bytes diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index 85d64af5bd28..5e2b6d69124c 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -11,7 +11,7 @@ from vllm import envs from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group from vllm.forward_context import get_forward_context @@ -25,7 +25,7 @@ Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 from vllm.model_executor.layers.mamba.mamba_utils import ( - MambaStateShapeCalculator) + MambaStateDtypeCalculator, MambaStateShapeCalculator) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -85,6 +85,7 @@ class FalconH1SSMDecoderLayer(nn.Module): def __init__( self, config: FalconH1Config, + model_config: Optional[ModelConfig] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -108,6 +109,8 @@ def __init__( head_dim=config.mamba_d_head, rms_norm_eps=config.rms_norm_eps, activation=config.hidden_act, + model_config=model_config, + 
cache_config=cache_config, quant_config=quant_config, use_rms_norm=config.mamba_rms_norm, prefix=f"{prefix}.mixer", @@ -317,6 +320,7 @@ def __init__( self, config: FalconH1Config, layer_idx: int, + model_config: Optional[ModelConfig] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -339,6 +343,7 @@ def __init__( # Instantiate the SSM branch self.mamba = FalconH1SSMDecoderLayer( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=ssm_prefix, @@ -408,6 +413,7 @@ class FalconH1Model(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config: FalconH1Config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config @@ -435,6 +441,7 @@ def get_layer(prefix: str): return layer_class( config, layer_idx, + model_config, cache_config, quant_config=quant_config, prefix=prefix, @@ -519,6 +526,18 @@ class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, } embedding_padding_modules = ["lm_head"] + @classmethod + def get_mamba_state_dtype_from_config( + cls, + vllm_config: "VllmConfig", + ) -> tuple[torch.dtype, torch.dtype]: + + return MambaStateDtypeCalculator.mamba2_state_dtype( + vllm_config.model_config.dtype, + vllm_config.cache_config.mamba_cache_dtype, + vllm_config.cache_config.mamba_ssm_cache_dtype, + ) + @classmethod def get_mamba_state_shape_from_config( cls, @@ -624,12 +643,14 @@ def forward( mamba_state_shape = \ self.get_mamba_state_shape_from_config( self.vllm_config, use_v1=False) + mamba_state_dtype = \ + self.get_mamba_state_dtype_from_config( + self.vllm_config) self.mamba_cache = MambaCacheManager( self.vllm_config, - self.lm_head.weight.dtype if hasattr( - self.lm_head, 'weight') else torch.bfloat16, self.config.num_hidden_layers, *mamba_state_shape, + *mamba_state_dtype, ) mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index e59502f12a1c..5704496b9a5d 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -12,7 +12,7 @@ from vllm import envs from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group from vllm.forward_context import get_forward_context @@ -24,7 +24,7 @@ Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 from vllm.model_executor.layers.mamba.mamba_utils import ( - MambaStateShapeCalculator) + MambaStateDtypeCalculator, MambaStateShapeCalculator) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -50,6 +50,7 @@ class GraniteMoeHybridMambaDecoderLayer(nn.Module): def __init__(self, config: GraniteMoeHybridConfig, layer_idx: int, + model_config: Optional[ModelConfig] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, 
prefix: str = "") -> None: @@ -70,6 +71,8 @@ def __init__(self, head_dim=config.mamba_d_head, rms_norm_eps=config.rms_norm_eps, activation=config.hidden_act, + model_config=model_config, + cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.mixer") @@ -137,6 +140,7 @@ def __init__( self, config: GraniteMoeHybridConfig, layer_idx: int, + model_config: Optional[ModelConfig] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -217,6 +221,7 @@ class GraniteMoeHybridAttention(nn.Module): def __init__( self, config: GraniteMoeHybridConfig, + model_config: Optional[ModelConfig] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -316,6 +321,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config @@ -340,6 +346,7 @@ def get_layer(prefix: str): return layer_class( config, layer_idx, + model_config, cache_config, quant_config=quant_config, prefix=prefix, @@ -527,6 +534,18 @@ class GraniteMoeHybridForCausalLM(nn.Module, HasInnerState, SupportsLoRA, } embedding_padding_modules = ["lm_head"] + @classmethod + def get_mamba_state_dtype_from_config( + cls, + vllm_config: "VllmConfig", + ) -> tuple[torch.dtype, torch.dtype]: + + return MambaStateDtypeCalculator.mamba2_state_dtype( + vllm_config.model_config.dtype, + vllm_config.cache_config.mamba_cache_dtype, + vllm_config.cache_config.mamba_ssm_cache_dtype, + ) + @classmethod def get_mamba_state_shape_from_config( cls, @@ -625,10 +644,13 @@ def forward(self, mamba_state_shape = \ self.get_mamba_state_shape_from_config( self.vllm_config, use_v1=False) + mamba_state_dtype = \ + self.get_mamba_state_dtype_from_config( + self.vllm_config) self.mamba_cache = MambaCacheManager(self.vllm_config, - self.model_config.dtype, num_mamba_layers, - *mamba_state_shape) + *mamba_state_shape, + *mamba_state_dtype) mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index fbd310121ad4..0b32d6f25659 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -10,7 +10,7 @@ from vllm import envs from vllm.attention.layer import Attention -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group from vllm.model_executor.layers.fused_moe import FusedMoE @@ -21,7 +21,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer from vllm.model_executor.layers.mamba.mamba_utils import ( - MambaStateShapeCalculator) + MambaStateDtypeCalculator, MambaStateShapeCalculator) from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -94,6 +94,7 @@ class JambaMambaDecoderLayer(nn.Module): def __init__(self, config: JambaConfig, layer_idx: int, + model_config: Optional[ModelConfig] = None, cache_config: Optional[CacheConfig] = None, quant_config: 
Optional[QuantizationConfig] = None, is_lora_enabled: Optional[bool] = False, @@ -114,6 +115,8 @@ def __init__(self, rms_norm_eps=config.rms_norm_eps, activation=config.hidden_act, is_lora_enabled = self.is_lora_enabled, + model_config=model_config, + cache_config=cache_config, prefix=f"{prefix}.mixer", ) @@ -164,6 +167,7 @@ class JambaAttentionDecoderLayer(nn.Module): def __init__(self, config: JambaConfig, layer_idx: int, + model_config: Optional[ModelConfig] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -280,6 +284,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config @@ -304,6 +309,7 @@ def get_layer(prefix: str): config.layers_block_type[layer_idx]] return layer_class(config, layer_idx, + model_config, cache_config, quant_config=quant_config, prefix=prefix, @@ -520,9 +526,11 @@ def forward(self, self.vllm_config.parallel_config, LayerBlockType.mamba) state_shape = self.get_mamba_state_shape_from_config( self.vllm_config) + state_dtype = self.get_mamba_state_dtype_from_config( + self.vllm_config) self.mamba_cache = MambaCacheManager(self.vllm_config, - self.lm_head.weight.dtype, - num_layers, *state_shape) + num_layers, *state_shape, + *state_dtype) mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) @@ -537,6 +545,18 @@ def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) + @classmethod + def get_mamba_state_dtype_from_config( + cls, + vllm_config: "VllmConfig", + ) -> tuple[torch.dtype, torch.dtype]: + + return MambaStateDtypeCalculator.mamba1_state_dtype( + vllm_config.model_config.dtype, + vllm_config.cache_config.mamba_cache_dtype, + vllm_config.cache_config.mamba_ssm_cache_dtype, + ) + @classmethod def get_mamba_state_shape_from_config( cls, diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 80b63e15377a..f4aaf0c6f467 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -9,13 +9,13 @@ from transformers import MambaConfig from vllm import envs -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed.parallel_state import get_pp_group from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer from vllm.model_executor.layers.mamba.mamba_utils import ( - MambaStateShapeCalculator) + MambaStateDtypeCalculator, MambaStateShapeCalculator) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -40,6 +40,7 @@ class MambaDecoderLayer(nn.Module): def __init__(self, config: MambaConfig, + model_config: Optional[ModelConfig] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, is_lora_enabled: Optional[bool] = False, @@ -61,6 +62,8 @@ def __init__(self, rms_norm_eps=mixer_rms_eps, activation=config.hidden_act, is_lora_enabled=self.is_lora_enabled, + model_config=model_config, + 
cache_config=cache_config, prefix=f"{prefix}.mixer") self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) @@ -88,6 +91,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config @@ -108,6 +112,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: MambaDecoderLayer(config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, is_lora_enabled=is_lora_enabled, @@ -243,9 +248,11 @@ def forward(self, self.vllm_config.parallel_config, LayerBlockType.mamba) state_shape = self.get_mamba_state_shape_from_config( self.vllm_config) + state_dtype = self.get_mamba_state_dtype_from_config( + self.vllm_config) self.mamba_cache = MambaCacheManager(self.vllm_config, - self.lm_head.weight.dtype, - num_layers, *state_shape) + num_layers, *state_shape, + *state_dtype) mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) @@ -254,6 +261,18 @@ def forward(self, return hidden_states + @classmethod + def get_mamba_state_dtype_from_config( + cls, + vllm_config: "VllmConfig", + ) -> tuple[torch.dtype, torch.dtype]: + + return MambaStateDtypeCalculator.mamba1_state_dtype( + vllm_config.model_config.dtype, + vllm_config.cache_config.mamba_cache_dtype, + vllm_config.cache_config.mamba_ssm_cache_dtype, + ) + @classmethod def get_mamba_state_shape_from_config( cls, diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index 75e92b01762d..3432cf29feac 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -11,7 +11,7 @@ from vllm import envs from vllm.attention.backends.abstract import AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed.parallel_state import get_pp_group from vllm.forward_context import get_forward_context from vllm.model_executor.layers.layernorm import RMSNorm @@ -20,7 +20,7 @@ Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 from vllm.model_executor.layers.mamba.mamba_utils import ( - MambaStateShapeCalculator) + MambaStateDtypeCalculator, MambaStateShapeCalculator) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -45,6 +45,8 @@ class Mamba2DecoderLayer(nn.Module): def __init__(self, config: MambaConfig, + model_config: Optional[ModelConfig] = None, + cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "") -> None: super().__init__() @@ -62,6 +64,8 @@ def __init__(self, head_dim=config.head_dim, rms_norm_eps=config.layer_norm_epsilon, activation=config.hidden_act, + model_config=model_config, + cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.mixer") @@ -93,6 +97,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = 
vllm_config.lora_config is_lora_enabled = bool(lora_config) @@ -112,8 +118,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: Mamba2DecoderLayer( - config, quant_config=quant_config, prefix=prefix), + lambda prefix: Mamba2DecoderLayer(config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix), prefix=f"{prefix}.layers") self.norm_f = RMSNorm(config.hidden_size, @@ -200,6 +209,18 @@ def load_weights(self, weights: Iterable[tuple[str, class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree): + @classmethod + def get_mamba_state_dtype_from_config( + cls, + vllm_config: "VllmConfig", + ) -> tuple[torch.dtype, torch.dtype]: + + return MambaStateDtypeCalculator.mamba2_state_dtype( + vllm_config.model_config.dtype, + vllm_config.cache_config.mamba_cache_dtype, + vllm_config.cache_config.mamba_ssm_cache_dtype, + ) + @classmethod def get_mamba_state_shape_from_config( cls, @@ -290,10 +311,13 @@ def forward(self, mamba_state_shape = \ self.get_mamba_state_shape_from_config( self.vllm_config, use_v1=False) + mamba_state_dtype = \ + self.get_mamba_state_dtype_from_config( + self.vllm_config) self.mamba_cache = MambaCacheManager(self.vllm_config, - self.lm_head.weight.dtype, num_mamba_layers, - *mamba_state_shape) + *mamba_state_shape, + *mamba_state_dtype) mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) else: diff --git a/vllm/model_executor/models/mamba_cache.py b/vllm/model_executor/models/mamba_cache.py index 27685c59a3ea..6b16e3ce7d98 100644 --- a/vllm/model_executor/models/mamba_cache.py +++ b/vllm/model_executor/models/mamba_cache.py @@ -24,9 +24,14 @@ def at_layer_idx(self, layer_idx): class MambaCacheManager(ConstantSizeCache): - def __init__(self, vllm_config: VllmConfig, dtype: torch.dtype, - num_mamba_layers: int, conv_state_shape: tuple[int, int], - temporal_state_shape: tuple[int, int]): + def __init__(self, vllm_config: VllmConfig, num_mamba_layers: int, + conv_state_shape: tuple[int, int], + temporal_state_shape: tuple[int, int], + conv_state_dtype: torch.dtype, + temporal_state_dtype: torch.dtype): + + self.conv_state_dtype = conv_state_dtype + self.temporal_state_dtype = temporal_state_dtype # Determine max batch size to set size of MambaCache max_batch_size = vllm_config.scheduler_config.max_num_seqs @@ -40,11 +45,11 @@ def __init__(self, vllm_config: VllmConfig, dtype: torch.dtype, assert conv_state_shape[0] > conv_state_shape[1] conv_state = torch.empty(size=(num_mamba_layers, max_batch_size) + (conv_state_shape[1], conv_state_shape[0]), - dtype=dtype, + dtype=self.conv_state_dtype, device="cuda").transpose(-1, -2) temporal_state = torch.empty(size=(num_mamba_layers, max_batch_size) + temporal_state_shape, - dtype=dtype, + dtype=self.temporal_state_dtype, device="cuda") self._mamba_cache = (conv_state, temporal_state) diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 3d14a6ad5c3a..82e96844cd5f 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -16,7 +16,8 @@ from vllm import envs from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.config import (CacheConfig, ModelConfig, VllmConfig, + get_current_vllm_config) from vllm.distributed.communication_op import 
tensor_model_parallel_all_reduce from vllm.distributed.parallel_state import ( get_pp_group, get_tensor_model_parallel_rank, @@ -36,7 +37,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.abstract import MambaBase from vllm.model_executor.layers.mamba.mamba_utils import ( - MambaStateShapeCalculator) + MambaStateDtypeCalculator, MambaStateShapeCalculator) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -338,6 +339,12 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase): def mamba_type(self) -> str: return "linear_attention" + def get_state_dtype(self) -> tuple[torch.dtype]: + return MambaStateDtypeCalculator.linear_attention_state_dtype( + self.model_config.dtype, + self.cache_config.mamba_cache_dtype, + ) + def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: return MambaStateShapeCalculator.linear_attention_state_shape( num_heads=self.num_heads, @@ -353,6 +360,8 @@ def __init__( max_position: int, block_size: int, num_hidden_layer: int, + model_config: Optional[ModelConfig] = None, + cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, layer_idx: int = 0, linear_layer_idx: int = 0, @@ -374,6 +383,8 @@ def __init__( self.tp_heads = self.total_num_heads // self.tp_size self.qkv_size = self.num_heads * self.head_dim self.tp_hidden = self.head_dim * self.tp_heads + self.model_config = model_config + self.cache_config = cache_config self.prefix = prefix self.qkv_proj = ColumnParallelLinear( @@ -657,6 +668,7 @@ class MiniMaxText01DecoderLayer(nn.Module): def __init__( self, config: MiniMaxConfig, + model_config: Optional[ModelConfig] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, expert_num: int = 1, @@ -693,6 +705,8 @@ def __init__( max_position=max_position_embeddings, block_size=config.block if hasattr(config, "block") else 256, num_hidden_layer=config.num_hidden_layers, + model_config=model_config, + cache_config=cache_config, quant_config=quant_config, layer_idx=self._ilayer, linear_layer_idx=linear_layer_id, @@ -861,6 +875,7 @@ class MiniMaxText01Model(nn.Module): def __init__( self, config: MiniMaxConfig, + model_config: Optional[ModelConfig] = None, quant_config: Optional[QuantizationConfig] = None, cache_config: Optional[CacheConfig] = None, scheduler_config=None, @@ -910,6 +925,7 @@ def layer_fn(prefix): decoder_kwargs = { "quant_config": quant_config, "layer_id": layer_idx, + "model_config": model_config, "cache_config": cache_config } @@ -1111,8 +1127,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.config.max_model_len = vllm_config.model_config.max_model_len self.model = MiniMaxText01Model( self.config, - quant_config, + model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, + quant_config=quant_config, scheduler_config=vllm_config.scheduler_config, prefix=maybe_prefix(prefix, "model")) if get_pp_group().is_last_rank: @@ -1409,6 +1426,17 @@ def load_basic_weight(name: str, loaded_weight: torch.Tensor, load_basic_weight(name, loaded_weight, self) return loaded_params + @classmethod + def get_mamba_state_dtype_from_config( + cls, + vllm_config: "VllmConfig", + ) -> tuple[torch.dtype, torch.dtype]: + + return MambaStateDtypeCalculator.linear_attention_state_dtype( + vllm_config.model_config.dtype, + 
vllm_config.cache_config.mamba_cache_dtype, + ) + @classmethod def get_mamba_state_shape_from_config( cls, diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index 08315a13853c..07cd5a4c6e24 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -26,7 +26,7 @@ from vllm import envs from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group from vllm.forward_context import get_forward_context @@ -40,7 +40,7 @@ Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 from vllm.model_executor.layers.mamba.mamba_utils import ( - MambaStateShapeCalculator) + MambaStateDtypeCalculator, MambaStateShapeCalculator) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) @@ -110,6 +110,7 @@ def __init__( self, config: NemotronHConfig, layer_idx: int, + model_config: Optional[ModelConfig] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -149,6 +150,7 @@ def __init__( self, config: NemotronHConfig, layer_idx: int, + model_config: Optional[ModelConfig] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -167,6 +169,8 @@ def __init__( head_dim=config.mamba_head_dim, rms_norm_eps=config.rms_norm_eps, activation=config.mamba_hidden_act, + model_config=model_config, + cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.mixer", ) @@ -198,6 +202,7 @@ def __init__( self, config: NemotronHConfig, layer_idx: int, + model_config: Optional[ModelConfig] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -270,6 +275,7 @@ def __init__( self, config: NemotronHConfig, layer_idx: int, + model_config: Optional[ModelConfig] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -279,6 +285,7 @@ def __init__( self.mixer = NemotronHAttention( config, layer_idx, + model_config, cache_config, quant_config, prefix=f"{prefix}.mixer", @@ -317,6 +324,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config: NemotronHConfig = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config @@ -340,6 +348,7 @@ def get_layer(prefix: str): return layer_class( config, layer_idx, + model_config, cache_config, quant_config=quant_config, prefix=prefix, @@ -478,6 +487,18 @@ class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, } embedding_padding_modules = ["lm_head"] + @classmethod + def get_mamba_state_dtype_from_config( + cls, + vllm_config: "VllmConfig", + ) -> tuple[torch.dtype, torch.dtype]: + + return MambaStateDtypeCalculator.mamba2_state_dtype( + vllm_config.model_config.dtype, + vllm_config.cache_config.mamba_cache_dtype, + 
vllm_config.cache_config.mamba_ssm_cache_dtype, + ) + @classmethod def get_mamba_state_shape_from_config( cls, @@ -569,10 +590,13 @@ def forward(self, mamba_state_shape = \ self.get_mamba_state_shape_from_config( self.vllm_config, use_v1=False) + mamba_state_dtype = \ + self.get_mamba_state_dtype_from_config( + self.vllm_config) self.mamba_cache = MambaCacheManager(self.vllm_config, - self.lm_head.weight.dtype, num_mamba_layers, - *mamba_state_shape) + *mamba_state_shape, + *mamba_state_dtype) mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index 4cb0becf302f..ed65944c109b 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -18,7 +18,7 @@ from vllm import envs from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.forward_context import get_forward_context from vllm.model_executor.layers.activation import GeluAndMul @@ -33,7 +33,7 @@ Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 from vllm.model_executor.layers.mamba.mamba_utils import ( - MambaStateShapeCalculator) + MambaStateDtypeCalculator, MambaStateShapeCalculator) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -478,6 +478,8 @@ class Zamba2MambaDecoderLayer(nn.Module): def __init__(self, config: Zamba2Config, + model_config: Optional[ModelConfig] = None, + cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "") -> None: """Initialize the Mamba decoder layer. 
@@ -502,6 +504,8 @@ def __init__(self, config.n_mamba_heads, rms_norm_eps=config.rms_norm_eps, activation="silu", + model_config=model_config, + cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.mixer") @@ -578,6 +582,8 @@ def __init__( shared_transformer: Zamba2AttentionDecoderLayer, config: Zamba2Config, block_idx: int, + model_config: Optional[ModelConfig] = None, + cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> None: @@ -596,6 +602,8 @@ def __init__( bias=False, quant_config=quant_config) self.mamba_decoder = Zamba2MambaDecoderLayer(config, + model_config=model_config, + cache_config=cache_config, quant_config=quant_config, prefix=prefix) @@ -669,6 +677,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config @@ -718,11 +727,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: Zamba2HybridLayer(block, config, block_idx, - quant_config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, prefix=prefix)) else: layers.append( Zamba2MambaDecoderLayer(config, + model_config=model_config, + cache_config=cache_config, quant_config=quant_config, prefix=prefix)) self.layers = nn.ModuleList(layers) @@ -848,6 +861,18 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): "1.weight": "B.weight", }) + @classmethod + def get_mamba_state_dtype_from_config( + cls, + vllm_config: "VllmConfig", + ) -> tuple[torch.dtype, torch.dtype]: + + return MambaStateDtypeCalculator.mamba2_state_dtype( + vllm_config.model_config.dtype, + vllm_config.cache_config.mamba_cache_dtype, + vllm_config.cache_config.mamba_ssm_cache_dtype, + ) + @classmethod def get_mamba_state_shape_from_config( cls, @@ -966,10 +991,13 @@ def forward(self, mamba_state_shape = \ self.get_mamba_state_shape_from_config( self.vllm_config, use_v1=False) + mamba_state_dtype = \ + self.get_mamba_state_dtype_from_config( + self.vllm_config) self.mamba_cache = MambaCacheManager(self.vllm_config, - self.lm_head.weight.dtype, num_mamba_layers, - *mamba_state_shape) + *mamba_state_shape, + *mamba_state_dtype) # Get cache parameters for current run mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index cae4eecc0dee..a1f8ad164762 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -173,6 +173,7 @@ RESET = '\033[0;0m' STR_DTYPE_TO_TORCH_DTYPE = { + "float32": torch.float32, "half": torch.half, "bfloat16": torch.bfloat16, "float": torch.float, diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 4ff96f9786b8..429416afa248 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -182,14 +182,15 @@ def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: @dataclass(frozen=True) class MambaSpec(KVCacheSpec): shapes: tuple[tuple[int, ...], ...] 
- dtype: torch.dtype + dtypes: tuple[torch.dtype] page_size_padded: Optional[int] = None mamba_type: str = "mamba2" @property def page_size_bytes(self) -> int: - num_elements = sum(prod(shape) for shape in self.shapes) - page_size = num_elements * get_dtype_size(self.dtype) + page_size = sum( + prod(shape) * get_dtype_size(dtype) + for (shape, dtype) in zip(self.shapes, self.dtypes)) if self.page_size_padded is not None: assert self.page_size_padded >= page_size return self.page_size_padded diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 703092ca9fee..d5325287889f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2884,23 +2884,25 @@ def _reshape_kv_cache_tensors( elif isinstance(kv_cache_spec, MambaSpec): has_mamba = True raw_tensor = kv_cache_raw_tensors[layer_name] - dtype = kv_cache_spec.dtype - num_element_per_page = (kv_cache_spec.page_size_bytes // - get_dtype_size(dtype)) state_tensors = [] - storage_offset = 0 - for shape in kv_cache_spec.shapes: + storage_offset_bytes = 0 + for (shape, dtype) in zip(kv_cache_spec.shapes, + kv_cache_spec.dtypes): + dtype_size = get_dtype_size(dtype) + num_element_per_page = ( + kv_cache_spec.page_size_bytes // dtype_size) target_shape = (num_blocks, *shape) stride = torch.empty(target_shape).stride() target_stride = (num_element_per_page, *stride[1:]) + assert storage_offset_bytes % dtype_size == 0 tensor = torch.as_strided( raw_tensor.view(dtype), size=target_shape, stride=target_stride, - storage_offset=storage_offset, + storage_offset=storage_offset_bytes // dtype_size, ) state_tensors.append(tensor) - storage_offset += stride[0] + storage_offset_bytes += stride[0] * dtype_size kv_caches[layer_name] = state_tensors else: @@ -3087,7 +3089,7 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: for layer_name, mamba_module in mamba_layers.items(): kv_cache_spec[layer_name] = MambaSpec( shapes=mamba_module.get_state_shape(), - dtype=self.kv_cache_dtype, + dtypes=mamba_module.get_state_dtype(), block_size=max_model_len, page_size_padded=page_size_padded, mamba_type=mamba_module.mamba_type) From 76c2fa8c1f5bc004120177a496ddc616f04c801c Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 15 Aug 2025 20:58:03 +0800 Subject: [PATCH 044/231] [Misc] Ignore ep_kernels_workspace (#22807) Signed-off-by: Jee Jee Li Signed-off-by: Duncan Moss --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 721dd7536bec..465935d488f8 100644 --- a/.gitignore +++ b/.gitignore @@ -207,3 +207,6 @@ shellcheck*/ # Ignore moe/marlin_moe gen code csrc/moe/marlin_moe_wna16/kernel_* + +# Ignore ep_kernels_workspace folder +ep_kernels_workspace/ \ No newline at end of file From af610df089591a9855ae7f151b9bd32d0309d0eb Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 15 Aug 2025 13:58:06 +0100 Subject: [PATCH 045/231] [CI] Remove duplicated docs build from buildkite (#22924) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Duncan Moss --- .buildkite/test-pipeline.yaml | 10 ---------- docker/Dockerfile | 11 ++++------- tests/standalone_tests/python_only_compile.sh | 2 +- 3 files changed, 5 insertions(+), 18 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 942a8d3f9bfd..04d7cdc3d885 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -31,16 +31,6 @@ steps: ##### fast check tests ##### -- 
label: Documentation Build # 2min - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/test_docs" - fast_check: true - no_gpu: True - commands: - - pip install -r ../requirements/docs.txt - # TODO: add `--strict` once warnings in docstrings are fixed - - mkdocs build - - label: Pytorch Nightly Dependency Override Check # 2min # if this test fails, it means the nightly torch version is not compatible with some # of the dependencies. Please check the error message and add the package to whitelist diff --git a/docker/Dockerfile b/docker/Dockerfile index a20a4bfb2b88..66a6e6fd6f67 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -497,14 +497,11 @@ ENV HF_HUB_ENABLE_HF_TRANSFER 1 # Copy in the v1 package for testing (it isn't distributed yet) COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1 -# doc requires source code -# we hide them inside `test_docs/` , so that this source code +# Source code is used in the `python_only_compile.sh` test +# We hide it inside `src/` so that this source code # will not be imported by other tests -RUN mkdir test_docs -RUN mv docs test_docs/ -RUN cp -r examples test_docs/ -RUN mv vllm test_docs/ -RUN mv mkdocs.yaml test_docs/ +RUN mkdir src +RUN mv vllm src/vllm #################### TEST IMAGE #################### #################### OPENAI API SERVER #################### diff --git a/tests/standalone_tests/python_only_compile.sh b/tests/standalone_tests/python_only_compile.sh index ec1bcbcc58a0..7cc5ef659649 100644 --- a/tests/standalone_tests/python_only_compile.sh +++ b/tests/standalone_tests/python_only_compile.sh @@ -10,7 +10,7 @@ cd /vllm-workspace/ # uninstall vllm pip3 uninstall -y vllm # restore the original files -mv test_docs/vllm ./vllm +mv src/vllm ./vllm # remove all compilers apt remove --purge build-essential -y From 590408257ce350f6149566c8a9cff87adede6bd5 Mon Sep 17 00:00:00 2001 From: Csrayz Date: Fri, 15 Aug 2025 21:00:20 +0800 Subject: [PATCH 046/231] [Frontend] Expose do_log_stats interval to env (#22905) Signed-off-by: Csrayz Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Duncan Moss --- docs/usage/troubleshooting.md | 1 + vllm/entrypoints/openai/api_server.py | 2 +- vllm/envs.py | 7 +++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md index 9715ad66d9b3..b92c6cef4a3f 100644 --- a/docs/usage/troubleshooting.md +++ b/docs/usage/troubleshooting.md @@ -35,6 +35,7 @@ You can check if this is happening by trying the old defaults with `--generation If other strategies don't solve the problem, it's likely that the vLLM instance is stuck somewhere. You can use the following environment variables to help debug the issue: - `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging. +- `export VLLM_LOG_STATS_INTERVAL=1.` to get log statistics more frequently for tracking running queue, waiting queue and cache hit states. - `export CUDA_LAUNCH_BLOCKING=1` to identify which CUDA kernel is causing the problem. - `export NCCL_DEBUG=TRACE` to turn on more logging for NCCL. - `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs. Do not use this flag unless absolutely needed for debugging, it will cause significant delays in startup time. 
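The api_server and envs changes that follow wire this knob into the background logging task. As a rough sketch of the pattern only (the variable name VLLM_LOG_STATS_INTERVAL and the 10-second fallback come from this patch; the helper and engine names below are illustrative and not vLLM's actual code):

import asyncio
import os


def read_log_stats_interval(default: float = 10.0) -> float:
    # Fall back to the default when the value is unset, malformed, or
    # non-positive, mirroring the guard added to vllm/envs.py below.
    try:
        value = float(os.getenv("VLLM_LOG_STATS_INTERVAL", default))
    except ValueError:
        return default
    return value if value > 0.0 else default


async def log_stats_periodically(engine) -> None:
    # `engine` stands in for any client exposing an async do_log_stats();
    # the loop sleeps for the configured interval between stat dumps.
    interval = read_log_stats_interval()
    while True:
        await asyncio.sleep(interval)
        await engine.do_log_stats()
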
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index e5d31c1fd03f..af86835a497d 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -126,7 +126,7 @@ async def lifespan(app: FastAPI): async def _force_log(): while True: - await asyncio.sleep(10.) + await asyncio.sleep(envs.VLLM_LOG_STATS_INTERVAL) await engine_client.do_log_stats() task = asyncio.create_task(_force_log()) diff --git a/vllm/envs.py b/vllm/envs.py index 2f0bafa01cc2..82084d1fc5ae 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -38,6 +38,7 @@ VLLM_LOGGING_PREFIX: str = "" VLLM_LOGGING_CONFIG_PATH: Optional[str] = None VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None + VLLM_LOG_STATS_INTERVAL: float = 10. VLLM_TRACE_FUNCTION: int = 0 VLLM_ATTENTION_BACKEND: Optional[str] = None VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None @@ -436,6 +437,12 @@ def get_vllm_port() -> Optional[int]: lambda: int(os.getenv("VLLM_LOGITS_PROCESSOR_THREADS", "0")) if "VLLM_LOGITS_PROCESSOR_THREADS" in os.environ else None, + # If set, vllm will log stats at this interval in seconds + # If not set, vllm will log stats every 10 seconds. + "VLLM_LOG_STATS_INTERVAL": + lambda: val if (val := float(os.getenv("VLLM_LOG_STATS_INTERVAL", "10."))) + > 0. else 10., + # Trace function calls # If set to 1, vllm will trace function calls # Useful for debugging From 5f4872817c2187b03564d1da9f1f682d92a1fe5c Mon Sep 17 00:00:00 2001 From: fhl2000 <63384265+fhl2000@users.noreply.github.com> Date: Fri, 15 Aug 2025 22:01:39 +0800 Subject: [PATCH 047/231] [Core] Allow full cudagraph with separate attention routines and orthogonal to compilation, add support for FA2 and FlashInfer (#20059) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: fhl <2410591650@qq.com> Signed-off-by: fhl2000 <63384265+fhl2000@users.noreply.github.com> Signed-off-by: Lucas Wilkinson Signed-off-by: Lucas Wilkinson Co-authored-by: Luka Govedič Co-authored-by: Lucas Wilkinson Co-authored-by: Lucas Wilkinson Signed-off-by: Duncan Moss --- .../compile/piecewise/test_full_cudagraph.py | 253 ++++++----- tests/compile/piecewise/test_simple.py | 33 +- tests/compile/piecewise/test_toy_llama.py | 36 +- tests/v1/cudagraph/__init__.py | 0 tests/v1/cudagraph/test_cudagraph_dispatch.py | 406 ++++++++++++++++++ tests/v1/cudagraph/test_cudagraph_mode.py | 187 ++++++++ vllm/compilation/backends.py | 42 +- vllm/compilation/base_piecewise_backend.py | 72 ---- vllm/compilation/base_static_graph.py | 54 +++ vllm/compilation/cuda_graph.py | 193 +++++++++ vllm/compilation/cuda_piecewise_backend.py | 133 +----- vllm/compilation/monitor.py | 18 + vllm/compilation/wrapper.py | 7 +- vllm/config/__init__.py | 52 ++- vllm/config/compilation.py | 188 ++++++-- vllm/forward_context.py | 52 ++- vllm/platforms/cuda.py | 13 +- vllm/platforms/interface.py | 19 +- vllm/platforms/rocm.py | 4 +- vllm/platforms/tpu.py | 12 +- vllm/platforms/xpu.py | 22 +- vllm/v1/attention/backends/flash_attn.py | 68 +-- vllm/v1/attention/backends/flashinfer.py | 13 +- vllm/v1/attention/backends/mamba2_attn.py | 8 +- vllm/v1/attention/backends/mla/common.py | 6 +- vllm/v1/attention/backends/mla/cutlass_mla.py | 2 +- vllm/v1/attention/backends/mla/flashmla.py | 11 +- .../attention/backends/mla/rocm_aiter_mla.py | 13 +- vllm/v1/attention/backends/rocm_aiter_fa.py | 5 - vllm/v1/attention/backends/triton_attn.py | 8 +- vllm/v1/attention/backends/utils.py | 24 +- vllm/v1/cudagraph_dispatcher.py | 120 ++++++ 
vllm/v1/worker/gpu_model_runner.py | 359 ++++++++++++---- vllm/v1/worker/gpu_worker.py | 5 - 34 files changed, 1840 insertions(+), 598 deletions(-) create mode 100644 tests/v1/cudagraph/__init__.py create mode 100644 tests/v1/cudagraph/test_cudagraph_dispatch.py create mode 100644 tests/v1/cudagraph/test_cudagraph_mode.py delete mode 100644 vllm/compilation/base_piecewise_backend.py create mode 100644 vllm/compilation/base_static_graph.py create mode 100644 vllm/compilation/cuda_graph.py create mode 100644 vllm/v1/cudagraph_dispatcher.py diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py index cc1a95b820a4..97140a9db7af 100644 --- a/tests/compile/piecewise/test_full_cudagraph.py +++ b/tests/compile/piecewise/test_full_cudagraph.py @@ -3,7 +3,8 @@ import contextlib import os import weakref -from contextlib import ExitStack +from dataclasses import dataclass +from typing import Optional import pytest @@ -32,69 +33,133 @@ def temporary_environ(env_vars): os.environ[k] = v -@pytest.fixture(scope="class") -def llm_pair(request): - model = request.param - - with temporary_environ({ - "VLLM_USE_V1": "1", - "VLLM_FLASH_ATTN_VERSION": "3" - }): - full = LLM( - model=model, - gpu_memory_utilization=0.45, - trust_remote_code=True, - max_model_len=1024, - compilation_config=CompilationConfig(full_cuda_graph=True), - ) - piecewise = LLM( - model=model, - gpu_memory_utilization=0.45, - trust_remote_code=True, - max_model_len=1024, - compilation_config=CompilationConfig(), - ) - - # PyTest caches the fixture values so we use weakref.proxy to enable GC - yield weakref.proxy(full), weakref.proxy(piecewise) - del full - del piecewise - - wait_for_gpu_memory_to_clear( - devices=[0], - threshold_ratio=0.1, - ) - - -@pytest.fixture(scope="class") -def cutlass_mla_llm_pair(request): - model = request.param - - # force V1 engine and Cutlass MLA backend - with temporary_environ({ +@dataclass +class BackendConfig: + name: str + env_vars: dict + comp_config: dict + specific_gpu_arch: Optional[tuple] = None + + +# Define all backend configurations of full cudagraph to be tested +backend_configs = { + # FA3 on Hopper + "FA3": + BackendConfig(name="FA3", + env_vars={"VLLM_FLASH_ATTN_VERSION": "3"}, + comp_config={ + "cudagraph_mode": "FULL", + }, + specific_gpu_arch=(9, 0)), + # FlashMLA on Hopper + "FlashMLA": + BackendConfig(name="FlashMLA", + env_vars={ + "VLLM_ATTENTION_BACKEND": "FLASHMLA", + }, + comp_config={ + "cudagraph_mode": "FULL_AND_PIECEWISE", + }, + specific_gpu_arch=(9, 0)), + # Cutlass MLA on Blackwell + "CutlassMLA": + BackendConfig( + name="CutlassMLA", + env_vars={ "VLLM_USE_V1": "1", "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA", "FORCE_NUM_KV_SPLITS": "1", # TODO: remove this when hang issue is fixed - }): + }, + comp_config={ + "cudagraph_mode": "FULL_AND_PIECEWISE", + "cudagraph_capture_sizes": [16, 32, 64, 128, 256, 512], + }, + specific_gpu_arch=(10, 0)), + # FA2 + "FA2": + BackendConfig(name="FA2", + env_vars={"VLLM_FLASH_ATTN_VERSION": "2"}, + comp_config={ + "cudagraph_mode": "FULL", + }), + # Triton Attention + "TritonAttn": + BackendConfig(name="TritonAttn", + env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1"}, + comp_config={ + "cudagraph_mode": "FULL", + }), + # FlashInfer + "FlashInfer": + BackendConfig(name="FlashInfer", + env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"}, + comp_config={ + "cudagraph_mode": "FULL_AND_PIECEWISE", + }), +} + +test_params_full_cudagraph = [] + +# deepseek-ai/DeepSeek-V2-Lite with MLA 
+MLA_backends = ["FlashMLA", "CutlassMLA"] +for mla_backend in MLA_backends: + test_params_full_cudagraph.append( + pytest.param( + ("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend]))) + +# Qwen/Qwen2-1.5B-Instruct with other backends +other_backend_configs = [ + backend_configs[c] for c in backend_configs if c not in MLA_backends +] +for backend_config in other_backend_configs: + test_params_full_cudagraph.append( + pytest.param(("Qwen/Qwen2-1.5B-Instruct", backend_config))) + + +@pytest.fixture(scope="class") +def llm_pair(request): + model, backend_config = request.param + + # Dynamically skip test if GPU capability is not met + if backend_config.specific_gpu_arch and backend_config.specific_gpu_arch\ + != current_platform.get_device_capability(): + if backend_config.specific_gpu_arch == (9, 0): + pytest.skip("Only Hopper GPUs support FA3 and FlashMLA") + elif backend_config.specific_gpu_arch == (10, 0): + pytest.skip("Only Blackwell GPUs support Cutlass MLA") + + env_vars = { + "VLLM_USE_V1": "1", + # Force native sampler to avoid potential nondeterminism in FlashInfer + # when per-request generators are not used in V1. + "VLLM_USE_FLASHINFER_SAMPLER": "0", + **backend_config.env_vars, + } + with temporary_environ(env_vars): full = LLM( model=model, - gpu_memory_utilization=0.45, + gpu_memory_utilization=0.43, trust_remote_code=True, max_model_len=1024, - compilation_config=CompilationConfig( - full_cuda_graph=True, - cudagraph_capture_sizes=[16, 32, 64, 128, 256, 512], - ), + max_num_seqs=128, + compilation_config=\ + CompilationConfig(**backend_config.comp_config), + generation_config="vllm", + seed=42, ) piecewise = LLM( model=model, - gpu_memory_utilization=0.45, + gpu_memory_utilization=0.43, trust_remote_code=True, max_model_len=1024, - compilation_config=CompilationConfig(), + max_num_seqs=128, + compilation_config=CompilationConfig(cudagraph_mode="PIECEWISE"), + generation_config="vllm", + seed=42, ) + # PyTest caches the fixture values so we use weakref.proxy to enable GC yield weakref.proxy(full), weakref.proxy(piecewise) del full del piecewise @@ -105,51 +170,7 @@ def cutlass_mla_llm_pair(request): ) -@pytest.mark.parametrize( - "cutlass_mla_llm_pair", - [ - # use an MLA model - "deepseek-ai/DeepSeek-V2-Lite", - ], - indirect=True) -@pytest.mark.skipif(current_platform.get_device_capability() != (10, 0), - reason="Only Blackwell GPUs support Cutlass MLA") -class TestFullCUDAGraphCutlassMLA: - """ - Validate full CUDA Graph with Cutlass MLA (decode-only capture). 
- """ - - @pytest.mark.parametrize(("batch_size", "max_tokens"), [ - (8, 8), - ]) - def test_full_cudagraph_sm100_cutlass_mla( - self, batch_size, max_tokens, cutlass_mla_llm_pair: tuple[LLM, - LLM]): - piecewise_llm, full_cudagraph_llm = cutlass_mla_llm_pair - - prompts = ["Hello, my name is"] * batch_size - sampling_params = SamplingParams(temperature=0.0, - max_tokens=max_tokens, - top_p=0.95) - - piecewise_responses = piecewise_llm.generate(prompts, sampling_params) - full_responses = full_cudagraph_llm.generate(prompts, sampling_params) - - for piecewise_res, full_res in zip(piecewise_responses, - full_responses): - assert piecewise_res.outputs[0].text == full_res.outputs[0].text - - -@pytest.mark.parametrize( - "llm_pair", - [ - # Model names for the llm_pair fixture - "deepseek-ai/DeepSeek-V2-Lite", - "Qwen/Qwen2-1.5B-Instruct" - ], - indirect=True) -@pytest.mark.skipif(current_platform.get_device_capability() != (9, 0), - reason="Only Hopper GPUs support FA3 and FlashMLA") +@pytest.mark.parametrize("llm_pair", test_params_full_cudagraph, indirect=True) class TestFullCUDAGraph: """ Use a class such that an llm pair is constructed once for all @@ -178,12 +199,14 @@ def test_full_cudagraph(self, batch_size, max_tokens, full cudagraph compilation works for padded cases too. """ - piecewise_llm, full_cudagraph_llm = llm_pair + full_cudagraph_llm, piecewise_llm = llm_pair - prompts = ["Hello, my name is"] * batch_size + prompts = ["the quick brown fox"] * batch_size + # Use purely greedy decoding to avoid top-p truncation sensitivity + # that can amplify tiny numeric differences across runtimes. sampling_params = SamplingParams(temperature=0.0, max_tokens=max_tokens, - top_p=0.95) + top_p=1.0) piecewise_responses = piecewise_llm.generate(prompts, sampling_params) full_responses = full_cudagraph_llm.generate(prompts, sampling_params) @@ -191,42 +214,16 @@ def test_full_cudagraph(self, batch_size, max_tokens, # Check that all responses are the same for piecewise_res, full_res in zip(piecewise_responses, full_responses): - assert piecewise_res.outputs[0].text == full_res.outputs[0].text - - -@pytest.mark.parametrize( - "model, supported", - [ - ("Qwen/Qwen2-1.5B-Instruct", True), - # MLA does not support capturing CUDA Graphs with size > max_num_seqs - ("deepseek-ai/DeepSeek-V2-Lite", False), - ]) -@pytest.mark.skipif(current_platform.get_device_capability() != (9, 0), - reason="Only Hopper GPUs support FA3 and FlashMLA") -def test_lower_max_num_seqs(model, supported): - with temporary_environ({ - "VLLM_USE_V1": "1", - "VLLM_FLASH_ATTN_VERSION": "3" - }), ExitStack() as stack: - if not supported: - stack.enter_context(pytest.raises(RuntimeError)) - - llm = LLM(model=model, - max_num_seqs=256, - trust_remote_code=True, - max_model_len=1024, - compilation_config=CompilationConfig( - full_cuda_graph=True, - cudagraph_capture_sizes=[64, 256, 512])) - llm.generate(["Hello, my name is"] * 10) + assert piecewise_res.outputs[0].text.lower() == \ + full_res.outputs[0].text.lower() @pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda") def test_full_cudagraph_with_invalid_backend(): with temporary_environ({ "VLLM_USE_V1": "1", - "VLLM_FLASH_ATTN_VERSION": - "2" #FA2 not supported with full_cuda_graph + "VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION" + # Flex_Attention is not supported with full cuda graph }), pytest.raises(RuntimeError): LLM(model="Qwen/Qwen2-1.5B-Instruct", - compilation_config=CompilationConfig(full_cuda_graph=True)) + 
compilation_config=CompilationConfig(cudagraph_mode="FULL")) diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 06ac3527e1fb..2d1a72d44ec7 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -11,10 +11,10 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile -from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, - set_current_vllm_config) +from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode, + VllmConfig, set_current_vllm_config) from vllm.envs import VLLM_USE_V1 -from vllm.forward_context import set_forward_context +from vllm.forward_context import BatchDescriptor, set_forward_context from vllm.utils import direct_register_custom_op global_counter = 0 @@ -101,16 +101,33 @@ def test_simple_piecewise_compile(use_inductor): num_backend_compilations=3, # num_piecewise_capturable_graphs_seen num_cudagraph_captured= 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen - ), set_forward_context({}, vllm_config=vllm_config): - + ), set_forward_context(None, + vllm_config=vllm_config): # background context + # warm up with background context model(inputs) - model(torch.randn(2).cuda()) - model(torch.randn(1).cuda()) + # capturing/replaying should under context of cudagraph dispatching + with set_forward_context( + None, + vllm_config=vllm_config, + cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE, + batch_descriptor=BatchDescriptor(num_tokens=2, )): + model(torch.randn(2).cuda()) + with set_forward_context( + None, + vllm_config=vllm_config, + cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE, + batch_descriptor=BatchDescriptor(num_tokens=1, )): + model(torch.randn(1).cuda()) input = torch.zeros(2).cuda() global global_counter global_counter = 0 - output = model(input) + with set_forward_context( + None, + vllm_config=vllm_config, + cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE, + batch_descriptor=BatchDescriptor(num_tokens=2, )): + output = model(input) assert global_counter == 2 assert torch.allclose(output.cpu(), torch.tensor([3., 1.])) diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index b7ed8353b3ce..bcfd0d834c5d 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -18,9 +18,9 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile -from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, - set_current_vllm_config) -from vllm.forward_context import set_forward_context +from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode, + VllmConfig, set_current_vllm_config) +from vllm.forward_context import BatchDescriptor, set_forward_context from vllm.utils import direct_register_custom_op # create a library to hold the custom op @@ -276,9 +276,11 @@ def run_model(llama_config, ) if split_attn: compilation_config.splitting_ops = ["silly.attention"] + cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE else: compilation_config = CompilationConfig( level=CompilationLevel.NO_COMPILATION, ) + cudagraph_runtime_mode = CUDAGraphMode.NONE vllm_config = VllmConfig(compilation_config=compilation_config, additional_config=llama_config) @@ -287,17 +289,37 @@ def run_model(llama_config, vllm_config=vllm_config, prefix="").eval().cuda() - with set_forward_context({}, vllm_config=vllm_config): + with 
set_forward_context({}, + vllm_config=vllm_config): # background context B = 16 # max batch size input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() positions = torch.arange(B).cuda() + # warmup for the model with cudagraph_mode NONE model(input_ids, positions) - model(input_ids[:2], positions[:2]) - model(input_ids[:1], positions[:1]) + + # simulate cudagraphs capturing + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=2, )): + model(input_ids[:2], positions[:2]) + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=1, )): + model(input_ids[:1], positions[:1]) input_ids[:2].zero_() - output = model(input_ids[:2], positions[:2]) + # simulate cudagraphs replay + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=2, )): + output = model(input_ids[:2], positions[:2]) output = output.cpu() diff --git a/tests/v1/cudagraph/__init__.py b/tests/v1/cudagraph/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py new file mode 100644 index 000000000000..64f2fa462802 --- /dev/null +++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py @@ -0,0 +1,406 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from unittest.mock import MagicMock, patch + +import pytest +import torch +import torch.nn as nn + +from tests.utils import create_new_process_for_each_test +from vllm.compilation.cuda_graph import CUDAGraphWrapper +from vllm.compilation.monitor import set_cudagraph_capturing_enabled +from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode, + ParallelConfig, SchedulerConfig, VllmConfig) +from vllm.forward_context import BatchDescriptor, set_forward_context +from vllm.platforms import current_platform +from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher + + +# Helper MLP for testing +class SimpleMLP(nn.Module): + + def __init__(self): + super().__init__() + self.fc1 = nn.Linear(10, 10) + self.fc2 = nn.Linear(10, 10) + + def forward(self, x): + return self.fc2(self.fc1(x)) + + +def _create_vllm_config(compilation_config: CompilationConfig, + max_num_seqs: int = 8) -> MagicMock: + mock_config = MagicMock(spec=VllmConfig) + mock_config.compilation_config = compilation_config + mock_config.scheduler_config = SchedulerConfig(max_num_seqs=max_num_seqs) + mock_config.parallel_config = ParallelConfig() + + # Mimic the behavior of VllmConfig.__post_init__() + if compilation_config.level == CompilationLevel.PIECEWISE: + compilation_config.set_splitting_ops_for_v1() + + return mock_config + + +class TestCudagraphDispatcher: + + @pytest.mark.parametrize( + "params", + [ + # Test case 0: Full CG for mixed batches, no separate routine + { + "case_id": 0, + "cudagraph_mode": "FULL", + "compilation_level": CompilationLevel.NO_COMPILATION, + }, + # Test case 1: Full CG for uniform batches, piecewise for mixed + { + "case_id": 1, + "cudagraph_mode": "FULL_AND_PIECEWISE", + "compilation_level": CompilationLevel.PIECEWISE, + }, + # Test case 2: Full CG for uniform batches, no CG for mixed + { + "case_id": 2, + "cudagraph_mode": "FULL_DECODE_ONLY", + "compilation_level": CompilationLevel.NO_COMPILATION, 
+ }, + # Test case 3: Piecewise for all + { + "case_id": 3, + "cudagraph_mode": "PIECEWISE", + "compilation_level": CompilationLevel.PIECEWISE, + }, + ]) + def test_dispatcher(self, params): + # Setup dispatcher + comp_config = CompilationConfig( + cudagraph_mode=params["cudagraph_mode"], + level=params["compilation_level"], + cudagraph_capture_sizes=[1, 8]) + + config = _create_vllm_config(comp_config, max_num_seqs=8) + dispatcher = CudagraphDispatcher(config) + dispatcher.initialize_cudagraph_keys( + cudagraph_mode=comp_config.cudagraph_mode, + uniform_decode_query_len=1) + + # Verify the key is initialized correctly + if params["cudagraph_mode"] in ["FULL_AND_PIECEWISE", "PIECEWISE"]: + assert len(dispatcher.cudagraph_keys[CUDAGraphMode.PIECEWISE]) == 2 + else: + assert len(dispatcher.cudagraph_keys[CUDAGraphMode.PIECEWISE]) == 0 + if params["cudagraph_mode"] not in ["NONE", "PIECEWISE"]: + assert len(dispatcher.cudagraph_keys[CUDAGraphMode.FULL]) == 2 + else: + assert len(dispatcher.cudagraph_keys[CUDAGraphMode.FULL]) == 0 + + # Test dispatch logic + # 1. non-uniform batch, size in cudagraph size list + desc_full_exact = BatchDescriptor(num_tokens=8, uniform_decode=False) + rt_mode, key = dispatcher.dispatch(desc_full_exact) + if params["cudagraph_mode"] == "FULL": + assert rt_mode == CUDAGraphMode.FULL + assert key == desc_full_exact + elif params["cudagraph_mode"] in ["FULL_AND_PIECEWISE", "PIECEWISE"]: + assert rt_mode == CUDAGraphMode.PIECEWISE + assert key == desc_full_exact + else: + assert rt_mode == CUDAGraphMode.NONE + + # 2. uniform decode batch, size in cudagraph size list + desc_uniform_exact = BatchDescriptor(num_tokens=8, uniform_decode=True) + rt_mode, key = dispatcher.dispatch(desc_uniform_exact) + if params["cudagraph_mode"] == "FULL": + assert rt_mode == CUDAGraphMode.FULL + assert key == desc_uniform_exact.non_uniform + elif params["cudagraph_mode"] in [ + "FULL_DECODE_ONLY", "FULL_AND_PIECEWISE" + ]: + assert rt_mode == CUDAGraphMode.FULL + assert key == desc_uniform_exact + elif params["cudagraph_mode"] == "PIECEWISE": + assert rt_mode == CUDAGraphMode.PIECEWISE + assert key == desc_uniform_exact.non_uniform + else: + assert rt_mode == CUDAGraphMode.NONE + + # 3. No key match + desc_no_match = BatchDescriptor(num_tokens=15, uniform_decode=False) + rt_mode, key = dispatcher.dispatch(desc_no_match) + assert rt_mode == CUDAGraphMode.NONE + assert key is None + + +@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda") +class TestCUDAGraphWrapper: + + def setup_method(self): + self.vllm_config = _create_vllm_config(CompilationConfig()) + self.model = SimpleMLP().to("cuda") + self.persistent_input_buffer = torch.zeros(1, 10, device="cuda") + self.input_tensor = torch.randn(1, 10, device="cuda") + + @create_new_process_for_each_test("spawn") + def test_capture_and_replay(self): + wrapper = CUDAGraphWrapper(self.model, + self.vllm_config, + runtime_mode=CUDAGraphMode.FULL) + batch_descriptor = BatchDescriptor(num_tokens=10) + + # 0. global warmup + with set_forward_context(attn_metadata=None, + vllm_config=self.vllm_config, + cudagraph_runtime_mode=CUDAGraphMode.NONE, + batch_descriptor=None): + wrapper(self.input_tensor) + + # 1. 
Capture + with set_forward_context( + attn_metadata=None, + vllm_config=self.vllm_config, + cudagraph_runtime_mode=CUDAGraphMode.FULL, + batch_descriptor=batch_descriptor),\ + patch("torch.cuda.graph", + wraps=torch.cuda.graph) as mock_cuda_graph: + output1 = wrapper(self.input_tensor) + # capturing phase should generate a zero output + assert torch.allclose(output1, torch.zeros_like(output1)) + mock_cuda_graph.assert_called_once() + + assert batch_descriptor in wrapper.concrete_cudagraph_entries + entry = wrapper.concrete_cudagraph_entries[batch_descriptor] + assert entry.cudagraph is not None + + # 2. Replay + with set_forward_context( + attn_metadata=None, + vllm_config=self.vllm_config, + cudagraph_runtime_mode=CUDAGraphMode.FULL, + batch_descriptor=batch_descriptor),\ + patch.object(entry.cudagraph, 'replay', + wraps=entry.cudagraph.replay) as mock_replay: + output2 = wrapper(self.input_tensor) + mock_replay.assert_called_once() + + # Compare with eager output + eager_output = self.model(self.input_tensor) + torch.testing.assert_close(eager_output, output2) + + @create_new_process_for_each_test("spawn") + def test_bypass_on_mode_mismatch(self): + wrapper = CUDAGraphWrapper(self.model, + self.vllm_config, + runtime_mode=CUDAGraphMode.FULL) + batch_descriptor = BatchDescriptor(num_tokens=10) + + with set_forward_context( + attn_metadata=None, + vllm_config=self.vllm_config, + cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE, + batch_descriptor=batch_descriptor), \ + patch('torch.cuda.graph', + wraps=torch.cuda.graph) as mock_cuda_graph, \ + patch.object(self.model, 'forward', + wraps=self.model.forward) as mock_forward: + wrapper(self.input_tensor) + mock_cuda_graph.assert_not_called() + mock_forward.assert_called_once() + assert not wrapper.concrete_cudagraph_entries + + @create_new_process_for_each_test("spawn") + def test_bypass_on_mode_none(self): + wrapper = CUDAGraphWrapper(self.model, + self.vllm_config, + runtime_mode=CUDAGraphMode.FULL) + batch_descriptor = BatchDescriptor(num_tokens=10) + + with set_forward_context( + attn_metadata=None, + vllm_config=self.vllm_config, + cudagraph_runtime_mode=CUDAGraphMode.NONE, + batch_descriptor=batch_descriptor), \ + patch('torch.cuda.graph', + wraps=torch.cuda.graph) as mock_cuda_graph: + wrapper(self.input_tensor) + mock_cuda_graph.assert_not_called() + assert not wrapper.concrete_cudagraph_entries + + +@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda") +class TestCudagraphIntegration: + + def setup_method(self): + # only FULL mode for non-uniform batches + self.comp_config = CompilationConfig(level=CompilationLevel.PIECEWISE, + cudagraph_mode="FULL", + cudagraph_capture_sizes=[10, 20]) + self.vllm_config = _create_vllm_config(self.comp_config) + self.dispatcher = CudagraphDispatcher(self.vllm_config) + self.dispatcher.initialize_cudagraph_keys( + self.comp_config.cudagraph_mode, uniform_decode_query_len=1) + + def _run_and_monitor_call(self, wrapper, input_tensor, runtime_mode, + batch_descriptor): + """Helper to run a single call and monitor the action.""" + + with patch('torch.cuda.graph', + wraps=torch.cuda.graph) as mock_graph_context, \ + patch.object(wrapper, 'runnable', + wraps=wrapper.runnable) as mock_runnable: + + entry = wrapper.concrete_cudagraph_entries.get( + batch_descriptor, None) + + context = set_forward_context(attn_metadata=None, + vllm_config=self.vllm_config, + cudagraph_runtime_mode=runtime_mode, + batch_descriptor=batch_descriptor) + mock_replay = MagicMock() + if entry and 
entry.cudagraph: + with context, \ + patch.object(entry.cudagraph, 'replay', + new_callable=MagicMock) as mock_replay: + wrapper(input_tensor) + else: + with context: + wrapper(input_tensor) + + if mock_graph_context.called: + # note that this is globally mocked, so it will be detected + # even whether called by the inner or outer wrapper + return "capture_global" + if mock_replay.called: + # only for outer wrapper + return "replay" + if mock_runnable.call_count > 0: + # only for outer wrapper + return "bypass" + return "unknown" + + @create_new_process_for_each_test("spawn") + def test_capture_replay_bypass_logic(self): + model = SimpleMLP().to("cuda") + full_wrapper = CUDAGraphWrapper(model, self.vllm_config, + CUDAGraphMode.FULL) + max_bs = 16 + persistent_input_buffer = torch.zeros(max_bs, 10, device="cuda") + input_1 = persistent_input_buffer[:1] + input_2 = persistent_input_buffer[:2] + input_3 = persistent_input_buffer[:3] + + desc_1 = BatchDescriptor(num_tokens=1) + desc_2 = BatchDescriptor(num_tokens=2) + desc_3_unseen = BatchDescriptor(num_tokens=3) + + # 0. global warmup + with set_forward_context(attn_metadata=None, + vllm_config=self.vllm_config, + cudagraph_runtime_mode=CUDAGraphMode.NONE, + batch_descriptor=None): + full_wrapper(input_1) + + rt_mode, key = self.dispatcher.dispatch(desc_1) + # 1. Capture first shape + action = self._run_and_monitor_call(full_wrapper, input_1, rt_mode, + key) + assert action == "capture_global" + + # 2. Replay first shape + action = self._run_and_monitor_call(full_wrapper, input_1, rt_mode, + key) + assert action == "replay" + + rt_mode, key = self.dispatcher.dispatch(desc_2) + # 3. Capture second shape + action = self._run_and_monitor_call(full_wrapper, input_2, rt_mode, + key) + assert action == "capture_global" + + # 4. Replay second shape + action = self._run_and_monitor_call(full_wrapper, input_2, + CUDAGraphMode.FULL, desc_2) + assert action == "replay" + + # 5. Bypass if no key match + rt_mode, key = self.dispatcher.dispatch(desc_3_unseen) + assert rt_mode == CUDAGraphMode.NONE + action = self._run_and_monitor_call(full_wrapper, input_3, rt_mode, + key) + assert action == "bypass" + + # capture unseen shape is not allowed after disable + set_cudagraph_capturing_enabled(False) + with pytest.raises(RuntimeError): + self._run_and_monitor_call(full_wrapper, input_3, + CUDAGraphMode.FULL, desc_3_unseen) + set_cudagraph_capturing_enabled(True) + + @create_new_process_for_each_test("spawn") + def test_nested_wrappers(self): + """Tests a scenario with a PIECEWISE wrapper inside a FULL one.""" + model = SimpleMLP().to("cuda") + full_wrapper = CUDAGraphWrapper(model, self.vllm_config, + CUDAGraphMode.FULL) + input_1 = torch.randn(1, 10, device="cuda") + + # Setup: Inner model is wrapped with PIECEWISE, outer with FULL + inner_model = SimpleMLP().to("cuda") + piecewise_wrapper = CUDAGraphWrapper(inner_model, self.vllm_config, + CUDAGraphMode.PIECEWISE) + inner_model.forward = MagicMock(wraps=inner_model.forward) + outer_model = SimpleMLP().to("cuda") + # When outer model is called, it calls the piecewise_wrapper + outer_model.forward = MagicMock(wraps=outer_model.forward, + side_effect=piecewise_wrapper) + full_wrapper = CUDAGraphWrapper(outer_model, self.vllm_config, + CUDAGraphMode.FULL) + + desc_1 = BatchDescriptor(num_tokens=1) + + # 0. 
global warmup + with set_forward_context(attn_metadata=None, + vllm_config=self.vllm_config, + cudagraph_runtime_mode=CUDAGraphMode.NONE, + batch_descriptor=None): + full_wrapper(input_1) + + # --- Test runtime mode FULL--- + # Run with FULL mode context. Expect outer wrapper to capture. + # The inner mock should be called once inside the graph capture. + outer_model.forward.reset_mock() + inner_model.forward.reset_mock() + action = self._run_and_monitor_call(full_wrapper, input_1, + CUDAGraphMode.FULL, desc_1) + assert action == "capture_global" + assert outer_model.forward.call_count == 1 + assert inner_model.forward.call_count == 1 + + # Run again. Expect outer wrapper to replay. + # The outer model should NOT be called because the whole graph + # is replayed. + action = self._run_and_monitor_call(full_wrapper, input_1, + CUDAGraphMode.FULL, desc_1) + assert action == "replay" + assert outer_model.forward.call_count == 1 # No new call + assert inner_model.forward.call_count == 1 + + # --- Test runtime mode PIECEWISE --- + outer_model.forward.reset_mock() + inner_model.forward.reset_mock() + # Run with PIECEWISE mode context. + # Expect outer wrapper to bypass and call inner wrapper. + # Inner wrapper should capture. + action = self._run_and_monitor_call(full_wrapper, input_1, + CUDAGraphMode.PIECEWISE, desc_1) + assert action == "capture_global" + assert outer_model.forward.call_count == 1 + assert inner_model.forward.call_count == 1 + + # Run again with PIECEWISE. + # Outer bypasses, inner replays. + action = self._run_and_monitor_call(full_wrapper, input_1, + CUDAGraphMode.PIECEWISE, desc_1) + assert action == "bypass" + assert outer_model.forward.call_count == 2 + assert inner_model.forward.call_count == 1 diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py new file mode 100644 index 000000000000..81655e417500 --- /dev/null +++ b/tests/v1/cudagraph/test_cudagraph_mode.py @@ -0,0 +1,187 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import contextlib +import os +import weakref +from contextlib import ExitStack +from dataclasses import dataclass +from typing import Optional + +import pytest + +from tests.utils import wait_for_gpu_memory_to_clear +from vllm import LLM +from vllm.config import CompilationConfig +from vllm.platforms import current_platform + + +@contextlib.contextmanager +def temporary_environ(env_vars): + """ + Temporarily set environment variables and restore them afterward. + We have to do this vs monkeypatch because monkeypatch doesn't work + with "module" scoped fixtures. 
+ """ + original_env = {k: os.environ.get(k) for k in env_vars} + try: + os.environ.update(env_vars) + yield + finally: + for k, v in original_env.items(): + if v is None: + os.environ.pop(k, None) + else: + os.environ[k] = v + + +@dataclass +class BackendConfig: + name: str + env_vars: dict + comp_config: dict + specific_gpu_arch: Optional[tuple] = None + + +# Define all backend configurations of full cudagraph to be tested +backend_configs = { + # FA3 on Hopper + "FA3": + BackendConfig(name="FA3", + env_vars={"VLLM_FLASH_ATTN_VERSION": "3"}, + comp_config={ + "cudagraph_mode": "FULL", + }, + specific_gpu_arch=(9, 0)), + # FlashMLA on Hopper + "FlashMLA": + BackendConfig(name="FlashMLA", + env_vars={ + "VLLM_ATTENTION_BACKEND": "FLASHMLA", + }, + comp_config={ + "cudagraph_mode": "FULL_AND_PIECEWISE", + }, + specific_gpu_arch=(9, 0)), + # FA2 + "FA2": + BackendConfig(name="FA2", + env_vars={"VLLM_FLASH_ATTN_VERSION": "2"}, + comp_config={ + "cudagraph_mode": "FULL_AND_PIECEWISE", + }), + # Triton Attention + "TritonAttn": + BackendConfig(name="TritonAttn", + env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1"}, + comp_config={ + "cudagraph_mode": "FULL_AND_PIECEWISE", + }), + # FlashInfer + "FlashInfer": + BackendConfig(name="FlashInfer", + env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"}, + comp_config={ + "cudagraph_mode": "FULL_AND_PIECEWISE", + }), +} + +# test attention backend and cudagraph_mode combo +# (backend_name, cudagraph_mode, supported) +combo_cases_1 = [ + ("FA3", "FULL", True), + ("FA3", "FULL_AND_PIECEWISE", True), + ("FA2", "FULL", True), # Should fallback to FULL_AND_PIECEWISE + ("FA2", "FULL_AND_PIECEWISE", True), + ("FlashInfer", "FULL", True), # Should fallback to FULL_AND_PIECEWISE + ("FlashInfer", "FULL_AND_PIECEWISE", True), +] + + +@pytest.mark.parametrize("combo_case", combo_cases_1) +def test_backend_and_cudagraph_mode_combo(combo_case): + backend_name, cudagraph_mode, supported = combo_case + if backend_name == "FlashInfer": + try: + import flashinfer # noqa: F401 + except ImportError: + pytest.skip("FlashInfer is not installed") + backend_config = backend_configs[backend_name] + # Dynamically skip test if GPU capability is not met + if backend_config.specific_gpu_arch and backend_config.specific_gpu_arch\ + != current_platform.get_device_capability(): + pytest.skip("Only Hopper GPUs support FA3 and FlashMLA") + + env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars} + + with temporary_environ(env_vars), ExitStack() as stack: + if not supported: + stack.enter_context(pytest.raises(Exception)) + + llm = LLM(model="Qwen/Qwen2-1.5B-Instruct", + max_num_seqs=256, + trust_remote_code=True, + gpu_memory_utilization=0.45, + max_model_len=1024, + compilation_config=CompilationConfig( + level=3, cudagraph_mode=cudagraph_mode)) + llm.generate(["Hello, my name is"] * 10) + + try: + llm = weakref.proxy(llm) + del llm + except UnboundLocalError: + pass + + wait_for_gpu_memory_to_clear( + devices=[0], + threshold_ratio=0.1, + ) + + +# test cudagraph_mode with different compilation level. 
+# (backend_name, cudagraph_mode, compilation_level, supported) +combo_cases_2 = [ + ("FA2", "FULL", 0, True), # no compilation + full cudagraph + ("FA2", "FULL", 3, True), # piecewise compilation + full cudagraph + ("FA2", "PIECEWISE", 0, False), # no compilation + piecewise cudagraph + ("FA2", "PIECEWISE", 3, + True), # piecewise compilation + piecewise cudagraph + ("FA2", "FULL_AND_PIECEWISE", 0, + False), # piecewise cudagraph not supported without piecewise compilation + ("FA2", "FULL_AND_PIECEWISE", 3, True), + ("FA2", "FULL_DECODE_ONLY", 0, True), + ("FA2", "FULL_DECODE_ONLY", 3, True), + ("FA2", "NONE", 0, True), # no compilation + no cudagraph + ("FA2", "NONE", 3, True), # piecewise compilation + no cudagraph +] + + +@pytest.mark.parametrize("combo_case", combo_cases_2) +def test_cudagraph_compilation_combo(combo_case): + backend_name, cudagraph_mode, compilation_level, supported\ + = combo_case + + env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars} + + with temporary_environ(env_vars), ExitStack() as stack: + if not supported: + stack.enter_context(pytest.raises(Exception)) + + llm = LLM(model="Qwen/Qwen2-1.5B-Instruct", + max_num_seqs=256, + trust_remote_code=True, + gpu_memory_utilization=0.45, + max_model_len=1024, + compilation_config=CompilationConfig( + level=compilation_level, cudagraph_mode=cudagraph_mode)) + llm.generate(["Hello, my name is"] * 10) + try: + llm = weakref.proxy(llm) + del llm + except UnboundLocalError: + pass + finally: + wait_for_gpu_memory_to_clear( + devices=[0], + threshold_ratio=0.1, + ) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 673fb5866234..059e7a3b2976 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -15,7 +15,7 @@ from torch._dispatch.python import enable_python_dispatcher import vllm.envs as envs -from vllm.config import CompilationConfig, VllmConfig +from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname @@ -277,9 +277,6 @@ def split_graph(graph: fx.GraphModule, return split_gm, outputs -# we share the global graph pool among all the backends -global_graph_pool = None - compilation_start_time = 0.0 @@ -339,14 +336,37 @@ def call_module(self, target: torch.fx.node.Target, graph_index=index, num_graphs=len(self.compile_submod_names), runtime_shape=None) + # Lazy import here to avoid circular import + from .cuda_graph import CUDAGraphOptions + from .cuda_piecewise_backend import PiecewiseBackend - piecewise_backend = resolve_obj_by_qualname( - current_platform.get_piecewise_backend_cls()) - self.module.__dict__[target] = piecewise_backend( - submod, self.vllm_config, self.graph_pool, index, + piecewise_backend = PiecewiseBackend( + submod, self.vllm_config, index, len(self.compile_submod_names), sym_shape_indices, compiled_graph_for_dynamic_shape, self.vllm_backend) + if self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE: + # resolve the static graph wrapper class (e.g. CUDAGraphWrapper + # class) as platform dependent. + static_graph_wrapper_class = resolve_obj_by_qualname( + current_platform.get_static_graph_wrapper_cls()) + + # Always assign PIECEWISE runtime mode to the + # CUDAGraphWrapper for piecewise_backend, to distinguish + # it from the FULL cudagraph runtime mode, no matter it + # is wrapped on a full or piecewise fx graph. 
+ self.module.__dict__[target] = static_graph_wrapper_class( + runnable=piecewise_backend, + vllm_config=self.vllm_config, + runtime_mode=CUDAGraphMode.PIECEWISE, + graph_pool=self.graph_pool, + cudagraph_options=CUDAGraphOptions( + debug_log_enable=piecewise_backend.is_first_graph, + gc_disable=not piecewise_backend.is_first_graph, + weak_ref_output=piecewise_backend.is_last_graph)) + else: + self.module.__dict__[target] = piecewise_backend + compilation_counter.num_piecewise_capturable_graphs_seen += 1 return output @@ -413,9 +433,7 @@ def __init__( # them, e.g. backbone (default), eagle_head, etc. self.prefix = prefix or model_tag - global global_graph_pool - if global_graph_pool is None: - global_graph_pool = current_platform.graph_pool_handle() + global_graph_pool = current_platform.get_global_graph_pool() # TODO: in the future, if we want to use multiple # streams, it might not be safe to share a global pool. @@ -585,7 +603,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: self._called = True - if not self.compilation_config.use_cudagraph or \ + if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE or \ not self.compilation_config.cudagraph_copy_inputs: return self.split_gm diff --git a/vllm/compilation/base_piecewise_backend.py b/vllm/compilation/base_piecewise_backend.py deleted file mode 100644 index 4d7aeeb4d03e..000000000000 --- a/vllm/compilation/base_piecewise_backend.py +++ /dev/null @@ -1,72 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Any, Callable, Protocol - -import torch.fx as fx - -from vllm.compilation.backends import VllmBackend -from vllm.config import VllmConfig - - -class AbstractPiecewiseBackend(Protocol): - """ - PiecewiseBackend interface that allows platforms to extend - piecewise static graph. - """ - - def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, - graph_pool: Any, piecewise_compile_index: int, - total_piecewise_compiles: int, sym_shape_indices: list[int], - compiled_graph_for_general_shape: Callable, - vllm_backend: VllmBackend, **kwargs): - """ - Initializes the PiecewiseBackend class with compilation and - execution-related configurations. - - This class handles piecewise compilation, graph capturing, - and dispatching for specific input shapes. - - Args: - graph (fx.GraphModule): The graph represented in fx. - vllm_config (VllmConfig): Global configuration for vLLM. - graph_pool (Any): - Graph memory pool handle, e.g., - `torch.cuda.graph_pool_handle()`. - piecewise_compile_index (int): - Index of the current piecewise subgraph. - total_piecewise_compiles (int): - Total number of piecewise-compiled graphs. - sym_shape_indices (list[int]): - Indices of symbolic shape. - compiled_graph_for_general_shape (Callable): - Callable that executes the graph compiled for general shapes. - vllm_backend (VllmBackend): - Backend compiler that manages compilation and graph runtime - for vLLM. - - Keyword Args: - kwargs: Additional keyword arguments reserved for future - extensions or custom platforms. - """ - raise NotImplementedError - - def __call__(self, *args) -> Any: - """Executes the compiled graph for given input args. - - If this is the first invocation, executes the general compiled graph - and initiates the compilation process tracking. For subsequent calls, - dynamically dispatches execution to either a compiled graph or a static - graph based on the input shape. 
- - Args: - *args: Variable length input arguments to be passed into the - graph. The symbolic shape is expected to be in position - `sym_shape_indices[0]`. - - Returns: - Any: Output of the executed graph. This can be from the general - compiled graph, a specialized compiled version for the given shape, - or a replayed static graph. - """ - raise NotImplementedError diff --git a/vllm/compilation/base_static_graph.py b/vllm/compilation/base_static_graph.py new file mode 100644 index 000000000000..1c3f52c533b1 --- /dev/null +++ b/vllm/compilation/base_static_graph.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Any, Callable, Protocol + +from vllm.config import CUDAGraphMode, VllmConfig + + +class AbstractStaticGraphWrapper(Protocol): + """ + StaticGraphWrapper interface that allows platforms to wrap a callable + to be captured as a static graph. + """ + + def __init__(self, runnable: Callable, vllm_config: VllmConfig, + runtime_mode: CUDAGraphMode, graph_pool: Any, **kwargs): + """ + Initializes the StaticGraphWrapper class with graph capturing and + execution-related configurations. + + Args: + runnable (Callable): The callable to be wrapped and captured. + vllm_config (VllmConfig): Global configuration for vLLM. + runtime_mode (CUDAGraphMode): The style of the static + graph runtime. See CUDAGraphMode in vllm/config.py. + Note that only the subset enum `NONE`, `PIECEWISE` and `FULL` + are used as concrete runtime mode for cudagraph dispatching. + graph_pool (Any): + Graph memory pool handle, e.g., + `torch.cuda.graph_pool_handle()`. + Keyword Args: + kwargs: Additional keyword arguments for platform-specific + configurations. + """ + raise NotImplementedError + + def __call__(self, *args, **kwargs) -> Any: + """ + Executes the wrapped callable. + + If the current runtime mode in the ForwardContext matches the runtime + mode of this instance, it replays the CUDAGraph or captures it using + the callable if it hasn't been captured yet. Otherwise, it calls the + original callable directly. + + Args: + *args: Variable length input arguments to be passed into the + callable. + **kwargs: Keyword arguments to be passed into the callable. + + Returns: + Any: Output of the executed callable. 
+ """ + raise NotImplementedError diff --git a/vllm/compilation/cuda_graph.py b/vllm/compilation/cuda_graph.py new file mode 100644 index 000000000000..65a38197ad4e --- /dev/null +++ b/vllm/compilation/cuda_graph.py @@ -0,0 +1,193 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import dataclasses +from contextlib import ExitStack +from typing import Any, Callable, Optional +from unittest.mock import patch + +import torch + +import vllm.envs as envs +from vllm.compilation.counter import compilation_counter +from vllm.compilation.monitor import validate_cudagraph_capturing_enabled +from vllm.config import CUDAGraphMode, VllmConfig +from vllm.forward_context import BatchDescriptor, get_forward_context +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils import weak_ref_tensors + +logger = init_logger(__name__) + + +@dataclasses.dataclass +class CUDAGraphEntry: + batch_descriptor: BatchDescriptor + cudagraph: Optional[torch.cuda.CUDAGraph] = None + output: Optional[Any] = None + + # for cudagraph debugging, track the input addresses + # during capture, and check if they are the same during replay + input_addresses: Optional[list[int]] = None + + +@dataclasses.dataclass +class CUDAGraphOptions: + debug_log_enable: bool = True + gc_disable: bool = False + weak_ref_output: bool = True + + +class CUDAGraphWrapper: + """Wraps a runnable to add CUDA graph capturing and replaying ability. And + provide attribute access to the underlying `runnable` via `__getattr__`. + + The workflow of this wrapper in the cudagraph dispatching is as follows: + 1. At initialization, a runtime mode is assigned to the wrapper (FULL or + PIECEWISE). + 2. At runtime, the wrapper receives a runtime_mode and a + batch_descriptor(key) from the forward context and blindly trust them + for cudagraph dispatching. + 3. If runtime_mode is NONE or runtime_mode does not match the mode of the + wrapper, just call the runnable directly. + 4. Otherwise, i.e., the runtime_mode matches the mode of the wrapper, + the wrapper will perform cudagraph capture(if key does not exist, create + a new entry and cache it) or replay (if key exists in the cache). + + Note: CUDAGraphWrapper does not store persistent buffers or copy any + runtime inputs into that buffers for replay. We assume implementing them + is done outside of the wrapper. That is because we do not make any + assumption on the dynamic shape (batch size) of the runtime inputs, as a + trade-off for staying orthogonal to compilation logic. Nevertheless, + tracing and checking the input addresses to be consistent during replay is + guaranteed when VLLM_LOGGING_LEVEL == "DEBUG". + """ + + def __init__(self, + runnable: Callable, + vllm_config: VllmConfig, + runtime_mode: CUDAGraphMode, + graph_pool: Any = None, + cudagraph_options: Optional[CUDAGraphOptions] = None): + self.runnable = runnable + self.vllm_config = vllm_config + self.graph_pool = graph_pool + self.runtime_mode = runtime_mode + self.compilation_config = vllm_config.compilation_config + + self.first_run_finished = False + self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG" + + # assert runtime_mode is not NONE(no cudagraph), otherwise, we don't + # need to initialize a CUDAGraphWrapper. 
+        assert self.runtime_mode != CUDAGraphMode.NONE
+        if self.graph_pool is None:
+            self.graph_pool = current_platform.get_global_graph_pool()
+
+        if cudagraph_options is None:
+            cudagraph_options = CUDAGraphOptions()
+        self.cudagraph_options = cudagraph_options
+        # the entries for different batch descriptors that we need to capture
+        # cudagraphs for.
+        self.concrete_cudagraph_entries: dict[BatchDescriptor, CUDAGraphEntry]\
+            = {}
+
+    def __getattr__(self, key: str):
+        # allow accessing the attributes of the runnable.
+        if hasattr(self.runnable, key):
+            return getattr(self.runnable, key)
+        raise AttributeError(f"Attribute {key} does not exist in the runnable "
+                             f"of cudagraph wrapper: {self.runnable}")
+
+    def unwrap(self) -> Callable:
+        # in case we need to access the original runnable.
+        return self.runnable
+
+    def __call__(self, *args, **kwargs):
+        forward_context = get_forward_context()
+        batch_descriptor = forward_context.batch_descriptor
+        cudagraph_runtime_mode = forward_context.cudagraph_runtime_mode
+
+        if cudagraph_runtime_mode == CUDAGraphMode.NONE or \
+            cudagraph_runtime_mode != self.runtime_mode:
+            # CUDAGraphMode.NONE could mean the profile run, a warmup run, or
+            # running without cudagraphs.
+            # We do not trigger capture/replay if the runtime mode does not
+            # match. This enables properly dispatching to the correct
+            # CUDAGraphWrapper when nesting multiple instances with different
+            # runtime modes.
+            return self.runnable(*args, **kwargs)
+
+        if batch_descriptor not in self.concrete_cudagraph_entries:
+            # create a new entry for this batch descriptor
+            self.concrete_cudagraph_entries[batch_descriptor] = \
+                CUDAGraphEntry(batch_descriptor=batch_descriptor)
+
+        entry = self.concrete_cudagraph_entries[batch_descriptor]
+
+        if entry.cudagraph is None:
+            if self.cudagraph_options.debug_log_enable:
+                # Since we capture cudagraph for many different shapes and
+                # capturing is fast, we don't need to log it for every
+                # shape. E.g. we only log it for the first subgraph in
+                # piecewise mode.
+                logger.debug("Capturing a cudagraph on (%s,%s)",
+                             self.runtime_mode.name, entry.batch_descriptor)
+            # validate that cudagraph capturing is legal at this point.
+            validate_cudagraph_capturing_enabled()
+
+            input_addresses = [
+                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
+            ]
+            entry.input_addresses = input_addresses
+            cudagraph = torch.cuda.CUDAGraph()
+
+            with ExitStack() as stack:
+                if self.cudagraph_options.gc_disable:
+                    # during every model forward for piecewise cudagraph
+                    # mode, we will capture many pieces of cudagraphs
+                    # (roughly one per layer). running gc again and again
+                    # across layers will make the cudagraph capture very slow.
+                    # therefore, we only run gc for the first graph,
+                    # and disable gc for the rest of the graphs.
+                    stack.enter_context(patch("gc.collect", lambda: None))
+                    stack.enter_context(
+                        patch("torch.cuda.empty_cache", lambda: None))
+
+                # mind-exploding: carefully manage the reference and memory.
+                with torch.cuda.graph(cudagraph, pool=self.graph_pool):
+                    # `output` is managed by pytorch's cudagraph pool
+                    output = self.runnable(*args, **kwargs)
+                    if self.cudagraph_options.weak_ref_output:
+                        # by converting it to weak ref,
+                        # the original `output` will immediately be released
+                        # to save memory. It is only safe to do this for
+                        # the last graph in piecewise cudagraph mode, because
+                        # the output of the last graph will not be used by
+                        # any other cuda graph.
+ output = weak_ref_tensors(output) + + # here we always use weak ref for the output + # to save memory + entry.output = weak_ref_tensors(output) + entry.cudagraph = cudagraph + + compilation_counter.num_cudagraph_captured += 1 + + # important: we need to return the output, rather than + # the weak ref of the output, so that pytorch can correctly + # manage the memory during cuda graph capture + return output + + if self.is_debugging_mode: + # check if the input addresses are the same + new_input_addresses = [ + x.data_ptr() for x in args if isinstance(x, torch.Tensor) + ] + assert new_input_addresses == entry.input_addresses, ( + f"Input addresses for cudagraphs are different " + f"during replay. Expected {entry.input_addresses}, " + f"got {new_input_addresses}") + + entry.cudagraph.replay() + return entry.output diff --git a/vllm/compilation/cuda_piecewise_backend.py b/vllm/compilation/cuda_piecewise_backend.py index 8c49ea6cc107..ae26e9f1bf2b 100644 --- a/vllm/compilation/cuda_piecewise_backend.py +++ b/vllm/compilation/cuda_piecewise_backend.py @@ -2,21 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses -from contextlib import ExitStack -from typing import Any, Callable, Optional -from unittest.mock import patch +from typing import Any, Callable -import torch import torch.fx as fx import vllm.envs as envs from vllm.compilation.backends import VllmBackend -from vllm.compilation.counter import compilation_counter from vllm.compilation.monitor import end_monitoring_torch_compile from vllm.config import VllmConfig -from vllm.forward_context import get_forward_context from vllm.logger import init_logger -from vllm.utils import weak_ref_tensors logger = init_logger(__name__) @@ -24,44 +18,29 @@ @dataclasses.dataclass class ConcreteSizeEntry: runtime_shape: int - need_to_compile: bool # the size is in compile_sizes - use_cudagraph: bool # the size is in cudagraph_capture_sizes - compiled: bool = False runnable: Callable = None # type: ignore - num_finished_warmup: int = 0 - cudagraph: Optional[torch.cuda.CUDAGraph] = None - output: Optional[Any] = None - - # for cudagraph debugging, track the input addresses - # during capture, and check if they are the same during replay - input_addresses: Optional[list[int]] = None -class CUDAPiecewiseBackend: +class PiecewiseBackend: def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, - graph_pool: Any, piecewise_compile_index: int, - total_piecewise_compiles: int, sym_shape_indices: list[int], + piecewise_compile_index: int, total_piecewise_compiles: int, + sym_shape_indices: list[int], compiled_graph_for_general_shape: Callable, vllm_backend: VllmBackend): """ The backend for piecewise compilation. - It mainly handles the compilation and cudagraph capturing. + It mainly handles the compilation of static shapes and + dispatching based on runtime shape. We will compile `self.graph` once for the general shape, and then compile for different shapes specified in `compilation_config.compile_sizes`. - - Independently, we will capture cudagraph for different shapes. - - If a shape needs both compilation and cudagraph, we will - compile it first, and then capture cudagraph. 
""" self.graph = graph self.vllm_config = vllm_config self.compilation_config = vllm_config.compilation_config - self.graph_pool = graph_pool self.piecewise_compile_index = piecewise_compile_index self.total_piecewise_compiles = total_piecewise_compiles self.vllm_backend = vllm_backend @@ -70,11 +49,10 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, self.is_last_graph = ( piecewise_compile_index == total_piecewise_compiles - 1) + self.is_full_graph = total_piecewise_compiles == 1 + self.compile_sizes: set[int] = set( self.compilation_config.compile_sizes) - self.cudagraph_capture_sizes: set[int] = set( - self.compilation_config.cudagraph_capture_sizes - ) if self.compilation_config.use_cudagraph else set() self.first_run_finished = False @@ -84,18 +62,18 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG" - # the entries for different shapes that we need to either - # compile or capture cudagraph + # the entries for different shapes that we need to compile self.concrete_size_entries: dict[int, ConcreteSizeEntry] = {} # to_be_compiled_sizes tracks the remaining sizes to compile, # and updates during the compilation process, so we need to copy it self.to_be_compiled_sizes: set[int] = self.compile_sizes.copy() - for shape in self.compile_sizes.union(self.cudagraph_capture_sizes): + + # We only keep compilation management inside this class directly. + for shape in self.compile_sizes: self.concrete_size_entries[shape] = ConcreteSizeEntry( runtime_shape=shape, - need_to_compile=shape in self.compile_sizes, - use_cudagraph=shape in self.cudagraph_capture_sizes, + runnable=self.compiled_graph_for_general_shape, ) def check_for_ending_compilation(self): @@ -112,16 +90,14 @@ def __call__(self, *args) -> Any: return self.compiled_graph_for_general_shape(*args) runtime_shape = args[self.sym_shape_indices[0]] + if runtime_shape not in self.concrete_size_entries: # we don't need to do anything for this shape return self.compiled_graph_for_general_shape(*args) entry = self.concrete_size_entries[runtime_shape] - if entry.runnable is None: - entry.runnable = self.compiled_graph_for_general_shape - - if entry.need_to_compile and not entry.compiled: + if not entry.compiled: entry.compiled = True self.to_be_compiled_sizes.remove(runtime_shape) # args are real arguments @@ -138,81 +114,4 @@ def __call__(self, *args) -> Any: if self.is_last_graph and not self.to_be_compiled_sizes: self.check_for_ending_compilation() - # Skip CUDA graphs if this entry doesn't use them OR - # if we're supposed to skip them globally - skip_cuda_graphs = get_forward_context().skip_cuda_graphs - if not entry.use_cudagraph or skip_cuda_graphs: - return entry.runnable(*args) - - if entry.cudagraph is None: - if entry.num_finished_warmup < self.compilation_config.cudagraph_num_of_warmups: # noqa - entry.num_finished_warmup += 1 - if self.is_first_graph: - logger.debug( - "Warming up %s/%s for shape %s", - entry.num_finished_warmup, - self.compilation_config.cudagraph_num_of_warmups, - runtime_shape) - return entry.runnable(*args) - - if self.is_first_graph: - # Since we capture cudagraph for many different shapes and - # capturing is fast, we don't need to log it for every shape. - # We only log it in the debug mode. 
- logger.debug("Capturing a cudagraph for shape %s", - runtime_shape) - - input_addresses = [ - x.data_ptr() for x in args if isinstance(x, torch.Tensor) - ] - entry.input_addresses = input_addresses - cudagraph = torch.cuda.CUDAGraph() - - with ExitStack() as stack: - if not self.is_first_graph: - # during every model forward, we will capture - # many pieces of cudagraphs (roughly one per layer). - # running gc again and again across layers will - # make the cudagraph capture very slow. - # therefore, we only run gc for the first graph, - # and disable gc for the rest of the graphs. - stack.enter_context(patch("gc.collect", lambda: None)) - stack.enter_context( - patch("torch.cuda.empty_cache", lambda: None)) - - # mind-exploding: carefully manage the reference and memory. - with torch.cuda.graph(cudagraph, pool=self.graph_pool): - # `output` is managed by pytorch's cudagraph pool - output = entry.runnable(*args) - if self.is_last_graph: - # by converting it to weak ref, - # the original `output` will immediately be released - # to save memory. It is only safe to do this for - # the last graph, because the output of the last graph - # will not be used by any other cuda graph. - output = weak_ref_tensors(output) - - # here we always use weak ref for the output - # to save memory - entry.output = weak_ref_tensors(output) - entry.cudagraph = cudagraph - - compilation_counter.num_cudagraph_captured += 1 - - # important: we need to return the output, rather than - # the weak ref of the output, so that pytorch can correctly - # manage the memory during cuda graph capture - return output - - if self.is_debugging_mode: - # check if the input addresses are the same - new_input_addresses = [ - x.data_ptr() for x in args if isinstance(x, torch.Tensor) - ] - assert new_input_addresses == entry.input_addresses, ( - "Input addresses for cudagraphs are different during replay." - f" Expected {entry.input_addresses}, got {new_input_addresses}" - ) - - entry.cudagraph.replay() - return entry.output + return entry.runnable(*args) diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py index 1e059b59fb64..9047bf3cbf8e 100644 --- a/vllm/compilation/monitor.py +++ b/vllm/compilation/monitor.py @@ -37,3 +37,21 @@ def end_monitoring_torch_compile(vllm_config: VllmConfig): if context_manager is not None: context_manager.__exit__(None, None, None) context_manager = None + + +cudagraph_capturing_enabled: bool = True + + +def validate_cudagraph_capturing_enabled(): + # used to monitor whether an cudagraph capturing is legal at runtime. + # should be called before any cudagraph capturing. + # if an illegal cudagraph capturing happens, raise an error. + global cudagraph_capturing_enabled + if not cudagraph_capturing_enabled: + raise RuntimeError("CUDA graph capturing detected at an inappropriate " + "time. 
This operation is currently disabled.") + + +def set_cudagraph_capturing_enabled(enabled: bool): + global cudagraph_capturing_enabled + cudagraph_capturing_enabled = enabled diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index 8d5df1061eda..96d4eae2ee9a 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -11,7 +11,8 @@ import torch import vllm.envs as envs -from vllm.config import CompilationLevel, get_current_vllm_config +from vllm.config import (CompilationLevel, CUDAGraphMode, + get_current_vllm_config) from vllm.logger import init_logger logger = init_logger(__name__) @@ -115,8 +116,8 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType): except Exception: pass - if self.vllm_config.compilation_config.use_cudagraph and \ - "update" in new_code.co_names: + if self.vllm_config.compilation_config.cudagraph_mode != \ + CUDAGraphMode.NONE and "update" in new_code.co_names: import depyf src = depyf.decompile(new_code) msg = "Assigning / modifying buffers of nn.Module during forward pass is not allowed when using cudagraph inside the compiler because it will cause silent errors. Please use eager mode or fix the code. The following code contains clues about which buffer is being modified (please search for the usage of the function `update`):\n" + src # noqa diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 82ef8db673fe..280ae60c91ff 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -32,7 +32,7 @@ from vllm.config.cache import (BlockSize, CacheConfig, CacheDType, MambaDType, PrefixCachingHashAlgo) from vllm.config.compilation import (CompilationConfig, CompilationLevel, - PassConfig) + CUDAGraphMode, PassConfig) from vllm.config.parallel import DistributedExecutorBackend, ParallelConfig from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy from vllm.config.utils import ConfigType, config @@ -3529,11 +3529,21 @@ def __post_init__(self): else: self.compilation_config.level = \ CompilationLevel.NO_COMPILATION + else: # NB: Passing both --enforce-eager and a compilation level # in V0 means the compilation level wins out. self.compilation_config.level = CompilationLevel.NO_COMPILATION + # if cudagraph_mode is not explicitly set by users, set default value + if self.compilation_config.cudagraph_mode is None: + if envs.VLLM_USE_V1 and self.compilation_config.level \ + == CompilationLevel.PIECEWISE: + self.compilation_config.cudagraph_mode = \ + CUDAGraphMode.PIECEWISE + else: + self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE + # async tp is built on top of sequence parallelism # and requires it to be enabled. if self.compilation_config.pass_config.enable_async_tp: @@ -3541,12 +3551,13 @@ def __post_init__(self): True if self.compilation_config.pass_config.enable_sequence_parallelism: self.compilation_config.custom_ops.append("+rms_norm") - if envs.VLLM_USE_V1 and self.model_config is not None and \ - not self.model_config.enforce_eager: - # By default, V1 uses piecewise CUDA graphs. If full_cuda_graph - # is set to True, full CUDA graphs will be used. 
+ + # disable cudagraph when enforce eager execution + if self.model_config is not None and self.model_config.enforce_eager: + logger.info("Cudagraph is disabled under eager mode") + self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE + elif envs.VLLM_USE_V1: self.compilation_config.cudagraph_num_of_warmups = 1 - self.compilation_config.set_splitting_ops_for_v1() self._set_cudagraph_sizes() @@ -3566,12 +3577,6 @@ def __post_init__(self): "Disabling `torch.compile`.") self.compilation_config.level = CompilationLevel.NO_COMPILATION - if self.compilation_config.full_cuda_graph and \ - not self.model_config.disable_cascade_attn: - logger.info("full_cuda_graph is not supported with " - "cascade attention. Disabling cascade attention.") - self.model_config.disable_cascade_attn = True - disable_chunked_prefill_reasons: list[str] = [] if self.model_config and self.model_config.pooler_config: @@ -3612,9 +3617,32 @@ def __post_init__(self): "to True to enable.") current_platform.check_and_update_config(self) + # final check of cudagraph mode after platform-specific update + if envs.VLLM_USE_V1: + if self.compilation_config.cudagraph_mode == CUDAGraphMode.FULL \ + and self.model_config is not None and \ + not self.model_config.disable_cascade_attn: + logger.info("CUDAGraphMode.FULL is not supported with " + "cascade attention currently. Disabling cascade" + "attention.") + self.model_config.disable_cascade_attn = True + + if self.compilation_config.cudagraph_mode\ + .requires_piecewise_compilation(): + assert self.compilation_config.level == \ + CompilationLevel.PIECEWISE, \ + "Compilation level should be CompilationLevel.PIECEWISE "\ + "when cudagraph_mode piecewise cudagraphs is used, "\ + f"cudagraph_mode={self.compilation_config.cudagraph_mode}" + if not self.instance_id: self.instance_id = random_uuid()[:5] + # Do this after all the updates to compilation_config.level + if envs.VLLM_USE_V1 and \ + self.compilation_config.level == CompilationLevel.PIECEWISE: + self.compilation_config.set_splitting_ops_for_v1() + if (envs.VLLM_USE_V1 and not self.scheduler_config.disable_hybrid_kv_cache_manager): # logger should only print warning message for hybrid models. As we diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 8a78d811b9a2..56a2183f8e2c 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -1,12 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import enum import hashlib from collections import Counter from dataclasses import asdict, field -from typing import TYPE_CHECKING, Any, Callable, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union -from pydantic import TypeAdapter +from pydantic import TypeAdapter, field_validator from pydantic.dataclasses import dataclass import vllm.envs as envs @@ -31,6 +32,40 @@ class CompilationLevel: PIECEWISE = 3 +class CUDAGraphMode(enum.Enum): + """ Constants for the cudagraph mode in CompilationConfig. + Meanwhile, the subset enum `NONE`, `PIECEWISE` and `FULL` are also + treated as concrete runtime mode for cudagraph runtime dispatching. 
+    """
+    NONE = 0
+    PIECEWISE = 1
+    FULL = 2
+    FULL_DECODE_ONLY = (FULL, NONE)
+    FULL_AND_PIECEWISE = (FULL, PIECEWISE)
+
+    def decode_mode(self) -> 'CUDAGraphMode':
+        return CUDAGraphMode(self.value[0]) if \
+            self.separate_routine() else self
+
+    def mixed_mode(self) -> 'CUDAGraphMode':
+        return CUDAGraphMode(self.value[1]) if \
+            self.separate_routine() else self
+
+    def requires_piecewise_compilation(self) -> bool:
+        return (self.decode_mode() == CUDAGraphMode.PIECEWISE
+                or self.mixed_mode() == CUDAGraphMode.PIECEWISE)
+
+    def max_cudagraph_mode(self) -> 'CUDAGraphMode':
+        return CUDAGraphMode(max(
+            self.value)) if self.separate_routine() else self
+
+    def has_full_cudagraphs(self) -> bool:
+        return self.max_cudagraph_mode() == CUDAGraphMode.FULL
+
+    def separate_routine(self) -> bool:
+        return isinstance(self.value, tuple)
+
+
 @config
 @dataclass
 class PassConfig:
@@ -91,6 +126,7 @@ class CompilationConfig:
     - [`splitting_ops`][vllm.config.CompilationConfig.splitting_ops]
     - CudaGraph capture:
       - [`use_cudagraph`][vllm.config.CompilationConfig.use_cudagraph]
+      - [`cudagraph_mode`][vllm.config.CompilationConfig.cudagraph_mode]
       - [`cudagraph_capture_sizes`]
       [vllm.config.CompilationConfig.cudagraph_capture_sizes]
       - [`cudagraph_num_of_warmups`]
@@ -157,7 +193,7 @@ class CompilationConfig:
     By default, all custom ops are enabled when running without Inductor and
     disabled when running with Inductor: level>=PIECEWISE and use_inductor=True.
     Inductor generates (fused) Triton kernels for disabled custom ops."""
-    splitting_ops: list[str] = field(default_factory=list)
+    splitting_ops: Optional[list[str]] = None
     """A list of ops to split the full graph into subgraphs, used in piecewise
     compilation."""
 
@@ -187,7 +223,43 @@ class CompilationConfig:
     constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
 
     # CudaGraph compilation
-    use_cudagraph: bool = field(default_factory=lambda: envs.VLLM_USE_V1)
+    cudagraph_mode: Optional[CUDAGraphMode] = None
+    """
+    The mode of the cudagraph.
+    - NONE, no cudagraph capture.
+    - PIECEWISE. (v1 default)
+    - FULL.
+    - FULL_DECODE_ONLY.
+    - FULL_AND_PIECEWISE.
+
+    PIECEWISE mode builds piecewise cudagraphs only, keeping the
+    cudagraph-incompatible ops (i.e. some attention ops) outside the
+    cudagraph for general flexibility.
+    This is the default mode.
+
+    FULL mode: Capture full cudagraph for all batches. Can be good for small
+    models or workloads with small prompts; not supported by many backends.
+    Generally, FULL_AND_PIECEWISE is better for performance.
+
+    FULL_DECODE_ONLY mode: Capture full cudagraph for decode batches only.
+    Mixed prefill-decode batches are run without cudagraphs. Can be good for
+    decode instances in a P/D setup where prefill is not as important so we
+    can save some memory.
+
+    FULL_AND_PIECEWISE mode: Capture full cudagraph for decode batches and
+    piecewise cudagraph for prefill and mixed prefill-decode batches.
+    This is likely the most performant mode for most models.
+
+    Currently, the cudagraph mode is only used for the v1 engine.
+    Note that the cudagraph logic is generally orthogonal to the
+    compilation logic. While piecewise cudagraphs require piecewise
+    compilation (level=PIECEWISE and non-empty splitting_ops), full
+    cudagraphs are supported with and without compilation.
+
+    Warning: This flag is new and subject to change; in addition,
+    more modes may be added.
+    """
+    use_cudagraph: bool = True
     """Whether to use cudagraph inside compilation.
     - False: cudagraph inside compilation is not used.
- True: cudagraph inside compilation is used. It requires @@ -197,8 +269,9 @@ class CompilationConfig: CompilationLevel.PIECEWISE (aka -O3). Note that this is orthogonal to the cudagraph capture logic outside of compilation. - TODO: move outside cudagraph logic into compilation. - torch.compile will handle cudagraph capture logic in the future.""" + Warning: This flag is deprecated and will be removed in the next major or + minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode instead. + """ cudagraph_num_of_warmups: int = 0 """Number of warmup runs for cudagraph. It means the first several runs will be treated as warmup runs. @@ -213,12 +286,17 @@ class CompilationConfig: cudagraph. If the caller can guarantee that the same input buffers are always used, it can set this to False. Otherwise, it should set this to True, and the compiler will copy the input to an - internally managed buffer. Default is False.""" - full_cuda_graph: bool = False + internally managed buffer. Default is False. + Note that this flag is only effective when cudagraph_mode is PIECEWISE. + """ + full_cuda_graph: Optional[bool] = False """whether to use a full cuda graph for the entire forward pass rather than splitting certain operations such as attention into subgraphs. Thus this flag cannot be used together with splitting_ops. This may provide - performance benefits for smaller models.""" + performance benefits for smaller models. + Warning: This flag is deprecated and will be removed in the next major or + minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode instead. + """ pass_config: PassConfig = field(default_factory=PassConfig) """Custom inductor passes, see PassConfig for more details""" @@ -253,6 +331,13 @@ class CompilationConfig: Map from layer name to layer objects that need to be accessed outside model code, e.g., Attention, FusedMOE when dp_size>1.""" + # Attention ops; used for piecewise cudagraphs + _attention_ops: ClassVar[list[str]] = [ + "vllm.unified_attention", + "vllm.unified_attention_with_output", + "vllm.mamba_mixer2", + ] + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, @@ -297,13 +382,26 @@ def __repr__(self) -> str: if pass_config_exclude: exclude["pass_config"] = pass_config_exclude - return TypeAdapter(CompilationConfig).dump_json( - self, - exclude=exclude, # type: ignore[arg-type] - exclude_unset=True).decode() + # The cast to string is necessary because Pydantic is mocked in docs + # builds and sphinx-argparse doesn't know the return type of decode() + return str( + TypeAdapter(CompilationConfig).dump_json( + self, + exclude=exclude, # type: ignore[arg-type] + exclude_unset=True).decode()) __str__ = __repr__ + @field_validator("cudagraph_mode", mode="before") + @classmethod + def validate_cudagraph_mode_before(cls, value: Any) -> Any: + """ + enable parse the `cudagraph_mode` enum type from string + """ + if isinstance(value, str): + return CUDAGraphMode[value.upper()] + return value + def __post_init__(self) -> None: count_none = self.custom_ops.count("none") count_all = self.custom_ops.count("all") @@ -341,7 +439,26 @@ def __post_init__(self) -> None: if isinstance(self.pass_config, dict): self.pass_config = PassConfig(**self.pass_config) - def init_backend(self, vllm_config: VllmConfig) -> Union[str, Callable]: + # migrate the deprecated flags + if not self.use_cudagraph: + logger.warning("use_cudagraph is deprecated, use " + "cudagraph_mode=NONE instead.") + if self.cudagraph_mode is not None: + raise ValueError( + 
"use_cudagraph and cudagraph_mode are mutually" + " exclusive, prefer cudagraph_mode since " + "use_cudagraph is deprecated.") + self.cudagraph_mode = CUDAGraphMode.NONE + if self.full_cuda_graph: + logger.warning("full_cuda_graph is deprecated, use " + "cudagraph_mode=FULL instead.") + if self.cudagraph_mode is not None: + raise ValueError("full_cuda_graph and cudagraph_mode are " + "mutually exclusive, prefer cudagraph_mode " + "since full_cuda_graph is deprecated.") + self.cudagraph_mode = CUDAGraphMode.FULL + + def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]: if self.level == CompilationLevel.NO_COMPILATION: raise ValueError("No compilation level is set.") @@ -414,15 +531,34 @@ def init_with_cudagraph_sizes(self, self.max_capture_size] = self.max_capture_size def set_splitting_ops_for_v1(self): - # NOTE: this function needs to be called - if self.splitting_ops and self.full_cuda_graph: - raise ValueError("full_cuda_graph cannot be used together with " - "splitting_ops, as Full CUDA graph will override " - f"the splitting_ops: {self.splitting_ops}") - - if not self.splitting_ops: - self.splitting_ops = [] if self.full_cuda_graph else [ - "vllm.unified_attention", - "vllm.unified_attention_with_output", - "vllm.mamba_mixer2", - ] + # NOTE: this function needs to be called only when level is + # CompilationLevel.PIECEWISE + assert self.level == CompilationLevel.PIECEWISE, ( + "set_splitting_ops_for_v1 should only be called when " + "level is CompilationLevel.PIECEWISE") + + if self.splitting_ops is None: + # NOTE: When using full cudagraph, instead of setting an empty + # list and capture the full cudagraph inside the flattened fx + # graph, we keep the piecewise fx graph structure but capture the + # full cudagraph outside the fx graph. This reduces some cpu + # overhead when the runtime batch_size is not cudagraph captured. + # see https://github.com/vllm-project/vllm/pull/20059 for details. + self.splitting_ops = self._attention_ops + elif len(self.splitting_ops) == 0: + logger.warning_once("Using piecewise compilation with empty " + "splitting_ops.") + if self.cudagraph_mode == CUDAGraphMode.PIECEWISE: + logger.warning_once( + "When compilation level is piecewise with empty " + "splitting_ops, PIECEWISE cudagraph_mode will be " + "treated as FULL cudagraph_mode. Please ensure you are " + "using attention backends that support cudagraph or set " + "cudagraph_mode to NONE explicitly if encountering " + "any problems.") + self.cudagraph_mode = CUDAGraphMode.FULL + self.splitting_ops = [] + + def splitting_ops_contain_attention(self) -> bool: + return self.splitting_ops is not None and all( + op in self.splitting_ops for op in self._attention_ops) diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 4686ba24e65f..c57c51d289ac 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -5,13 +5,13 @@ from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Optional, Union +from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union import torch import torch.distributed as dist import vllm.envs as envs -from vllm.config import ParallelConfig, VllmConfig +from vllm.config import CUDAGraphMode, ParallelConfig, VllmConfig from vllm.logger import init_logger if TYPE_CHECKING: @@ -26,6 +26,27 @@ batchsize_forward_time: defaultdict = defaultdict(list) +class BatchDescriptor(NamedTuple): + """ + Batch descriptor for cudagraph dispatching. 
We should keep the num of + items as minimal as possible to properly and uniquely describe the padded + batch for cudagraph. + """ + num_tokens: int + uniform_decode: bool = False + """ + False can also be used for an uniform decode batch to dispatch to the + cudagraph supporting non-uniform batches. + """ + + @property + def non_uniform(self) -> "BatchDescriptor": + """ + Return a non-uniform version of current batch descriptor. + """ + return BatchDescriptor(self.num_tokens, uniform_decode=False) + + def _compute_chunked_local_num_tokens(num_tokens_across_dp_cpu: list[int], max_num_tokens: int, chunk_idx: int) -> list[int]: @@ -152,7 +173,15 @@ class ForwardContext: virtual_engine: int # set dynamically for each forward pass # set dynamically for each forward pass dp_metadata: Optional[DPMetadata] = None - skip_cuda_graphs: bool = False + # determine the cudagraph style at runtime to be FULL, PIECEWISE, or NONE. + # by default NONE, no cudagraph is used. + cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE + batch_descriptor: Optional[BatchDescriptor] = None + + def __post_init__(self): + assert self.cudagraph_runtime_mode in [ + CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL], \ + f"Invalid cudagraph runtime mode: {self.cudagraph_runtime_mode}" _forward_context: Optional[ForwardContext] = None @@ -168,13 +197,13 @@ def get_forward_context() -> ForwardContext: @contextmanager def set_forward_context( - attn_metadata: Any, - vllm_config: VllmConfig, - virtual_engine: int = 0, - num_tokens: Optional[int] = None, - num_tokens_across_dp: Optional[torch.Tensor] = None, - skip_cuda_graphs: bool = False, -): + attn_metadata: Any, + vllm_config: VllmConfig, + virtual_engine: int = 0, + num_tokens: Optional[int] = None, + num_tokens_across_dp: Optional[torch.Tensor] = None, + cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, + batch_descriptor: Optional[BatchDescriptor] = None): """A context manager that stores the current forward context, can be attention metadata, etc. Here we can inject common logic for every model forward pass. @@ -198,7 +227,8 @@ def set_forward_context( virtual_engine=virtual_engine, attn_metadata=attn_metadata, dp_metadata=dp_metadata, - skip_cuda_graphs=skip_cuda_graphs, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=batch_descriptor, ) try: diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 483d5e1531a9..321db8287c0f 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -177,17 +177,20 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: logger.info("Forcing kv cache block size to 128 for " "CUTLASS_MLA backend.") + # lazy import to avoid circular import + from vllm.config import CUDAGraphMode + compilation_config = vllm_config.compilation_config if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput" and parallel_config.data_parallel_size > 1 - and compilation_config.use_cudagraph): + and compilation_config.cudagraph_mode != CUDAGraphMode.NONE): logger.info( - "Data Parallel: Forcing enforce eager to be True since DP " + "Data Parallel: disabling cudagraphs since DP " "with DeepEP high-throughput kernels are not CUDA Graph " "compatible. The DeepEP low-latency kernels are CUDA Graph " "compatible. 
Set the all_to_all backend to deepep_low_latency " "to use those kernels instead.") - compilation_config.use_cudagraph = False + compilation_config.cudagraph_mode = CUDAGraphMode.NONE if model_config is not None: model_config.enforce_eager = True @@ -454,8 +457,8 @@ def use_custom_allreduce(cls) -> bool: return True @classmethod - def get_piecewise_backend_cls(cls) -> str: - return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend" # noqa + def get_static_graph_wrapper_cls(cls) -> str: + return "vllm.compilation.cuda_graph.CUDAGraphWrapper" @classmethod def stateless_init_device_torch_dist_pg( diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 91d5314900c8..4017f1ca7eec 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -7,7 +7,7 @@ import sys from datetime import timedelta from platform import uname -from typing import TYPE_CHECKING, NamedTuple, Optional, Union +from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union import numpy as np import torch @@ -137,6 +137,8 @@ class Platform: additional_env_vars: list[str] = [] + _global_graph_pool: Optional[Any] = None + @property def supported_dtypes(self) -> list[torch.dtype]: """Returns the supported dtypes for the current platform.""" @@ -522,6 +524,15 @@ def __getattr__(self, key: str): " attribute.", self.device_type, key) return None + def get_global_graph_pool(self) -> Any: + """ + Return the global graph pool for the this platform. + """ + cls = self.__class__ + if cls._global_graph_pool is None: + cls._global_graph_pool = self.graph_pool_handle() + return cls._global_graph_pool + @classmethod def get_cu_count(cls, device_id: int = 0) -> int: """ @@ -530,11 +541,11 @@ def get_cu_count(cls, device_id: int = 0) -> int: raise NotImplementedError @classmethod - def get_piecewise_backend_cls(cls) -> str: + def get_static_graph_wrapper_cls(cls) -> str: """ - Get piecewise backend class for piecewise graph. + Get static graph wrapper class for static graph. """ - return "vllm.compilation.base_piecewise_backend.AbstractPiecewiseBackend" # noqa + return "vllm.compilation.base_static_graph.AbstractStaticGraphWrapper" @classmethod def stateless_init_device_torch_dist_pg( diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 2d5bee5fc505..3ede86e15855 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -421,8 +421,8 @@ def is_navi(cls) -> bool: return 'gfx1' in torch.cuda.get_device_properties(0).gcnArchName @classmethod - def get_piecewise_backend_cls(cls) -> str: - return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend" # noqa + def get_static_graph_wrapper_cls(cls) -> str: + return "vllm.compilation.cuda_graph.CUDAGraphWrapper" @classmethod def stateless_init_device_torch_dist_pg( diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index c7522a89c257..ba06abd07f08 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -99,7 +99,7 @@ def inference_mode(cls): @classmethod def check_and_update_config(cls, vllm_config: VllmConfig) -> None: - from vllm.config import CompilationLevel + from vllm.config import CompilationLevel, CUDAGraphMode cache_config = vllm_config.cache_config # For v0, the default block size is 16. 
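The `get_static_graph_wrapper_cls()` hook above returns a dotted import path rather than a class object, so the caller is expected to resolve it before wrapping a runnable. The snippet below is an illustrative sketch only, not vLLM's actual resolver; the `resolve_class` helper is hypothetical, and a stdlib target is used so the sketch runs anywhere.

    import importlib


    def resolve_class(qualname: str):
        # Split "pkg.module.ClassName" into a module path and an attribute
        # name, import the module, and return the attribute.
        module_name, _, attr = qualname.rpartition(".")
        return getattr(importlib.import_module(module_name), attr)


    # With vLLM installed, the CUDA/ROCm platforms above would hand back
    # "vllm.compilation.cuda_graph.CUDAGraphWrapper" to be resolved the same
    # way. A stdlib target keeps this sketch self-contained:
    assert resolve_class("collections.OrderedDict").__name__ == "OrderedDict"
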
@@ -109,9 +109,17 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: # TPU only supports DYNAMO_ONCE compilation level if compilation_config.level != CompilationLevel.DYNAMO_ONCE: - logger.info("[TPU] Forcing DYNAMO_ONCE compilation level") + logger.info("[TPU] Forcing DYNAMO_ONCE compilation level, and " + "disabling cudagraph.") compilation_config.level = CompilationLevel.DYNAMO_ONCE + if compilation_config.cudagraph_mode is None or \ + compilation_config.cudagraph_mode.max_cudagraph_mode() \ + != CUDAGraphMode.NONE: + logger.info("[TPU] CUDA graph is not supported on TPU, " + "disabling cudagraphs.") + compilation_config.cudagraph_mode = CUDAGraphMode.NONE + if compilation_config.backend == "": compilation_config.backend = "openxla" diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index abd58dbbcbf4..66ebc8ad9d22 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -7,6 +7,7 @@ import torch import vllm.envs as envs +from vllm.config import CUDAGraphMode from vllm.logger import init_logger from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS @@ -100,16 +101,17 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: # Instances created using VllmConfig() typically have model_config as # None by default. The modification involves adding a check to prevent # potential null exceptions check and update model config. - if model_config is not None: - if model_config.dtype == torch.bfloat16: - bf16_supported = cls.device_support_bf16() - if not bf16_supported: - model_config.dtype = torch.float16 - if not model_config.enforce_eager: - logger.warning( - "CUDA graph is not supported on XPU, fallback to the eager " - "mode.") - model_config.enforce_eager = True + if model_config is not None and model_config.dtype == torch.bfloat16 \ + and not cls.device_support_bf16(): + model_config.dtype = torch.float16 + + compilation_config = vllm_config.compilation_config + if compilation_config.cudagraph_mode is None or \ + compilation_config.cudagraph_mode.max_cudagraph_mode() \ + != CUDAGraphMode.NONE: + logger.info("[XPU] CUDA graph is not supported on XPU, " + "disabling cudagraphs.") + compilation_config.cudagraph_mode = CUDAGraphMode.NONE # check and update parallel config parallel_config = vllm_config.parallel_config diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index a411477bc3e3..ab7a71a399b3 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with FlashAttention.""" from dataclasses import dataclass -from typing import ClassVar, Optional +from typing import Optional import numpy as np import torch @@ -154,9 +154,26 @@ def _get_sliding_window_configs( class FlashAttentionMetadataBuilder( AttentionMetadataBuilder[FlashAttentionMetadata]): - attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ - AttentionCGSupport.NEVER if get_flash_attn_version() == 2 \ - else AttentionCGSupport.ALWAYS + # FA3: + # Supports full cudagraphs for all cases. + # + # FA2: + # For FA2, a graph is captured with max_query_len=1, (which is what we + # capture by default for num_tokens <= max_num_seqs when there is no + # spec-decode) then these graphs will not work for mixed prefill-decode + # (unlike FA3). This is due to special max_query_len=1 packed-GQA handling + # in FA2. 
+    # In summary, if we are running with spec decode, the graphs would
+    # work for mixed prefill-decode and uniform-decode batches. For non-spec
+    # decode, the graphs would not work for mixed prefill-decode; sort of the
+    # inverse of UNIFORM_SINGLE_TOKEN_DECODE.
+    # There's probably a better way to describe this using `AttentionCGSupport`
+    # but for now just set it to `UNIFORM_BATCH` to get us to drop down
+    # to FULL_AND_PIECEWISE.
+    # TODO(luka, lucas): audit FA2 as part of:
+    # https://github.com/vllm-project/vllm/issues/22945
+    cudagraph_support = AttentionCGSupport.ALWAYS \
+        if get_flash_attn_version() == 3 else AttentionCGSupport.UNIFORM_BATCH
 
     def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
                  vllm_config: VllmConfig, device: torch.device):
@@ -177,17 +194,13 @@ def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
         self.max_num_splits = 0  # No upper bound on the number of splits.
         self.aot_schedule = (get_flash_attn_version() == 3)
-        self.use_full_cuda_graph = self.compilation_config.full_cuda_graph
-        if self.use_full_cuda_graph:
-            if not self.aot_schedule:
-                raise ValueError(
-                    "AoT scheduling is required for full cuda graph.")
-            capture_sizes = self.compilation_config.cudagraph_capture_sizes
-            if not capture_sizes:
-                raise ValueError(
-                    "cudagraph_capture_sizes should not be None when "
-                    "full_cuda_graph is True.")
-            self.max_cudagraph_size = max(capture_sizes)
+
+        self.use_full_cuda_graph = \
+            self.compilation_config.cudagraph_mode.has_full_cudagraphs()
+
+        if self.use_full_cuda_graph and self.aot_schedule:
+            self.max_cudagraph_size = self.compilation_config.max_capture_size
+
             if self.max_cudagraph_size > 992:
                 # This condition derives from FA3's internal heuristic.
                 # TODO(woosuk): Support larger cudagraph sizes.
@@ -310,9 +323,9 @@ def schedule(batch_size, cu_query_lens, max_query_len, seqlens,
                 seqlens=seq_lens,
                 max_seq_len=max_seq_len,
                 causal=causal)
-
-        if self.use_full_cuda_graph:
-            assert scheduler_metadata is not None
+        # For FA3 + full cudagraph
+        max_num_splits = 0
+        if self.use_full_cuda_graph and scheduler_metadata is not None:
             n = scheduler_metadata.shape[0]
             self.scheduler_metadata[:n] = scheduler_metadata
             # NOTE(woosuk): We should zero out the rest of the scheduler
@@ -322,14 +335,12 @@ def schedule(batch_size, cu_query_lens, max_query_len, seqlens,
             self.scheduler_metadata[n:] = 0
             scheduler_metadata = self.scheduler_metadata[:n]
 
-        max_num_splits = 0
-        if (self.use_full_cuda_graph
-                and num_actual_tokens <= self.max_cudagraph_size):
-            # NOTE(woosuk): Setting num_splits > 1 may increase the memory
-            # usage, because the intermediate buffers of size [num_splits,
-            # num_heads, num_tokens, head_size] are allocated. Therefore,
-            # we only set num_splits when using cuda graphs.
+            if num_actual_tokens <= self.max_cudagraph_size:
+                # NOTE(woosuk): Setting num_splits > 1 may increase the memory
+                # usage, because the intermediate buffers of size [num_splits,
+                # num_heads, num_tokens, head_size] are allocated. Therefore,
+                # we only set num_splits when using cuda graphs.
+ max_num_splits = self.max_num_splits attn_metadata = FlashAttentionMetadata( num_actual_tokens=num_actual_tokens, @@ -350,11 +361,6 @@ def schedule(batch_size, cu_query_lens, max_query_len, seqlens, causal=causal) return attn_metadata - def can_run_in_cudagraph( - self, common_attn_metadata: CommonAttentionMetadata) -> bool: - # Full CUDA Graph always supported (FA2 support checked separately) - return True - def use_cascade_attention(self, *args, **kwargs) -> bool: return use_cascade_attention(*args, **kwargs) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 12e5542d691c..02decb171fc0 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -17,7 +17,7 @@ import vllm.envs as envs from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionType) -from vllm.config import VllmConfig +from vllm.config import CUDAGraphMode, VllmConfig from vllm.logger import init_logger from vllm.utils import cdiv, is_pin_memory_available from vllm.utils.flashinfer import use_trtllm_attention @@ -183,8 +183,8 @@ def __post_init__(self): class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): - attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ - AttentionCGSupport.PURE_DECODE_ONLY + cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE reorder_batch_threshold: ClassVar[int] = 1 @@ -203,7 +203,8 @@ def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], self.kv_cache_spec.block_size) max_num_reqs = vllm_config.scheduler_config.max_num_seqs max_num_pages = max_num_reqs * max_num_pages_per_req - self.enable_cuda_graph = self.compilation_config.full_cuda_graph + self.enable_cuda_graph = self.compilation_config.cudagraph_mode.\ + decode_mode() == CUDAGraphMode.FULL if self.enable_cuda_graph: # For full cudagraph capture, one `decode_wrapper` for each batch # size is needed for FlashInfer. 
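The `decode_mode()` check above relies on the combined modes decomposing into a (decode, mixed) pair. The sketch below is a trimmed, self-contained copy of the `CUDAGraphMode` enum added by this patch in vllm/config/compilation.py (methods unrelated to the decomposition are omitted), shown only to illustrate the behavior that backends such as FlashInfer depend on.

    import enum


    class CUDAGraphMode(enum.Enum):
        # Trimmed copy of the enum from vllm/config/compilation.py (this patch).
        NONE = 0
        PIECEWISE = 1
        FULL = 2
        FULL_DECODE_ONLY = (FULL, NONE)
        FULL_AND_PIECEWISE = (FULL, PIECEWISE)

        def separate_routine(self) -> bool:
            return isinstance(self.value, tuple)

        def decode_mode(self) -> "CUDAGraphMode":
            return (CUDAGraphMode(self.value[0])
                    if self.separate_routine() else self)

        def mixed_mode(self) -> "CUDAGraphMode":
            return (CUDAGraphMode(self.value[1])
                    if self.separate_routine() else self)


    # Both FULL_DECODE_ONLY and FULL_AND_PIECEWISE enable full cudagraphs for
    # uniform decode batches, so the enable_cuda_graph check above passes.
    assert CUDAGraphMode.FULL_AND_PIECEWISE.decode_mode() is CUDAGraphMode.FULL
    assert CUDAGraphMode.FULL_AND_PIECEWISE.mixed_mode() is CUDAGraphMode.PIECEWISE
    assert CUDAGraphMode.FULL_DECODE_ONLY.mixed_mode() is CUDAGraphMode.NONE
    assert CUDAGraphMode.PIECEWISE.decode_mode() is CUDAGraphMode.PIECEWISE
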
@@ -586,10 +587,6 @@ def build_for_cudagraph_capture( return self.build(0, m) - def can_run_in_cudagraph( - self, common_attn_metadata: CommonAttentionMetadata) -> bool: - return common_attn_metadata.max_query_len == 1 - def use_cascade_attention(self, *args, **kwargs) -> bool: if self.kv_cache_spec.dtype != self.vllm_config.model_config.dtype: # TODO: The cascade wrapper currently does not support setting diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py index 3f84f8967db7..ace078e2b27c 100644 --- a/vllm/v1/attention/backends/mamba2_attn.py +++ b/vllm/v1/attention/backends/mamba2_attn.py @@ -89,8 +89,8 @@ class Mamba2AttentionMetadata: class Mamba2AttentionMetadataBuilder( AttentionMetadataBuilder[Mamba2AttentionMetadata]): - attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ - AttentionCGSupport.PURE_DECODE_ONLY + cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE reorder_batch_threshold: ClassVar[int] = 1 @@ -203,7 +203,3 @@ def build_for_cudagraph_capture( m.max_query_len = 1 # decode-only return self.build(0, m) - - def can_run_in_cudagraph( - self, common_attn_metadata: CommonAttentionMetadata) -> bool: - return common_attn_metadata.max_query_len == 1 diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index badff67656c2..f2610671f769 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -575,7 +575,7 @@ def build_for_cudagraph_capture( "MLA only supports decode-only full CUDAGraph capture. " \ "Make sure all cudagraph capture sizes <= max_num_seq." - m.max_query_len = 1 # decode-only + assert m.max_query_len == 1 # decode-only return self.build(0, m) @@ -728,10 +728,6 @@ def build(self, return attn_metadata - def can_run_in_cudagraph( - self, common_attn_metadata: CommonAttentionMetadata) -> bool: - return common_attn_metadata.max_query_len == 1 - class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): """ diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py index b076613c8645..6e1e5d6533da 100644 --- a/vllm/v1/attention/backends/mla/cutlass_mla.py +++ b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -22,7 +22,7 @@ class CutlassMLAMetadataBuilder(MLACommonMetadataBuilder[MLACommonMetadata]): # enable full CUDA Graph support for decode-only capture attn_cudagraph_support: ClassVar[ - AttentionCGSupport] = AttentionCGSupport.PURE_DECODE_ONLY + AttentionCGSupport] = AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE class CutlassMLABackend(MLACommonBackend): diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index 2b0f52cf80bf..11674423400c 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -55,8 +55,8 @@ class FlashMLAMetadata(MLACommonMetadata[FlashMLADecodeMetadata]): class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): - attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ - AttentionCGSupport.PURE_DECODE_ONLY + cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.UNIFORM_BATCH def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): @@ -73,7 +73,7 @@ def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], device_properties = torch.cuda.get_device_properties(self.device) num_sms = 
device_properties.multi_processor_count - if self.compilation_config.full_cuda_graph: + if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): self.cg_buf_tile_scheduler_metadata = torch.zeros( # Upper bound on size (<= #SMs, TileSchedulerMetaDataSize) # TileSchedulerMetaDataSize = 8 @@ -95,7 +95,10 @@ def _build_decode(self, block_table_tensor: torch.Tensor, 1, # MQA for the decode path ) - if self.compilation_config.full_cuda_graph: + # TODO: we can disambiguate between decode and mixed-prefill decode here + # so we can only use the persistent buffer if a cudagraph is actually + # being used. + if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): assert self.cg_buf_tile_scheduler_metadata is not None assert self.cg_buf_num_splits is not None diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 8b55e1a30199..082c7e6f7c62 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -65,8 +65,10 @@ class AiterMLAMetadata(MLACommonMetadata[AiterMLADecodeMetadata]): class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): - attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ - AttentionCGSupport.PURE_DECODE_ONLY + # TODO(luka, lucas): audit this as part of: + # https://github.com/vllm-project/vllm/issues/22945 + cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): @@ -82,7 +84,10 @@ def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], max_num_pages = max_num_reqs * max_num_pages_per_req # Preparing persistent buffers - if vllm_config.compilation_config.full_cuda_graph: + # TODO: we can disambiguate between decode and mixed-prefill decode here + # so we can only use the persistent buffer if a cudagraph is actually + # being used. 
+ if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): self.paged_kv_indptr = torch.zeros(max_num_reqs + 1, dtype=torch.int32, device=device) @@ -120,7 +125,7 @@ def _build_decode(self, block_table_tensor: torch.Tensor, block_table_bounds.cumsum(dim=0, dtype=torch.int32) ]) - if self.compilation_config.full_cuda_graph: + if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): num_actual_pages = paged_kv_indices.size(0) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index e8bffbef4415..7d09ac0a4a3a 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -311,11 +311,6 @@ def schedule(batch_size, cu_query_lens, max_query_len, seqlens, ) return attn_metadata - def can_run_in_cudagraph( - self, common_attn_metadata: CommonAttentionMetadata) -> bool: - # Full CUDA Graph always supported (FA2 support checked separately) - return True - def use_cascade_attention(self, *args, **kwargs) -> bool: return False diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index c33afbfebcde..48a9af3decac 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -58,8 +58,7 @@ class TritonAttentionMetadata: class TritonAttentionMetadataBuilder( AttentionMetadataBuilder[TritonAttentionMetadata]): - attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ - AttentionCGSupport.ALWAYS + cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.ALWAYS def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): @@ -132,11 +131,6 @@ def build(self, ) return attn_metadata - def can_run_in_cudagraph( - self, common_attn_metadata: CommonAttentionMetadata) -> bool: - # Full CUDA Graph always supported - return True - class TritonAttentionBackend(AttentionBackend): diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 91eb84245ac0..1c7d08798964 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -158,18 +158,21 @@ class AttentionCGSupport(enum.Enum): Here we do not consider the cascade attention, as currently it is never cudagraph supported.""" + ALWAYS = 3 + """Cudagraph always supported; supports mixed-prefill-decode""" + UNIFORM_BATCH = 2 + """Cudagraph supported for batches the only contain query lengths that are + the same, this can be used for spec-decode + i.e. "decodes" are 1 + num_speculative_tokens""" + UNIFORM_SINGLE_TOKEN_DECODE = 1 + """Cudagraph supported for batches the only contain query_len==1 decodes""" NEVER = 0 """NO cudagraph support""" - PURE_DECODE_ONLY = 1 - """Cudagraph supported for pure decode, need to run without - cudagraph for mixed prefill-decode batches""" - ALWAYS = 2 - """Cudagraph always supported""" class AttentionMetadataBuilder(abc.ABC, Generic[M]): - # Does this backend/builder support CUDA Graphs for attention. - attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ + # Does this backend/builder support CUDA Graphs for attention (default: no). + cudagraph_support: ClassVar[AttentionCGSupport] = \ AttentionCGSupport.NEVER # Does this backend/builder reorder the batch? # If not, set this to None. 
Otherwise set it to the query
@@ -199,13 +202,6 @@ def build(self,
         """
         raise NotImplementedError
 
-    def can_run_in_cudagraph(
-            self, common_attn_metadata: CommonAttentionMetadata) -> bool:
-        """
-        Can this batch (with given metadata) use CUDA Graphs for attention.
-        """
-        return False
-
     def build_for_cudagraph_capture(
             self, common_attn_metadata: CommonAttentionMetadata) -> M:
         """
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
new file mode 100644
index 000000000000..02e65820b7c0
--- /dev/null
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -0,0 +1,120 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Optional
+
+from vllm.config import CompilationLevel, CUDAGraphMode, VllmConfig
+from vllm.forward_context import BatchDescriptor
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+class CudagraphDispatcher:
+    """
+    Runtime cudagraph dispatcher to dispatch keys for multiple sets of cudagraphs.
+
+    The dispatcher stores two sets of dispatch keys, one for PIECEWISE and one
+    for FULL cudagraph runtime mode. The keys are initialized depending on
+    attention support and what cudagraph mode is set in CompilationConfig. The
+    keys stored in the dispatcher are the only source of truth for valid
+    cudagraphs that can be dispatched at runtime.
+
+    At runtime, the dispatch method generates the runtime cudagraph mode (FULL,
+    PIECEWISE, or NONE for no cudagraph) and the valid key (batch descriptor)
+    based on the input key. After dispatching (communicated via the forward
+    context), the cudagraph wrappers will trust the dispatch key to either
+    capture or replay (if the mode matches), or pass through to the underlying
+    runnable without cudagraph (if the mode does not match or is NONE).
+    """
+
+    def __init__(self, vllm_config: VllmConfig):
+        self.vllm_config = vllm_config
+        self.compilation_config = vllm_config.compilation_config
+        self.cudagraph_mode = self.compilation_config.cudagraph_mode
+
+        # Dict to store valid cudagraph dispatching keys.
+        self.cudagraph_keys: dict[CUDAGraphMode, set[BatchDescriptor]] = {
+            CUDAGraphMode.PIECEWISE: set(),
+            CUDAGraphMode.FULL: set(),
+        }
+
+        assert not self.cudagraph_mode.requires_piecewise_compilation() or \
+            (self.compilation_config.level == CompilationLevel.PIECEWISE and
+             self.compilation_config.splitting_ops_contain_attention()), \
+            "Compilation level should be CompilationLevel.PIECEWISE when "\
+            "cudagraph_mode piecewise cudagraphs is used, "\
+            f"cudagraph_mode={self.cudagraph_mode}, "\
+            f"compilation_level={self.compilation_config.level}, "\
+            f"splitting_ops={self.compilation_config.splitting_ops}"
+
+        self.keys_initialized = False
+
+    def add_cudagraph_key(self, runtime_mode: CUDAGraphMode,
+                          batch_descriptor: BatchDescriptor):
+        assert runtime_mode in [CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL], \
+            f"Invalid cudagraph runtime mode: {runtime_mode}"
+        self.cudagraph_keys[runtime_mode].add(batch_descriptor)
+
+    def initialize_cudagraph_keys(self, cudagraph_mode: CUDAGraphMode,
+                                  uniform_decode_query_len: int):
+        # This should be called only after the attention backends are
+        # initialized.
+
+        # Note: we create all valid keys possible for cudagraph but do not
+        # guarantee all keys would be used.
For example, we create keys for + # piecewise cudagraphs when it is piecewise compilation, which is always + # valid, but for attention backend support unified routine, we may not + # trigger capturing/replaying the piecewise cudagraphs depending on + # CompilationConfig.cudagraph_mode. In addition, if we allow lazy + # capturing in future PR, some keys may never be triggered. + if cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE: + for bs in self.compilation_config.cudagraph_capture_sizes: + self.add_cudagraph_key( + cudagraph_mode.mixed_mode(), + BatchDescriptor(num_tokens=bs, uniform_decode=False)) + + # if decode cudagraph mode is FULL, and we don't already have mixed + # mode full cudagraphs then add them here. + if cudagraph_mode.decode_mode() == CUDAGraphMode.FULL \ + and cudagraph_mode.separate_routine(): + max_num_tokens = uniform_decode_query_len * \ + self.vllm_config.scheduler_config.max_num_seqs + cudagraph_capture_sizes_for_decode = [ + x for x in self.compilation_config.cudagraph_capture_sizes + if x <= max_num_tokens and x >= uniform_decode_query_len + ] + for bs in cudagraph_capture_sizes_for_decode: + self.add_cudagraph_key( + CUDAGraphMode.FULL, + BatchDescriptor(num_tokens=bs, uniform_decode=True)) + self.keys_initialized = True + + def dispatch( + self, batch_descriptor: BatchDescriptor + ) -> tuple[CUDAGraphMode, Optional[BatchDescriptor]]: + """ + Given a batch descriptor, dispatch to a cudagraph mode. + A new batch descriptor is returned as we might dispatch a uniform batch + to a graph that supports a more general batch (uniform to non-uniform). + """ + # if not initialized, just skip dispatching. + if not self.keys_initialized: + logger.warning_once("cudagraph dispatching keys are not " + "initialized. No cudagraph will be used.") + return CUDAGraphMode.NONE, None + + # check if key exists for full cudagraph + if batch_descriptor in self.cudagraph_keys[CUDAGraphMode.FULL]: + return CUDAGraphMode.FULL, batch_descriptor + + # otherwise, check if non-uniform key exists + non_uniform_key = batch_descriptor.non_uniform + if non_uniform_key in self.cudagraph_keys[CUDAGraphMode.FULL]: + return CUDAGraphMode.FULL, non_uniform_key + + # also check if non-uniform key exists for more "general" + # piecewise cudagraph + if non_uniform_key in self.cudagraph_keys[CUDAGraphMode.PIECEWISE]: + return CUDAGraphMode.PIECEWISE, non_uniform_key + + # finally, just return no cudagraphs + return CUDAGraphMode.NONE, None diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d5325287889f..9460d91c5832 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -21,7 +21,9 @@ from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention from vllm.compilation.counter import compilation_counter -from vllm.config import (CompilationLevel, VllmConfig, +from vllm.compilation.cuda_graph import CUDAGraphWrapper +from vllm.compilation.monitor import set_cudagraph_capturing_enabled +from vllm.config import (CompilationLevel, CUDAGraphMode, VllmConfig, get_layers_from_vllm_config, update_config) from vllm.distributed.eplb.eplb_state import EplbState from vllm.distributed.kv_transfer import (get_kv_transfer_group, @@ -29,7 +31,8 @@ from vllm.distributed.parallel_state import ( get_pp_group, get_tp_group, graph_capture, is_global_first_rank, prepare_communication_buffer_for_model) -from vllm.forward_context import DPMetadata, set_forward_context +from 
vllm.forward_context import (BatchDescriptor, DPMetadata, + set_forward_context) from vllm.logger import init_logger from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaBase from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding @@ -48,13 +51,15 @@ from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.tasks import GenerationTask, PoolingTask, SupportedTask from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, - GiB_bytes, LazyLoader, check_use_alibi, get_dtype_size, - is_pin_memory_available, round_up, supports_dynamo) + GiB_bytes, LazyLoader, cdiv, check_use_alibi, + get_dtype_size, is_pin_memory_available, round_up, + supports_dynamo) from vllm.v1.attention.backends.mamba_selectors import get_mamba_attn_backend from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, make_kv_sharing_fast_prefill_attention_metadata, reorder_batch_to_split_decodes_and_prefills) +from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher from vllm.v1.kv_cache_interface import (AttentionSpec, ChunkedLocalAttentionSpec, FullAttentionSpec, KVCacheConfig, @@ -218,11 +223,6 @@ def __init__( is_spec_decode=bool(self.vllm_config.speculative_config), ) - self.use_cuda_graph = ( - self.vllm_config.compilation_config.level - == CompilationLevel.PIECEWISE - and self.vllm_config.compilation_config.use_cudagraph - and not self.model_config.enforce_eager) # TODO(woosuk): Provide an option to tune the max cudagraph batch size. # The convention is different. # self.cudagraph_batch_sizes sorts in ascending order. @@ -230,8 +230,6 @@ def __init__( self.cudagraph_batch_sizes = list( reversed(self.compilation_config.cudagraph_capture_sizes)) - self.full_cuda_graph = self.compilation_config.full_cuda_graph - # Cache the device properties. self._init_device_properties() @@ -326,6 +324,12 @@ def __init__( self.kv_sharing_fast_prefill_logits_indices = torch.zeros( self.max_num_tokens, dtype=torch.int32, device=self.device) + self.uniform_decode_query_len = 1 if not self.speculative_config else \ + 1 + self.speculative_config.num_speculative_tokens + + # Cudagraph dispatcher for runtime cudagraph dispatching. + self.cudagraph_dispatcher = CudagraphDispatcher(self.vllm_config) + self.mm_budget = (MultiModalBudget( self.model_config, self.scheduler_config, @@ -471,7 +475,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: assert (task := pooling_params.task) is not None, ( "You did not set `task` in the API") - model = cast(VllmModelForPooling, self.model) + model = cast(VllmModelForPooling, self.get_model()) to_update = model.pooler.get_pooling_updates(task) to_update.apply(pooling_params) @@ -679,13 +683,11 @@ def _get_cumsum_and_arange( def _prepare_inputs( self, scheduler_output: "SchedulerOutput", - ) -> tuple[dict[str, - Any], bool, torch.Tensor, Optional[SpecDecodeMetadata], - np.ndarray, Optional[CommonAttentionMetadata]]: + ) -> tuple[dict[str, Any], torch.Tensor, Optional[SpecDecodeMetadata], + np.ndarray, Optional[CommonAttentionMetadata], int]: """ :return: tuple[ attn_metadata: layer-to-attention_metadata mapping, - attention_cuda_graphs: whether attention can run in cudagraph logits_indices, spec_decode_metadata ] """ @@ -820,7 +822,7 @@ def _prepare_inputs( # valid, we fill the padded indices with the last index. 
self.kv_sharing_fast_prefill_logits_indices[num_logits:].fill_( logits_indices[-1].item()) - if (self.use_cuda_graph + if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE and num_logits <= self.cudagraph_batch_sizes[-1]): # Use piecewise CUDA graphs. # Add padding to the batch size. @@ -925,17 +927,13 @@ def _prepare_inputs( continue attn_metadata[layer_name] = attn_metadata_i - attention_cuda_graphs = all( - g.metadata_builder.can_run_in_cudagraph(common_attn_metadata) - for g in self._attn_group_iterator()) - # Hot-Swap lora model if self.lora_config: self.set_active_loras(self.input_batch, num_scheduled_tokens) - return (attn_metadata, attention_cuda_graphs, logits_indices, - spec_decode_metadata, num_scheduled_tokens, - spec_decode_common_attn_metadata) + return (attn_metadata, logits_indices, spec_decode_metadata, + num_scheduled_tokens, spec_decode_common_attn_metadata, + max_num_scheduled_tokens) def _compute_cascade_attn_prefix_len( self, @@ -1259,6 +1257,9 @@ def _gather_mm_embeddings( return mm_embeds def get_model(self) -> nn.Module: + # get raw model out of the cudagraph wrapper. + if isinstance(self.model, CUDAGraphWrapper): + return self.model.unwrap() return self.model def get_supported_generation_tasks(self) -> list[GenerationTask]: @@ -1415,9 +1416,10 @@ def eplb_step(self, return assert self.eplb_state is not None - assert is_mixture_of_experts(self.model) + model = self.get_model() + assert is_mixture_of_experts(model) self.eplb_state.step( - self.model, + model, is_dummy, is_profile, log_stats=self.parallel_config.eplb_log_balancedness, @@ -1507,15 +1509,14 @@ def execute_model( self.vllm_config) # Prepare the decoder inputs. - (attn_metadata, attention_cuda_graphs, logits_indices, - spec_decode_metadata, num_scheduled_tokens_np, - spec_decode_common_attn_metadata) = ( - self._prepare_inputs(scheduler_output)) + (attn_metadata, logits_indices, spec_decode_metadata, + num_scheduled_tokens_np, spec_decode_common_attn_metadata, + max_query_len) = (self._prepare_inputs(scheduler_output)) num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens - if (self.use_cuda_graph + if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): - # Use piecewise CUDA graphs. + # Use CUDA graphs. # Add padding to the batch size. num_input_tokens = self.vllm_config.pad_for_cudagraph( num_scheduled_tokens) @@ -1581,10 +1582,12 @@ def execute_model( intermediate_tensors = self.sync_and_slice_intermediate_tensors( num_input_tokens, intermediate_tensors, True) - # Some attention backends only support CUDA Graphs in pure decode. - # If attention doesn't support CUDA Graphs for this batch, but we - # compiled with full CUDA graphs, we have to skip them entirely. - skip_cuda_graphs = self.full_cuda_graph and not attention_cuda_graphs + uniform_decode = (max_query_len == self.uniform_decode_query_len) and ( + num_scheduled_tokens == self.input_batch.num_reqs * max_query_len) + batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens, + uniform_decode=uniform_decode) + cudagraph_runtime_mode, batch_descriptor = \ + self.cudagraph_dispatcher.dispatch(batch_descriptor) # Run the model. # Use persistent buffers for CUDA graphs. 
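The dispatch call above replaces the old `skip_cuda_graphs` flag: the runner builds a `BatchDescriptor` for the padded batch and lets the dispatcher decide between FULL, PIECEWISE, or no cudagraph. A rough, self-contained sketch of that decision using a hypothetical stand-in key type (the real `BatchDescriptor` and `CudagraphDispatcher` live in `vllm.forward_context` and `vllm.v1.cudagraph_dispatcher`):

# Hypothetical stand-in for the dispatch decision; not vLLM's actual classes.
from typing import NamedTuple, Optional


class BatchKey(NamedTuple):
    num_tokens: int
    uniform_decode: bool = False

    @property
    def non_uniform(self) -> "BatchKey":
        return BatchKey(self.num_tokens, uniform_decode=False)


# Keys as they might be registered for cudagraph_capture_sizes=[1, 2, 4, 8]
# with FULL_AND_PIECEWISE-style behavior: piecewise keys for mixed batches,
# full keys only for uniform decode batches.
piecewise_keys = {BatchKey(n) for n in (1, 2, 4, 8)}
full_keys = {BatchKey(n, uniform_decode=True) for n in (1, 2, 4, 8)}


def dispatch(key: BatchKey) -> tuple[str, Optional[BatchKey]]:
    if key in full_keys:
        return "FULL", key
    if key.non_uniform in full_keys:
        return "FULL", key.non_uniform
    if key.non_uniform in piecewise_keys:
        return "PIECEWISE", key.non_uniform
    return "NONE", None


# A padded uniform-decode batch of 8 tokens replays the FULL decode graph,
# while a mixed prefill-decode batch of the same size falls back to PIECEWISE.
assert dispatch(BatchKey(8, uniform_decode=True))[0] == "FULL"
assert dispatch(BatchKey(8)) == ("PIECEWISE", BatchKey(8))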
@@ -1593,10 +1596,10 @@ def execute_model( self.vllm_config, num_tokens=num_input_tokens, num_tokens_across_dp=num_tokens_across_dp, - skip_cuda_graphs=skip_cuda_graphs, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=batch_descriptor, ), self.maybe_get_kv_connector_output( scheduler_output) as kv_connector_output: - model_output = self.model( input_ids=input_ids, positions=positions, @@ -2021,20 +2024,31 @@ def load_model(self, eep_scale_up: bool = False) -> None: self.model.compile( fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, backend=backend) + return + # for other compilation levels, cudagraph behavior is controlled by + # CudagraphWraper and CudagraphDispatcher of vllm. + + # wrap the model with full cudagraph wrapper if needed. + if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): + self.model = CUDAGraphWrapper(self.model, + self.vllm_config, + runtime_mode=CUDAGraphMode.FULL) def reload_weights(self) -> None: assert getattr(self, "model", None) is not None, \ "Cannot reload weights before model is loaded." model_loader = get_model_loader(self.load_config) logger.info("Reloading weights inplace...") - model_loader.load_weights(self.model, model_config=self.model_config) + model = self.get_model() + model_loader.load_weights(model, model_config=self.model_config) def save_tensorized_model( self, tensorizer_config: "TensorizerConfig", ) -> None: + model = self.get_model() TensorizerLoader.save_model( - self.model, + model, tensorizer_config=tensorizer_config, model_config=self.model_config, ) @@ -2210,31 +2224,82 @@ def _get_mm_dummy_batch( def _dummy_run( self, num_tokens: int, - capture_attn_cudagraph: bool = False, + cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, + force_attention: bool = False, + uniform_decode: bool = False, skip_eplb: bool = False, is_profile: bool = False, ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Run a dummy forward pass to warm up/profile run or capture the + CUDA graph for the model. + + Args: + num_tokens: Number of tokens to run the dummy forward pass. + cudagraph_runtime_mode: used to control the behavior. + - CUDAGraphMode.NONE: No cudagraph, for warm up and profile run + - CUDAGraphMode.PIECEWISE: Piecewise cudagraph. + - CUDAGraphMode.FULL: Full cudagraph, attention metadata is + needed. + force_attention: If True, always create attention metadata. Used to + warm up attention backend when mode is NONE. + uniform_decode: If True, the batch is a uniform decode batch. + skip_eplb: If True, skip EPLB state update. + is_profile: If True, this is a profile run. + """ + assert cudagraph_runtime_mode in { + CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL + } # Padding for DP num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens) num_tokens += num_pad + # If cudagraph_mode.decode_mode() == FULL and + # cudagraph_mode.seperate_routine(). This means that we are using + # different graphs and/or modes for mixed prefill-decode batches vs. + # uniform decode batches. A uniform decode batch means that all + # requests have identical query length, except a potential virtual + # request (shorter) in the batch account for padding. + # Uniform decode batch could either be common pure decode, where + # max_query_len == 1, or speculative decode, where + # max_query_len == 1 + num_spec_decode_tokens. + + # When setting max_query_len = 1, we switch to and capture the optimized + # routine of FA2 for pure decode, i.e., Flashdecode + an optimization + # for GQA/MQA. 
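To make the uniform-decode handling described in the comment above concrete, the request split computed just below can be sketched as a standalone helper with assumed example values (not the runner's actual code):

# Standalone sketch of how a dummy batch of num_tokens is split into
# requests; mirrors the logic that follows, with assumed example values.
def split_dummy_batch(num_tokens: int, max_num_reqs: int,
                      uniform_decode: bool, uniform_decode_query_len: int):
    max_query_len = uniform_decode_query_len if uniform_decode else num_tokens
    if uniform_decode:
        num_reqs = -(-num_tokens // max_query_len)  # cdiv
        assert num_reqs <= max_num_reqs
        tokens_per_req = [max_query_len] * num_reqs
        if num_tokens % max_query_len != 0:
            # The last, shorter request absorbs the remainder (padding).
            tokens_per_req[-1] = num_tokens % max_query_len
    else:
        num_reqs = min(num_tokens, max_num_reqs)
        tokens_per_req = [num_tokens // num_reqs] * num_reqs
        tokens_per_req[-1] += num_tokens % num_reqs
    assert sum(tokens_per_req) == num_tokens
    return max_query_len, tokens_per_req


# Spec-decode style uniform batch: query_len 4 (1 + 3 speculative tokens).
assert split_dummy_batch(64, 128, True, 4) == (4, [4] * 16)
# Mixed prefill-decode dummy batch spreads tokens across up to max_num_reqs.
assert split_dummy_batch(10, 4, False, 1) == (10, [2, 2, 2, 4])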
+ max_query_len = self.uniform_decode_query_len if uniform_decode else \ + num_tokens + # Set num_scheduled_tokens based on num_tokens and max_num_seqs # for dummy run with LoRA so that the num_reqs collectively # has num_tokens in total. assert num_tokens <= self.scheduler_config.max_num_batched_tokens max_num_reqs = self.scheduler_config.max_num_seqs - num_reqs = min(num_tokens, max_num_reqs) - min_tokens_per_req = num_tokens // num_reqs - num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs - num_scheduled_tokens_list[-1] += num_tokens % num_reqs + if uniform_decode: + num_reqs = cdiv(num_tokens, max_query_len) + assert num_reqs <= max_num_reqs, \ + "Do not capture num_reqs > max_num_reqs for uniform batch" + num_scheduled_tokens_list = [max_query_len] * num_reqs + if num_tokens % max_query_len != 0: + num_scheduled_tokens_list[-1] = num_tokens % max_query_len + else: + num_reqs = min(num_tokens, max_num_reqs) + min_tokens_per_req = num_tokens // num_reqs + num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs + num_scheduled_tokens_list[-1] += num_tokens % num_reqs + assert sum(num_scheduled_tokens_list) == num_tokens assert len(num_scheduled_tokens_list) == num_reqs num_scheduled_tokens = np.array(num_scheduled_tokens_list, dtype=np.int32) attn_metadata: Optional[dict[str, Any]] = None - if capture_attn_cudagraph: + + # If force_attention is True, we always capture attention. Otherwise, + # it only happens for cudagraph_runtime_mode=FULL. + if force_attention or cudagraph_runtime_mode == \ + CUDAGraphMode.FULL: attn_metadata = {} # Make sure max_model_len is used at the graph capture time. @@ -2255,7 +2320,7 @@ def _dummy_run( num_computed_tokens_cpu_tensor[:num_reqs], num_reqs=num_reqs, num_actual_tokens=num_tokens, - max_query_len=num_tokens, + max_query_len=max_query_len, block_table_tensor=self.input_batch.block_table[ kv_cache_group_id].get_device_tensor()[:num_reqs], slot_mapping=self.input_batch. @@ -2299,12 +2364,26 @@ def _dummy_run( intermediate_tensors = self.sync_and_slice_intermediate_tensors( num_tokens, None, False) + if cudagraph_runtime_mode == CUDAGraphMode.NONE: + batch_descriptor = None + else: + # filter out the valid batch descriptor + _cg_mode, batch_descriptor = \ + self.cudagraph_dispatcher.dispatch( + BatchDescriptor(num_tokens=num_tokens, + uniform_decode=uniform_decode)) + # sanity check + assert cudagraph_runtime_mode == _cg_mode, ( + f"Cudagraph runtime mode mismatch at dummy_run. " + f"Expected {_cg_mode}, but got {cudagraph_runtime_mode}.") with self.maybe_randomize_inputs(input_ids), set_forward_context( attn_metadata, self.vllm_config, num_tokens=num_tokens, - num_tokens_across_dp=num_tokens_across_dp): + num_tokens_across_dp=num_tokens_across_dp, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=batch_descriptor): outputs = self.model( input_ids=input_ids, positions=positions, @@ -2436,7 +2515,7 @@ def _dummy_pooler_run_task( dtype=torch.int32, device=self.device) - model = cast(VllmModelForPooling, self.model) + model = cast(VllmModelForPooling, self.get_model()) dummy_pooling_params = PoolingParams(task=task) to_update = model.pooler.get_pooling_updates(task) to_update.apply(dummy_pooling_params) @@ -2546,12 +2625,13 @@ def profile_run(self) -> None: gc.collect() def capture_model(self) -> None: - if not self.use_cuda_graph: + if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE: logger.warning( "Skipping CUDA graph capture. 
To turn on CUDA graph capture, "
-                "set -O %s and ensure `use_cudagraph` was not manually set to "
-                "False", CompilationLevel.PIECEWISE)
+                "ensure `cudagraph_mode` was not manually set to `NONE`")
             return
+        else:
+            self.initialize_cudagraph_capture()
 
         compilation_counter.num_gpu_runner_capture_triggers += 1
@@ -2576,25 +2656,41 @@ def freeze_gc():
         # Trigger CUDA graph capture for specific shapes.
         # Capture the large shapes first so that the smaller shapes
         # can reuse the memory pool allocated for the large shapes.
+        set_cudagraph_capturing_enabled(True)
         with freeze_gc(), graph_capture(device=self.device):
-            full_cg = self.full_cuda_graph
-            # Only rank 0 should print progress bar during capture
-            compilation_cases = reversed(self.cudagraph_batch_sizes)
-            if is_global_first_rank():
-                compilation_cases = tqdm(
-                    list(compilation_cases),
-                    disable=not self.load_config.use_tqdm_on_load,
-                    desc="Capturing CUDA graph shapes")
-            for num_tokens in compilation_cases:
-                # We skip EPLB here since we don't want to record dummy metrics
-                for _ in range(
-                        self.compilation_config.cudagraph_num_of_warmups):
-                    self._dummy_run(num_tokens,
-                                    capture_attn_cudagraph=full_cg,
-                                    skip_eplb=True)
-                self._dummy_run(num_tokens,
-                                capture_attn_cudagraph=full_cg,
-                                skip_eplb=True)
+            cudagraph_mode = self.compilation_config.cudagraph_mode
+            if cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE:
+                cudagraph_runtime_mode = cudagraph_mode.mixed_mode()
+
+                compilation_cases = list(reversed(self.cudagraph_batch_sizes))
+                self._capture_cudagraphs(
+                    compilation_cases,
+                    cudagraph_runtime_mode=cudagraph_runtime_mode,
+                    uniform_decode=False)
+
+            # Capture full cudagraph for uniform decode batches if we don't
+            # already have full mixed prefill-decode cudagraphs.
+            if cudagraph_mode.decode_mode() == CUDAGraphMode.FULL and \
+                cudagraph_mode.separate_routine():
+                max_num_tokens = self.scheduler_config.max_num_seqs * \
+                    self.uniform_decode_query_len
+                decode_cudagraph_batch_sizes = [
+                    x for x in self.cudagraph_batch_sizes if
+                    x <= max_num_tokens and x >= self.uniform_decode_query_len
+                ]
+                compilation_cases_decode = list(
+                    reversed(decode_cudagraph_batch_sizes))
+                self._capture_cudagraphs(
+                    compilation_cases=compilation_cases_decode,
+                    cudagraph_runtime_mode=CUDAGraphMode.FULL,
+                    uniform_decode=True)
+
+        # Disable cudagraph capturing globally, so any unexpected cudagraph
+        # capturing will be detected and raise an error after here.
+        # Note: We don't put it into graph_capture context manager because
+        # we may do lazy capturing in the future that still allows capturing
+        # after here.
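A small worked example of the decode capture-size filtering above, with assumed capture sizes and scheduler limits rather than values from any real config:

# Assumed example values; capture sizes and limits are illustrative only.
cudagraph_capture_sizes = [512, 256, 128, 64, 32, 16, 8, 4, 2, 1]
max_num_seqs = 64
uniform_decode_query_len = 2  # e.g. 1 + one speculative token

# Mixed prefill-decode graphs are captured for every configured size.
mixed_sizes = sorted(cudagraph_capture_sizes, reverse=True)
assert len(mixed_sizes) == 10

# Uniform-decode FULL graphs only make sense between one full request
# (uniform_decode_query_len tokens) and max_num_seqs full requests.
max_decode_tokens = max_num_seqs * uniform_decode_query_len  # 128
decode_sizes = [
    x for x in cudagraph_capture_sizes
    if uniform_decode_query_len <= x <= max_decode_tokens
]
assert decode_sizes == [128, 64, 32, 16, 8, 4, 2]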
+ set_cudagraph_capturing_enabled(False) end_time = time.perf_counter() end_free_gpu_memory = torch.cuda.mem_get_info()[0] @@ -2604,6 +2700,41 @@ def freeze_gc(): logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", elapsed_time, cuda_graph_size / (1 << 30)) + def _capture_cudagraphs(self, compilation_cases: list[int], + cudagraph_runtime_mode: CUDAGraphMode, + uniform_decode: bool): + assert cudagraph_runtime_mode != CUDAGraphMode.NONE and \ + cudagraph_runtime_mode in [CUDAGraphMode.FULL, + CUDAGraphMode.PIECEWISE] + + # Only rank 0 should print progress bar during capture + if is_global_first_rank(): + compilation_cases = tqdm( + compilation_cases, + disable=not self.load_config.use_tqdm_on_load, + desc="Capturing CUDA graphs ({}, {})".format( + "decode" if uniform_decode else "mixed prefill-decode", + cudagraph_runtime_mode.name)) + # We skip EPLB here since we don't want to record dummy metrics + for num_tokens in compilation_cases: + for _ in range(self.compilation_config.cudagraph_num_of_warmups): + # Use CUDAGraphRuntimeStyle.NONE (default) for warmup. + # But be careful, warm up with `NONE`is orthogonal to + # if we want to warm up attention or not. This is + # different from the case where `FULL` implies capture + # attention while `PIECEWISE` implies no attention. + force_attention = ( + cudagraph_runtime_mode == CUDAGraphMode.FULL) + self._dummy_run(num_tokens, + cudagraph_runtime_mode=CUDAGraphMode.NONE, + force_attention=force_attention, + uniform_decode=uniform_decode, + skip_eplb=True) + self._dummy_run(num_tokens, + cudagraph_runtime_mode=cudagraph_runtime_mode, + uniform_decode=uniform_decode, + skip_eplb=True) + def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None: """ Initialize the attention backends and attention metadata builders. @@ -2648,25 +2779,6 @@ def create_attn_groups( attn_metadata_builder_i, layer_names) attn_groups.append(attn_group) - - if self.full_cuda_graph: - if attn_metadata_builder_i.attn_cudagraph_support == \ - AttentionCGSupport.NEVER: - raise ValueError( - f"Full CUDAGraph not supported for " - f"{attn_backend.__name__}. Turn off " - f"CompilationConfig.full_cuda_graph or use a " - f" different attention backend.") - if attn_metadata_builder_i.attn_cudagraph_support == \ - AttentionCGSupport.PURE_DECODE_ONLY: - # Limit the max cudagraph size to the max number of - # sequences for pure decode only cudagraph backend, - # whose max_query_len is 1. 
- self.cudagraph_batch_sizes = [ - size for size in self.cudagraph_batch_sizes - if size <= self.scheduler_config.max_num_seqs - ] - return attn_groups for kv_cache_group_spec in kv_cache_config.kv_cache_groups: @@ -2734,6 +2846,75 @@ def create_attn_groups( "All or none of the layers are expected to be encoder-only" self.is_encoder_only_model = True + def initialize_cudagraph_capture(self) -> None: + min_cg_support = AttentionCGSupport.ALWAYS + min_cg_builder_name = None + + for attn_group in self._attn_group_iterator(): + builder = attn_group.metadata_builder + if builder.cudagraph_support.value < min_cg_support.value: + min_cg_support = builder.cudagraph_support + min_cg_builder_name = builder.__class__.__name__ + + # Flexible resolve the cudagraph mode + cudagraph_mode = self.compilation_config.cudagraph_mode + # check cudagraph for mixed batch is supported + if cudagraph_mode.mixed_mode() == CUDAGraphMode.FULL \ + and min_cg_support != AttentionCGSupport.ALWAYS: + msg = (f"CUDAGraphMode.{cudagraph_mode.name} is not supported " + f"with {min_cg_builder_name} backend (support: " + f"{min_cg_support})") + if min_cg_support == AttentionCGSupport.NEVER: + # if not supported any full cudagraphs, just raise it. + msg += "; please try cudagraph_mode=PIECEWISE, and "\ + "make sure compilation level is piecewise" + raise ValueError(msg) + + # attempt to resolve the full cudagraph related mode + if self.compilation_config.splitting_ops_contain_attention(): + msg += "; setting cudagraph_mode=FULL_AND_PIECEWISE" + cudagraph_mode = self.compilation_config.cudagraph_mode = \ + CUDAGraphMode.FULL_AND_PIECEWISE + else: + msg += "; setting cudagraph_mode=FULL_DECODE_ONLY" + cudagraph_mode = self.compilation_config.cudagraph_mode = \ + CUDAGraphMode.FULL_DECODE_ONLY + logger.warning(msg) + + # check that if we are doing spec-decode + decode full-cudagraphs it is + # supported + if (cudagraph_mode.decode_mode() == CUDAGraphMode.FULL + and self.uniform_decode_query_len > 1 and min_cg_support.value + < AttentionCGSupport.UNIFORM_BATCH.value): + msg = (f"CUDAGraphMode.{cudagraph_mode.name} is not supported" + f" with spec-decode for attention backend " + f"{min_cg_builder_name} (support: {min_cg_support})") + if self.compilation_config.splitting_ops_contain_attention(): + msg += "; setting cudagraph_mode=PIECEWISE" + cudagraph_mode = self.compilation_config.cudagraph_mode = \ + CUDAGraphMode.PIECEWISE + else: + msg += "; setting cudagraph_mode=NONE" + cudagraph_mode = self.compilation_config.cudagraph_mode = \ + CUDAGraphMode.NONE + logger.warning(msg) + + # double check that we can support full cudagraph if they are requested + # even after automatic downgrades + if cudagraph_mode.has_full_cudagraphs() \ + and min_cg_support == AttentionCGSupport.NEVER: + raise ValueError(f"CUDAGraphMode.{cudagraph_mode.name} is not " + f"supported with {min_cg_builder_name} backend (" + f"support:{min_cg_support}) " + "; please try cudagraph_mode=PIECEWISE, " + "and make sure compilation level is piecewise") + + # Trigger cudagraph dispatching keys initialization here (after + # initializing attn backends). 
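Before the dispatcher keys are initialized in the call that follows, the mode resolution above can be summarized as a small decision helper; this is a sketch under the assumption that modes and support levels behave as described in this hunk, with illustrative names only:

# Illustrative decision table for resolving the requested cudagraph mode
# against the weakest attention-backend support level; mirrors the checks
# above but is not the runner's actual code.
from enum import IntEnum


class CGSupport(IntEnum):
    NEVER = 0
    UNIFORM_SINGLE_TOKEN_DECODE = 1
    UNIFORM_BATCH = 2
    ALWAYS = 3


def resolve_mode(requested: str, min_support: CGSupport,
                 piecewise_compiled: bool, spec_decode: bool) -> str:
    mode = requested
    # FULL for mixed batches needs ALWAYS support; otherwise downgrade.
    if mode == "FULL" and min_support != CGSupport.ALWAYS:
        if min_support == CGSupport.NEVER:
            raise ValueError("use PIECEWISE with piecewise compilation")
        mode = ("FULL_AND_PIECEWISE" if piecewise_compiled
                else "FULL_DECODE_ONLY")
    # FULL decode graphs with spec-decode need at least UNIFORM_BATCH support.
    if (mode in ("FULL", "FULL_AND_PIECEWISE", "FULL_DECODE_ONLY")
            and spec_decode
            and min_support < CGSupport.UNIFORM_BATCH):
        mode = "PIECEWISE" if piecewise_compiled else "NONE"
    return mode


# A backend with decode-only support, no spec-decode, piecewise compilation:
assert resolve_mode("FULL", CGSupport.UNIFORM_SINGLE_TOKEN_DECODE,
                    True, False) == "FULL_AND_PIECEWISE"
# The same backend with spec-decode falls back to piecewise graphs only.
assert resolve_mode("FULL", CGSupport.UNIFORM_SINGLE_TOKEN_DECODE,
                    True, True) == "PIECEWISE"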
+ self.cudagraph_dispatcher.initialize_cudagraph_keys( + self.compilation_config.cudagraph_mode, + self.uniform_decode_query_len) + def calculate_reorder_batch_threshold(self) -> None: """ Check that if any backends reorder batches; that the reordering diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 84f065f25f2e..04de8d36680a 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -322,16 +322,11 @@ def compile_or_warm_up_model(self) -> None: if get_pp_group().is_last_rank: max_num_reqs = min(self.scheduler_config.max_num_seqs, self.scheduler_config.max_num_batched_tokens) - # activate building attn_metadata for this dummy run to avoid - # potential illegal memory access for full cudagraph relay. - attn_cudagraph = self.compilation_config.full_cuda_graph and\ - not self.model_config.enforce_eager # We skip EPLB here since we don't want to record dummy metrics hidden_states, last_hidden_states = \ self.model_runner._dummy_run( num_tokens=max_num_reqs, - capture_attn_cudagraph=attn_cudagraph, skip_eplb=True, ) if self.model_runner.is_pooling_model: From 50c1a08a1734e634251db213a7bdc16547217625 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 15 Aug 2025 08:22:31 -0700 Subject: [PATCH 048/231] [V0 Deprecation] Remove advance_step (#22969) Signed-off-by: Woosuk Kwon Signed-off-by: Duncan Moss --- CMakeLists.txt | 1 - csrc/ops.h | 16 - csrc/prepare_inputs/advance_step.cu | 336 ------------------ csrc/prepare_inputs/advance_step.cuh | 19 - csrc/torch_bindings.cpp | 19 - vllm/_custom_ops.py | 32 -- vllm/attention/backends/abstract.py | 5 - .../backends/differential_flash_attn.py | 76 +--- vllm/attention/backends/flash_attn.py | 76 +--- vllm/attention/backends/flashinfer.py | 65 +--- vllm/attention/backends/flashmla.py | 15 +- vllm/attention/backends/mla/common.py | 87 +---- vllm/attention/backends/placeholder_attn.py | 62 +--- vllm/attention/backends/rocm_aiter_mla.py | 21 -- vllm/attention/backends/rocm_flash_attn.py | 68 +--- vllm/worker/model_runner.py | 3 +- 16 files changed, 9 insertions(+), 892 deletions(-) delete mode 100644 csrc/prepare_inputs/advance_step.cu delete mode 100644 csrc/prepare_inputs/advance_step.cuh diff --git a/CMakeLists.txt b/CMakeLists.txt index dcec854a0872..cda1ffc795d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -249,7 +249,6 @@ set(VLLM_EXT_SRC "csrc/quantization/gguf/gguf_kernel.cu" "csrc/quantization/activation_kernels.cu" "csrc/cuda_utils_kernels.cu" - "csrc/prepare_inputs/advance_step.cu" "csrc/custom_all_reduce.cu" "csrc/torch_bindings.cpp") diff --git a/csrc/ops.h b/csrc/ops.h index 207291eceb16..3e29f0a973dd 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -145,22 +145,6 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input); void gelu_quick(torch::Tensor& out, torch::Tensor& input); -void advance_step_flashattn(int64_t num_seqs, int64_t num_queries, - int64_t block_size, torch::Tensor& input_tokens, - torch::Tensor& sampled_token_ids, - torch::Tensor& input_positions, - torch::Tensor& seq_lens, - torch::Tensor& slot_mapping, - torch::Tensor& block_tables); - -void advance_step_flashinfer( - int64_t num_seqs, int64_t num_queries, int64_t block_size, - torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids, - torch::Tensor& input_positions, torch::Tensor& seq_lens, - torch::Tensor& slot_mapping, torch::Tensor& block_tables, - torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr, - torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds); - void 
cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope, torch::Tensor const& q_pe, torch::Tensor const& kv_c_and_k_pe_cache, diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu deleted file mode 100644 index 3d5077d9de46..000000000000 --- a/csrc/prepare_inputs/advance_step.cu +++ /dev/null @@ -1,336 +0,0 @@ -/* - * The goal of this GPU kernel is to advance input tensors on the GPU directly - * PR: https://github.com/vllm-project/vllm/pull/6338 - * Current restrictions: - * 1. Specialized for DraftModelRunner - * 2. Supports flash_attn only - */ - -#include "advance_step.cuh" - -namespace prepare_inputs { - -// -template -__global__ void advance_step_flashattn_kernel( - int num_seqs, int num_queries, int block_size, long* input_tokens_ptr, - long const* sampled_token_ids_ptr, long* input_positions_ptr, - int* seq_lens_ptr, long* slot_mapping_ptr, int const* block_tables_ptr, - int64_t const block_tables_stride) { - int const n_pad = num_seqs - num_queries; - if (n_pad && blockIdx.x == 0) { - // Handle cuda graph padding - int const offset = num_queries; - for (int i = threadIdx.x; i < n_pad; i += blockDim.x) { - input_tokens_ptr[offset + i] = 0; - input_positions_ptr[offset + i] = 0; - slot_mapping_ptr[offset + i] = -1; - } - } - - int num_query_blocks = div_ceil(num_queries, num_threads); - - if (blockIdx.x >= num_query_blocks) { - return; - } - - int cur_query_id = blockIdx.x * num_threads + threadIdx.x; - - if (cur_query_id >= num_queries) { - return; - } - - // Update input_tokens - input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id]; - - int seq_len = seq_lens_ptr[cur_query_id]; - int next_seq_len = seq_len + 1; - int next_input_pos = next_seq_len - 1; - - // Update seq_lens - seq_lens_ptr[cur_query_id] = next_seq_len; - // Update input_positions - input_positions_ptr[cur_query_id] = next_input_pos; - - int const* seq_block_tables_ptr = - block_tables_ptr + block_tables_stride * cur_query_id; - - int block_index = next_input_pos / block_size; - int block_offset = next_input_pos % block_size; - - int slot_num = seq_block_tables_ptr[block_index] * block_size + block_offset; - // Update slot_mapping - slot_mapping_ptr[cur_query_id] = slot_num; -} - -inline void verify_tensor(std::string const& name, torch::Tensor const& t, - int64_t const size_0, int64_t const size_1, - c10::ScalarType const type) { - bool size_0_cond = true; - if (size_0 != -1) { - size_0_cond = t.size(0) == size_0; - } - - bool size_1_cond = true; - if (size_1 != -1) { - size_1_cond = t.size(1) == size_1; - } - - bool is_contiguous = t.is_contiguous(); - bool same_type = t.dtype() == type; - - bool pass = size_0_cond && size_1_cond && is_contiguous && same_type; - if (!pass) { - TORCH_CHECK(false, "tensor: name = ", name, ", shape = ", t.sizes(), - " is_cont = ", t.is_contiguous(), ", type = ", t.dtype(), - " is not as expected: shape = [", size_0, ", ", size_1, - "], type = ", type); - } -} - -/// each thread processes a block per query -__global__ void advance_step_flashinfer_kernel( - int num_threads, int num_seqs, int num_queries, int block_size, - long* input_tokens_ptr, long const* sampled_token_ids_ptr, - long* input_positions_ptr, int* seq_lens_ptr, long* slot_mapping_ptr, - int const* block_tables_ptr, int64_t const block_tables_stride, - int* paged_kv_last_page_len_ptr, int* block_table_bound_ptr) { - int const n_pad = num_seqs - num_queries; - if (n_pad && blockIdx.x == 0) { - // Handle cuda graph padding - int const offset = num_queries; - 
for (int i = threadIdx.x; i < n_pad; i += blockDim.x) { - input_tokens_ptr[offset + i] = 0; - input_positions_ptr[offset + i] = 0; - slot_mapping_ptr[offset + i] = -1; - } - } - int num_query_blocks = div_ceil(num_queries, num_threads); - - if (blockIdx.x < num_query_blocks) { - int cur_query_id = blockIdx.x * num_threads + threadIdx.x; - - if (cur_query_id < num_queries) { - // Update input_tokens - input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id]; - - int seq_len = seq_lens_ptr[cur_query_id]; - int next_seq_len = seq_len + 1; - int next_input_pos = next_seq_len - 1; - - // Update seq_lens - seq_lens_ptr[cur_query_id] = next_seq_len; - // Update input_positions - input_positions_ptr[cur_query_id] = next_input_pos; - - int const* seq_block_tables_ptr = - block_tables_ptr + block_tables_stride * cur_query_id; - - int block_index = next_input_pos / block_size; - int block_offset = next_input_pos % block_size; - - // Update paged_kv_last_page_len - paged_kv_last_page_len_ptr[cur_query_id] = block_offset + 1; - - int slot_num = - seq_block_tables_ptr[block_index] * block_size + block_offset; - // Update slot_mapping - slot_mapping_ptr[cur_query_id] = slot_num; - block_table_bound_ptr[cur_query_id] = div_ceil(next_seq_len, block_size); - } - } -} - -__global__ void advance_step_flashinfer_indptr_kernel( - int num_threads, int num_seqs, int num_queries, int* paged_kv_indptr_ptr, - int* block_table_bound_ptr) { - int idx = blockIdx.x * num_threads + threadIdx.x; - // Update paged_kv_indptr - if (idx == 0) { - paged_kv_indptr_ptr[idx] = 0; - } - if (idx < num_queries) { - int sum = 0; - for (int i = 0; i <= idx; ++i) { - sum += block_table_bound_ptr[i]; - } - paged_kv_indptr_ptr[idx + 1] = sum; - } -} - -__global__ void advance_step_flashinfer_indices_kernel( - int num_seqs, int num_queries, int const* block_tables_ptr, - int64_t const max_num_blocks_per_seq, int* paged_kv_indices_ptr, - int* paged_kv_indptr_ptr, int* block_table_bound_ptr) { - // note: max_num_blocks_per_seq = block_tables.stride(0) - int tid = blockIdx.x * blockDim.x + threadIdx.x; - - // when cuda graphs are enabled, paged_kv_indptr tensor - // has to be updated for the padded queries - // tid represents a query# for paged_kv_indptr tensor - if (num_queries < tid && tid <= num_seqs) { - paged_kv_indptr_ptr[tid] = paged_kv_indptr_ptr[num_queries]; - } - - // each thread processes a block_ptr in block_tables - // block_tables shape: [num_queries, max_num_blocks_per_seq] - // paged_kv_indices is flattened block_tables. 
- for (int idx = tid; idx < (num_seqs * max_num_blocks_per_seq); - idx += (gridDim.x * blockDim.x)) { - // block_tables-row = paged_kv_indptr[queryNum] - int queryNum = idx / max_num_blocks_per_seq; - int col = idx % max_num_blocks_per_seq; - if (queryNum < num_queries && col < block_table_bound_ptr[queryNum]) { - int indices_arr_idx = paged_kv_indptr_ptr[queryNum] + col; - int block_tables_idx = queryNum * max_num_blocks_per_seq + col; - paged_kv_indices_ptr[indices_arr_idx] = - block_tables_ptr[block_tables_idx]; - } - } -} - -void advance_step_flashattn(int num_seqs, int num_queries, int block_size, - torch::Tensor& input_tokens, // type: long - torch::Tensor& sampled_token_ids, // type: long - torch::Tensor& input_positions, // type: long - torch::Tensor& seq_lens, // type: int - torch::Tensor& slot_mapping, // type: long - torch::Tensor& block_tables) { // type: int - - if (logging) { - printf("advance_step_flashattn:\n"); - printf(" num_seqs = %d\n", num_seqs); - printf(" num_queries = %d\n", num_queries); - printf(" block_size = %d\n", block_size); - } - // Verify all tensors - verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong); - verify_tensor("sampled_token_ids", sampled_token_ids, num_queries, 1, - at::kLong); - verify_tensor("input_positions", input_positions, num_seqs, -1, at::kLong); - verify_tensor("seq_lens", seq_lens, num_seqs, -1, at::kInt); - verify_tensor("slot_mapping", slot_mapping, num_seqs, -1, at::kLong); - verify_tensor("block_tables", block_tables, num_seqs, -1, at::kInt); - - int dev = sampled_token_ids.get_device(); - cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev); - - int blocks; - cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); - - advance_step_flashattn_kernel - <<>>( - num_seqs, num_queries, block_size, - reinterpret_cast(input_tokens.data_ptr()), - reinterpret_cast(sampled_token_ids.data_ptr()), - reinterpret_cast(input_positions.data_ptr()), - reinterpret_cast(seq_lens.data_ptr()), - reinterpret_cast(slot_mapping.data_ptr()), - reinterpret_cast(block_tables.data_ptr()), - block_tables.stride(0)); -} - -void advance_step_flashinfer( - int num_seqs, int num_queries, int block_size, - torch::Tensor& input_tokens, // type: long - torch::Tensor& sampled_token_ids, // type: long - torch::Tensor& input_positions, // type: long - torch::Tensor& seq_lens, // type: int - torch::Tensor& slot_mapping, // type: long - torch::Tensor& block_tables, // type: int - torch::Tensor& paged_kv_indices, // type: int - torch::Tensor& paged_kv_indptr, // type: int - torch::Tensor& paged_kv_last_page_len, // type: int - torch::Tensor& block_table_bound) { // type: int - - if (logging) { - printf("advance_step_flashinfer:\n"); - printf(" num_seqs = %d\n", num_seqs); - printf(" num_queries = %d\n", num_queries); - printf(" block_size = %d\n", block_size); - printf(" block_tables.stride(0) = %zu\n", block_tables.stride(0)); - } - // Verify all tensors - verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong); - // verify_tensor("sampled_token_ids", sampled_token_ids, num_queries, 1, - // at::kLong); - verify_tensor("input_positions", input_positions, num_seqs, -1, at::kLong); - verify_tensor("seq_lens", seq_lens, num_seqs, -1, at::kInt); - verify_tensor("slot_mapping", slot_mapping, num_seqs, -1, at::kLong); - verify_tensor("block_tables", block_tables, num_seqs, -1, at::kInt); - - verify_tensor("paged_kv_indices", paged_kv_indices, -1, -1, at::kInt); - verify_tensor("paged_kv_indptr", paged_kv_indptr, num_seqs + 1, -1, 
at::kInt); - verify_tensor("paged_kv_last_page_len", paged_kv_last_page_len, num_seqs, -1, - at::kInt); - - verify_tensor("block_table_bound", block_table_bound, num_seqs, -1, at::kInt); - - int dev = sampled_token_ids.get_device(); - cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev); - - int blocks; - int threads; - cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); - cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev); - - TORCH_CHECK((blocks * threads > num_queries), - "multi-step: not enough threads to map to num_queries = ", - num_queries, " block_tables.stride(0) = ", block_tables.stride(0), - " blocks = ", blocks, " max_threads = ", threads); - if (logging) { - printf("launching kernels with %d blocks and %d threads\n", blocks, - threads); - } - advance_step_flashinfer_kernel<<>>( - threads, num_seqs, num_queries, block_size, - reinterpret_cast(input_tokens.data_ptr()), - reinterpret_cast(sampled_token_ids.data_ptr()), - reinterpret_cast(input_positions.data_ptr()), - reinterpret_cast(seq_lens.data_ptr()), - reinterpret_cast(slot_mapping.data_ptr()), - reinterpret_cast(block_tables.data_ptr()), - block_tables.stride(0), - reinterpret_cast(paged_kv_last_page_len.data_ptr()), - reinterpret_cast(block_table_bound.data_ptr())); - - advance_step_flashinfer_indptr_kernel<<>>( - threads, num_seqs, num_queries, - reinterpret_cast(paged_kv_indptr.data_ptr()), - reinterpret_cast(block_table_bound.data_ptr())); - - advance_step_flashinfer_indices_kernel<<>>( - num_seqs, num_queries, - reinterpret_cast(block_tables.data_ptr()), - block_tables.stride(0), - reinterpret_cast(paged_kv_indices.data_ptr()), - reinterpret_cast(paged_kv_indptr.data_ptr()), - reinterpret_cast(block_table_bound.data_ptr())); -} - -} // namespace prepare_inputs - -void advance_step_flashattn(int64_t num_seqs, int64_t num_queries, - int64_t block_size, torch::Tensor& input_tokens, - torch::Tensor& sampled_token_ids, - torch::Tensor& input_positions, - torch::Tensor& seq_lens, - torch::Tensor& slot_mapping, - torch::Tensor& block_tables) { - prepare_inputs::advance_step_flashattn( - num_seqs, num_queries, block_size, input_tokens, sampled_token_ids, - input_positions, seq_lens, slot_mapping, block_tables); -} - -void advance_step_flashinfer( - int64_t num_seqs, int64_t num_queries, int64_t block_size, - torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids, - torch::Tensor& input_positions, torch::Tensor& seq_lens, - torch::Tensor& slot_mapping, torch::Tensor& block_tables, - torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr, - torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bound) { - prepare_inputs::advance_step_flashinfer( - num_seqs, num_queries, block_size, input_tokens, sampled_token_ids, - input_positions, seq_lens, slot_mapping, block_tables, paged_kv_indices, - paged_kv_indptr, paged_kv_last_page_len, block_table_bound); -} diff --git a/csrc/prepare_inputs/advance_step.cuh b/csrc/prepare_inputs/advance_step.cuh deleted file mode 100644 index f21574681b1a..000000000000 --- a/csrc/prepare_inputs/advance_step.cuh +++ /dev/null @@ -1,19 +0,0 @@ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include - -namespace prepare_inputs { - -static constexpr int max_threads = 256; -static constexpr bool logging = false; - -constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; } - -} // namespace prepare_inputs diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 8c207be083d8..a547baec50d6 
100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -142,25 +142,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("gelu_quick(Tensor! out, Tensor input) -> ()"); ops.impl("gelu_quick", torch::kCUDA, &gelu_quick); - // prepare_inputs advance_step - ops.def( - "advance_step_flashattn(int num_seqs, int num_queries, int block_size, " - "Tensor! input_tokens, Tensor sampled_token_ids, " - "Tensor! input_positions, Tensor! seq_lens, Tensor! slot_mapping, " - "Tensor block_tables) -> ()"); - ops.impl("advance_step_flashattn", torch::kCUDA, &advance_step_flashattn); - - ops.def( - "advance_step_flashinfer(" - " int num_seqs, int num_queries, int block_size," - " Tensor! input_tokens, Tensor sampled_token_ids," - " Tensor! input_positions, Tensor! seq_lens, Tensor! slot_mapping," - " Tensor block_tables, Tensor! paged_kv_indices," - " Tensor! paged_kv_indptr, Tensor! paged_kv_last_page_len," - " Tensor! block_table_bounds" - ") -> ()"); - ops.impl("advance_step_flashinfer", torch::kCUDA, &advance_step_flashinfer); - // Layernorm // Apply Root Mean Square (RMS) Normalization to the input tensor. ops.def( diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index a020b171e894..a318637c5aeb 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -319,38 +319,6 @@ def apply_repetition_penalties(logits: torch.Tensor, prompt_mask: torch.Tensor, repetition_penalties) -def advance_step_flashattn(num_seqs: int, num_queries: int, block_size: int, - input_tokens: torch.Tensor, - sampled_token_ids: torch.Tensor, - input_positions: torch.Tensor, - seq_lens: torch.Tensor, slot_mapping: torch.Tensor, - block_tables: torch.Tensor) -> None: - """Advance a step on GPU for existing inputs for a multi-step runner""" - return torch.ops._C.advance_step_flashattn(num_seqs, num_queries, - block_size, input_tokens, - sampled_token_ids, - input_positions, seq_lens, - slot_mapping, block_tables) - - -def advance_step_flashinfer(num_seqs: int, num_queries: int, block_size: int, - input_tokens: torch.Tensor, - sampled_token_ids: torch.Tensor, - input_positions: torch.Tensor, - seq_lens: torch.Tensor, slot_mapping: torch.Tensor, - block_tables: torch.Tensor, - paged_kv_indices: torch.Tensor, - paged_kv_indptr: torch.Tensor, - paged_kv_last_page_len: torch.Tensor, - block_table_bound: torch.Tensor) -> None: - - return torch.ops._C.advance_step_flashinfer( - num_seqs, num_queries, block_size, input_tokens, sampled_token_ids, - input_positions, seq_lens, slot_mapping, block_tables, - paged_kv_indices, paged_kv_indptr, paged_kv_last_page_len, - block_table_bound) - - # fused quant layer norm ops def rms_norm_dynamic_per_token_quant( input: torch.Tensor, diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 2417fe06a675..d21f07756871 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -101,11 +101,6 @@ def copy_blocks( ) -> None: raise NotImplementedError - def advance_step(self, model_input: "ModelRunnerInputBase", - sampled_token_ids: Optional[torch.Tensor], - block_size: int, num_seqs: int, num_queries: int) -> None: - raise NotImplementedError - @classmethod def full_cls_name(cls) -> tuple[str, str]: return (cls.__module__, cls.__qualname__) diff --git a/vllm/attention/backends/differential_flash_attn.py b/vllm/attention/backends/differential_flash_attn.py index bd9bc427728d..fac3c318a87a 100644 --- a/vllm/attention/backends/differential_flash_attn.py +++ b/vllm/attention/backends/differential_flash_attn.py @@ 
-35,8 +35,7 @@ flash_attn_with_kvcache) if TYPE_CHECKING: - from vllm.worker.model_runner import (ModelInputForGPUBuilder, - ModelInputForGPUWithSamplingMetadata) + from vllm.worker.model_runner import ModelInputForGPUBuilder logger = init_logger(__name__) @@ -326,79 +325,6 @@ def decode_metadata( cross_block_tables=self.cross_block_tables) return self._cached_decode_metadata - def advance_step(self, - model_input: "ModelInputForGPUWithSamplingMetadata", - sampled_token_ids: Optional[torch.Tensor], - block_size: int, - num_seqs: int, - num_queries: int, - turn_prefills_into_decodes: bool = False): - """ - Update metadata in-place to advance one decode step. - """ - # When using cudagraph, the num_seqs is padded to the next captured - # batch sized, but num_queries tracks the actual number of requests in - # the batch. For --enforce-eager mode, num_seqs == num_queries - if num_seqs != num_queries: - assert num_seqs > num_queries - assert self.use_cuda_graph - - if turn_prefills_into_decodes: - # When Multi-Step is enabled with Chunked-Prefill, prefills and - # decodes are scheduled together. In the first step, all the - # prefills turn into decodes. This update reflects that - # conversion. - assert self.num_decode_tokens + self.num_prefills == num_seqs - self.num_decode_tokens += self.num_prefills - self.num_prefills = 0 - self.num_prefill_tokens = 0 - self.max_prefill_seq_len = 0 - self.max_query_len = 1 - - self.slot_mapping = self.slot_mapping[:num_seqs] - else: - assert self.seq_lens is not None - assert self.max_decode_seq_len == max(self.seq_lens) - - assert self.num_prefills == 0 - assert self.num_prefill_tokens == 0 - assert self.num_decode_tokens == num_seqs - assert self.slot_mapping.shape == (num_seqs, ) - - assert self.seq_lens is not None - assert len(self.seq_lens) == num_seqs - assert self.seq_lens_tensor is not None - assert self.seq_lens_tensor.shape == (num_seqs, ) - assert self.max_query_len == 1 - assert self.max_prefill_seq_len == 0 - - assert self.query_start_loc is not None - assert self.query_start_loc.shape == (num_queries + 1, ) - assert self.seq_start_loc is not None - assert self.seq_start_loc.shape == (num_seqs + 1, ) - - assert self.context_lens_tensor is not None - assert self.context_lens_tensor.shape == (num_queries, ) - - assert self.block_tables is not None - assert self.block_tables.shape[0] == num_seqs - - # Update query lengths. 
Note that we update only queries and not seqs, - # since tensors may be padded due to captured cuda graph batch size - for i in range(num_queries): - self.seq_lens[i] += 1 - self.max_decode_seq_len = max(self.seq_lens) - - ops.advance_step_flashattn(num_seqs=num_seqs, - num_queries=num_queries, - block_size=block_size, - input_tokens=model_input.input_tokens, - sampled_token_ids=sampled_token_ids, - input_positions=model_input.input_positions, - seq_lens=self.seq_lens_tensor, - slot_mapping=self.slot_mapping, - block_tables=self.block_tables) - class DifferentialFlashAttentionMetadataBuilder( AttentionMetadataBuilder[DifferentialFlashAttentionMetadata]): diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index ee36fd19e012..e52480d5c5ce 100755 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -32,8 +32,7 @@ flash_attn_with_kvcache) if TYPE_CHECKING: - from vllm.worker.model_runner import (ModelInputForGPUBuilder, - ModelInputForGPUWithSamplingMetadata) + from vllm.worker.model_runner import ModelInputForGPUBuilder logger = init_logger(__name__) @@ -309,79 +308,6 @@ def decode_metadata(self) -> Optional["FlashAttentionMetadata"]: cross_block_tables=self.cross_block_tables) return self._cached_decode_metadata - def advance_step(self, - model_input: "ModelInputForGPUWithSamplingMetadata", - sampled_token_ids: Optional[torch.Tensor], - block_size: int, - num_seqs: int, - num_queries: int, - turn_prefills_into_decodes: bool = False): - """ - Update metadata in-place to advance one decode step. - """ - # When using cudagraph, the num_seqs is padded to the next captured - # batch sized, but num_queries tracks the actual number of requests in - # the batch. For --enforce-eager mode, num_seqs == num_queries - if num_seqs != num_queries: - assert num_seqs > num_queries - assert self.use_cuda_graph - - if turn_prefills_into_decodes: - # When Multi-Step is enabled with Chunked-Prefill, prefills and - # decodes are scheduled together. In the first step, all the - # prefills turn into decodes. This update reflects that - # conversion. - assert self.num_decode_tokens + self.num_prefills == num_seqs - self.num_decode_tokens += self.num_prefills - self.num_prefills = 0 - self.num_prefill_tokens = 0 - self.max_prefill_seq_len = 0 - self.max_query_len = 1 - - self.slot_mapping = self.slot_mapping[:num_seqs] - else: - assert self.seq_lens is not None - assert self.max_decode_seq_len == max(self.seq_lens) - - assert self.num_prefills == 0 - assert self.num_prefill_tokens == 0 - assert self.num_decode_tokens == num_seqs - assert self.slot_mapping.shape == (num_seqs, ) - - assert self.seq_lens is not None - assert len(self.seq_lens) == num_seqs - assert self.seq_lens_tensor is not None - assert self.seq_lens_tensor.shape == (num_seqs, ) - assert self.max_query_len == 1 - assert self.max_prefill_seq_len == 0 - - assert self.query_start_loc is not None - assert self.query_start_loc.shape == (num_queries + 1, ) - assert self.seq_start_loc is not None - assert self.seq_start_loc.shape == (num_seqs + 1, ) - - assert self.context_lens_tensor is not None - assert self.context_lens_tensor.shape == (num_queries, ) - - assert self.block_tables is not None - assert self.block_tables.shape[0] == num_seqs - - # Update query lengths. 
Note that we update only queries and not seqs, - # since tensors may be padded due to captured cuda graph batch size - for i in range(num_queries): - self.seq_lens[i] += 1 - self.max_decode_seq_len = max(self.seq_lens) - - ops.advance_step_flashattn(num_seqs=num_seqs, - num_queries=num_queries, - block_size=block_size, - input_tokens=model_input.input_tokens, - sampled_token_ids=sampled_token_ids, - input_positions=model_input.input_positions, - seq_lens=self.seq_lens_tensor, - slot_mapping=self.slot_mapping, - block_tables=self.block_tables) - class FlashAttentionMetadataBuilder( AttentionMetadataBuilder[FlashAttentionMetadata]): diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 78d8a67e37f8..208cacec38eb 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -51,8 +51,7 @@ logger = init_logger(__name__) if TYPE_CHECKING: - from vllm.worker.model_runner import (ModelInputForGPUBuilder, - ModelInputForGPUWithSamplingMetadata) + from vllm.worker.model_runner import ModelInputForGPUBuilder class FlashInferBackend(AttentionBackend): @@ -428,7 +427,7 @@ class FlashInferMetadata(AttentionMetadata): query_start_loc: Optional[torch.Tensor] = None block_tables: Optional[torch.Tensor] = None - # used for GPU in-place advance_step + # used for GPU operations seq_lens_tensor: Optional[torch.Tensor] = None block_table_bound: Optional[torch.Tensor] = None @@ -587,66 +586,6 @@ def decode_metadata(self) -> Optional["FlashInferMetadata"]: return None return self - def advance_step(self, - model_input: "ModelInputForGPUWithSamplingMetadata", - sampled_token_ids: Optional[torch.Tensor], - block_size: int, - num_seqs: int, - num_queries: int, - turn_prefills_into_decodes: bool = False): - """ - Update metadata in-place to advance one decode step. - """ - - if turn_prefills_into_decodes: - # When Multi-Step is enabled with Chunked-Prefill, prefills and - # decodes are scheduled together. In the first step, all the - # prefills turn into decodes. This update reflects that - # conversion. - assert self.num_decode_tokens + self.num_prefills == num_seqs - # Flashinfer doesn't support speculative decoding + chunked-prefill - # + multi-step scheduling yet. - assert self.decode_query_len == 1 - self.num_decode_tokens += self.num_prefills - self.num_prefills = 0 - self.num_prefill_tokens = 0 - self.max_prefill_seq_len = 0 - self.max_query_len = 1 - - self.slot_mapping = self.slot_mapping[:num_seqs] - else: - assert self.seq_lens_tensor is not None - - assert num_seqs > 0 - assert num_queries > 0 - assert model_input.attn_metadata is not None - assert sampled_token_ids is not None - - # When using cudagraph, the num_seqs is padded to the next captured - # batch sized, but num_queries tracks the actual number of requests in - # the batch. 
For --enforce-eager mode, num_seqs == num_queries - if num_seqs != num_queries: - assert num_seqs > num_queries - assert self.use_cuda_graph - - model_input.input_tokens[:num_queries] = sampled_token_ids.flatten() - - # Update GPU tensors - ops.advance_step_flashinfer( - num_seqs=num_seqs, - num_queries=num_queries, - block_size=block_size, - input_tokens=model_input.input_tokens, - sampled_token_ids=model_input.input_tokens, - input_positions=model_input.input_positions, - seq_lens=self.seq_lens_tensor, - slot_mapping=self.slot_mapping, - block_tables=self.block_tables, - paged_kv_indices=self.paged_kv_indices, - paged_kv_indptr=self.paged_kv_indptr, - paged_kv_last_page_len=self.paged_kv_last_page_len, - block_table_bound=self.block_table_bound) - class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): diff --git a/vllm/attention/backends/flashmla.py b/vllm/attention/backends/flashmla.py index a242ac9bbe0b..f23c096952ce 100644 --- a/vllm/attention/backends/flashmla.py +++ b/vllm/attention/backends/flashmla.py @@ -3,7 +3,7 @@ from contextlib import contextmanager from dataclasses import dataclass -from typing import TYPE_CHECKING, List, Optional, Tuple, Type +from typing import List, Optional, Tuple, Type import torch @@ -18,9 +18,6 @@ get_mla_metadata, is_flashmla_supported) -if TYPE_CHECKING: - from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata - class FlashMLABackend(MLACommonBackend): @@ -62,16 +59,6 @@ def decode_metadata(self): self.decode_num_splits return decode_metadata - def advance_step(self, - model_input: "ModelInputForGPUWithSamplingMetadata", - sampled_token_ids: Optional[torch.Tensor], - block_size: int, - num_seqs: int, - num_queries: int, - turn_prefills_into_decodes: bool = False): - raise NotImplementedError( - "advance_step is not implemented for FlashMLA") - class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py index 52c4a9e7da3d..8ff7f5674323 100644 --- a/vllm/attention/backends/mla/common.py +++ b/vllm/attention/backends/mla/common.py @@ -234,8 +234,7 @@ flash_attn_varlen_func = None if TYPE_CHECKING: - from vllm.worker.model_runner import (ModelInputForGPUBuilder, - ModelInputForGPUWithSamplingMetadata) + from vllm.worker.model_runner import ModelInputForGPUBuilder is_hip = current_platform.is_rocm() @@ -631,90 +630,6 @@ def decode_metadata(self): is_profile_run=self.is_profile_run) return self._cached_decode_metadata - def advance_step(self, - model_input: "ModelInputForGPUWithSamplingMetadata", - sampled_token_ids: Optional[torch.Tensor], - block_size: int, - num_seqs: int, - num_queries: int, - turn_prefills_into_decodes: bool = False): - """ - Update metadata in-place to advance one decode step. - """ - # When using cudagraph, the num_seqs is padded to the next captured - # batch sized, but num_queries tracks the actual number of requests in - # the batch. For --enforce-eager mode, num_seqs == num_queries - if num_seqs != num_queries: - assert num_seqs > num_queries - - if turn_prefills_into_decodes: - # When Multi-Step is enabled with Chunked-Prefill, prefills and - # decodes are scheduled together. In the first step, all the - # prefills turn into decodes. This update reflects that - # conversion. 
- assert self.num_decode_tokens + self.num_prefills == num_seqs - self.num_decode_tokens += self.num_prefills - self.num_prefills = 0 - self.num_prefill_tokens = 0 - self.max_prefill_seq_len = 0 - self.max_query_len = 1 - - self.slot_mapping = self.slot_mapping[:num_seqs] - else: - assert self.seq_lens is not None - assert self.max_decode_seq_len == max(self.seq_lens) - - assert self.num_prefills == 0 - assert self.num_prefill_tokens == 0 - assert self.num_decode_tokens == num_seqs - assert self.slot_mapping.shape == (num_seqs, ) - - assert self.seq_lens is not None - assert len(self.seq_lens) == num_seqs - assert self.seq_lens_tensor is not None - assert self.seq_lens_tensor.shape == (num_seqs, ) - assert self.max_query_len == 1 - assert self.max_prefill_seq_len == 0 - - assert self.query_start_loc is not None - assert self.query_start_loc.shape == (num_queries + 1, ) - assert self.seq_start_loc is not None - assert self.seq_start_loc.shape == (num_seqs + 1, ) - - assert self.context_lens_tensor is not None - assert self.context_lens_tensor.shape == (num_queries, ) - - assert self.block_tables is not None - assert self.block_tables.shape[0] == num_seqs - - # Update query lengths. Note that we update only queries and not seqs, - # since tensors may be padded due to captured cuda graph batch size - for i in range(num_queries): - self.seq_lens[i] += 1 - self.max_decode_seq_len = max(self.seq_lens) - - self._ops_advance_step(num_seqs=num_seqs, - num_queries=num_queries, - block_size=block_size, - input_tokens=model_input.input_tokens, - sampled_token_ids=sampled_token_ids, - input_positions=model_input.input_positions) - - def _ops_advance_step(self, num_seqs: int, num_queries: int, - block_size: int, input_tokens: torch.Tensor, - sampled_token_ids: torch.Tensor, - input_positions: torch.Tensor) -> None: - # here we use advance_step_flashinfo to update the paged_kv_* tensors - ops.advance_step_flashattn(num_seqs=num_seqs, - num_queries=num_queries, - block_size=block_size, - input_tokens=input_tokens, - sampled_token_ids=sampled_token_ids, - input_positions=input_positions, - seq_lens=self.seq_lens_tensor, - slot_mapping=self.slot_mapping, - block_tables=self.block_tables) - class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]): """ diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index 820ddcab77d7..e630a6c6de8c 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -15,8 +15,7 @@ from vllm.multimodal import MultiModalPlaceholderMap if TYPE_CHECKING: - from vllm.worker.model_runner import (ModelInputForGPUBuilder, - ModelInputForGPUWithSamplingMetadata) + from vllm.worker.model_runner import (ModelInputForGPUBuilder) from vllm.utils import async_tensor_h2d # Placeholder attention backend for models like Mamba and pooling models that @@ -201,65 +200,6 @@ def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]: ) return self._cached_decode_metadata - def advance_step(self, - model_input: "ModelInputForGPUWithSamplingMetadata", - sampled_token_ids: Optional[torch.Tensor], - block_size: int, - num_seqs: int, - num_queries: int, - turn_prefills_into_decodes: bool = False): - """ - Update metadata in-place to advance one decode step. - """ - # When using cudagraph, the num_seqs is padded to the next captured - # batch sized, but num_queries tracks the actual number of requests in - # the batch. 
For --enforce-eager mode, num_seqs == num_queries - if num_seqs != num_queries: - assert num_seqs > num_queries - assert self.use_cuda_graph - - assert not turn_prefills_into_decodes, \ - ("Multi-Step + Chunked-Prefill is not supported for attention-free" - "models. turn_prefills_into_decodes is a " - "Multi-Step + Chunked-Prefill specific parameter.") - - assert self.seq_lens is not None - assert self.max_decode_seq_len == max(self.seq_lens) - - assert self.num_prefills == 0 - assert self.num_prefill_tokens == 0 - assert self.num_decode_tokens == num_seqs - - assert self.seq_lens is not None - assert len(self.seq_lens) == num_seqs - assert self.seq_lens_tensor is not None - assert self.seq_lens_tensor.shape == (num_seqs, ) - assert self.max_query_len == 1 - assert self.max_prefill_seq_len == 0 - - assert self.query_start_loc is not None - assert self.query_start_loc.shape == (num_queries + 1, ) - assert self.seq_start_loc is not None - assert self.seq_start_loc.shape == (num_seqs + 1, ) - - assert self.context_lens_tensor is not None - assert self.context_lens_tensor.shape == (num_queries, ) - - # Update query lengths. Note that we update only queries and not seqs, - # since tensors may be padded due to captured cuda graph batch size - for i in range(num_queries): - self.seq_lens[i] += 1 - self.max_decode_seq_len = max(self.seq_lens) - - # Update sequences, masking off entries greater than num_queries - device = self.seq_lens_tensor.device - mask = torch.arange(self.seq_lens_tensor.size(0), - device=device) < num_queries - self.seq_lens_tensor += mask.to(self.seq_lens_tensor.dtype) - if sampled_token_ids is not None: - model_input.input_tokens.masked_scatter_( - mask, sampled_token_ids[:num_queries]) - class PlaceholderAttentionMetadataBuilder( AttentionMetadataBuilder[PlaceholderAttentionMetadata]): diff --git a/vllm/attention/backends/rocm_aiter_mla.py b/vllm/attention/backends/rocm_aiter_mla.py index a165a786d63d..a2e9710437d9 100644 --- a/vllm/attention/backends/rocm_aiter_mla.py +++ b/vllm/attention/backends/rocm_aiter_mla.py @@ -7,7 +7,6 @@ import torch -import vllm._custom_ops as ops import vllm.envs as envs from vllm.attention.backends.mla.common import (MLACommonBackend, MLACommonImpl, @@ -107,26 +106,6 @@ def decode_metadata(self): return self._cached_decode_metadata - def _ops_advance_step(self, num_seqs: int, num_queries: int, - block_size: int, input_tokens: torch.Tensor, - sampled_token_ids: torch.Tensor, - input_positions: torch.Tensor) -> None: - - ops.advance_step_flashinfer( - num_seqs=num_seqs, - num_queries=num_queries, - block_size=block_size, - input_tokens=input_tokens, - sampled_token_ids=sampled_token_ids, - input_positions=input_positions, - seq_lens=self.seq_lens_tensor, - slot_mapping=self.slot_mapping, - block_tables=self.block_tables, - paged_kv_indices=self.paged_kv_indices, - paged_kv_indptr=self.paged_kv_indptr, - paged_kv_last_page_lens=self.paged_kv_last_page_lens, - block_table_bound=self.block_table_bound) - class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): BLOCK_TABLE_EXTENDER: list[list[int]] = [[]] diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index da3d9ff32830..63e467f5a7a2 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -4,7 +4,7 @@ import itertools from dataclasses import dataclass from functools import cache -from typing import TYPE_CHECKING, List, Optional, Tuple, Type +from typing import List, Optional, 
Tuple, Type import torch @@ -23,9 +23,6 @@ GroupShape) from vllm.platforms import current_platform -if TYPE_CHECKING: - from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata - logger = init_logger(__name__) _PARTITION_SIZE_ROCM = 256 @@ -261,69 +258,6 @@ def decode_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]: self._cached_decode_metadata.query_start_loc = qs - qs[0] return self._cached_decode_metadata - def advance_step(self, - model_input: "ModelInputForGPUWithSamplingMetadata", - sampled_token_ids: Optional[torch.Tensor], - block_size: int, - num_seqs: int, - num_queries: int, - turn_prefills_into_decodes: bool = False): - """ - Update metadata in-place to advance one decode step. - """ - - assert not turn_prefills_into_decodes, \ - ("Chunked prefill is not supported with rocm_flash_attn yet." - "turn_prefills_into_decodes is a Multi-Step + Chunked-Prefill " - "specific parameter.") - - # When using cudagraph, the num_seqs is padded to the next captured - # batch sized, but num_queries tracks the actual number of requests in - # the batch. For --enforce-eager mode, num_seqs == num_queries - if num_seqs != num_queries: - assert num_seqs > num_queries - assert self.use_cuda_graph - - assert self.num_prefills == 0 - assert self.num_prefill_tokens == 0 - assert self.num_decode_tokens == num_seqs - assert self.slot_mapping.shape == (num_seqs, ) - - assert self.seq_lens is not None - assert len(self.seq_lens) == num_seqs - assert self.seq_lens_tensor is not None - assert self.seq_lens_tensor.shape == (num_seqs, ) - assert self.max_query_len == 1 - assert self.max_prefill_seq_len == 0 - assert self.max_decode_seq_len == max(self.seq_lens) - - assert self.query_start_loc is not None - assert self.query_start_loc.shape == (num_queries + 1, ) - assert self.seq_start_loc is not None - assert self.seq_start_loc.shape == (num_seqs + 1, ) - - assert self.context_lens_tensor is not None - assert self.context_lens_tensor.shape == (num_queries, ) - - assert self.block_tables is not None - assert self.block_tables.shape[0] == num_seqs - - # Update query lengths. Note that we update only queries and not seqs, - # since tensors may be padded due to captured cuda graph batch size - for i in range(num_queries): - self.seq_lens[i] += 1 - self.max_decode_seq_len = max(self.seq_lens) - - ops.advance_step_flashattn(num_seqs=num_seqs, - num_queries=num_queries, - block_size=block_size, - input_tokens=model_input.input_tokens, - sampled_token_ids=sampled_token_ids, - input_positions=model_input.input_positions, - seq_lens=self.seq_lens_tensor, - slot_mapping=self.slot_mapping, - block_tables=self.block_tables) - class ROCmFlashAttentionMetadataBuilder( CommonMetadataBuilder[ROCmFlashAttentionMetadata]): diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index a63797e3a46a..a1c08fa814db 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -762,8 +762,7 @@ def _get_cuda_graph_pad_size(self, has Prefills (if any). The rest of the steps are guaranteed to be all decodes. In this case, we set up the padding as if all the sequences are decodes so we may run all steps except the first step in CUDA graph - mode. The padding is accounted for in the multi-step `advance_step` - family of functions. + mode. Args: num_seqs (int): Number of sequences scheduled to run. 
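The docstring above relies on the padding behaviour noted repeatedly in the removed comments ("num_seqs is padded to the next captured batch size"): a decode-only batch is rounded up to a captured CUDA graph batch size. A minimal sketch of that rounding, using illustrative captured sizes and a hypothetical helper name rather than vLLM's actual `_get_cuda_graph_pad_size`:

```python
# Minimal sketch of the padding described above: a decode batch is rounded up
# to the next captured CUDA graph batch size. The captured sizes and helper
# name are illustrative assumptions, not vLLM's actual implementation.
_CAPTURED_BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64, 128, 256]


def cuda_graph_pad_size(num_seqs: int) -> int:
    """Number of dummy decode sequences to add so the batch matches a
    captured CUDA graph size; 0 means run eagerly without padding."""
    for size in _CAPTURED_BATCH_SIZES:
        if num_seqs <= size:
            return size - num_seqs
    return 0


assert cuda_graph_pad_size(6) == 2    # 6 decodes padded up to the captured size 8
assert cuda_graph_pad_size(300) == 0  # too large for any captured graph
```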
From 8db45ebcd216a4d0a5db0684092c9604815af7bd Mon Sep 17 00:00:00 2001 From: sstamenk Date: Fri, 15 Aug 2025 19:17:31 +0200 Subject: [PATCH 049/231] [BugFix] Skip the Q component for QKVParallelLinear in the case of QKVCrossParallelLinear since its width is 0 (#22369) Signed-off-by: sstamenk Signed-off-by: Duncan Moss --- vllm/model_executor/layers/quantization/utils/w8a8_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index ddb50968904d..659029fd37f7 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -121,6 +121,9 @@ def requantize_with_max_scale( if unfused_module_in_checkpoint: start = 0 for idx, logical_width in enumerate(logical_widths): + # Skip any component with zero width. + if logical_width == 0: + continue end = start + logical_width weight_dq = per_tensor_dequantize(weight[start:end, :], weight_scale[idx]) From 51b5895640ea9922802c690a5c2701b091399d4a Mon Sep 17 00:00:00 2001 From: JartX Date: Fri, 15 Aug 2025 19:42:49 +0200 Subject: [PATCH 050/231] [FIXBUG] Correctly Apply Grammar Bitmask in Mixed Batches (#22896) Signed-off-by: JartX Signed-off-by: Duncan Moss --- vllm/v1/worker/gpu_model_runner.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9460d91c5832..3ea39dc519d8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1337,9 +1337,10 @@ def apply_grammar_bitmask( out_indices = [] # Reorder the bitmask to match the order of the requests in the batch. - sorted_bitmask = np.zeros_like(grammar_bitmask, - shape=(logits.shape[0], - grammar_bitmask.shape[1])) + sorted_bitmask = np.full(shape=(logits.shape[0], + grammar_bitmask.shape[1]), + fill_value=-1, + dtype=grammar_bitmask.dtype) cumulative_index = 0 seq = sorted(scheduler_output.structured_output_request_ids.items(), key=lambda x: x[1]) From 6c4c5ea42e89d873f93ed893e37a4c5cd825d1d3 Mon Sep 17 00:00:00 2001 From: Chenheli Hua Date: Fri, 15 Aug 2025 11:23:06 -0700 Subject: [PATCH 051/231] [Benchmarks] Include image data when ShareGPT4V dataset is used. (#22955) Signed-off-by: Chenheli Hua Signed-off-by: Duncan Moss --- benchmarks/README.md | 49 +++++++++++++++++++++++++++++++++ benchmarks/benchmark_dataset.py | 8 +++++- vllm/benchmarks/datasets.py | 8 +++++- 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index d6442a4fc387..caff8f034214 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -22,6 +22,17 @@ become available. ✅ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + + ShareGPT4V (Image) + ✅ + ✅ + + wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json +
+
+ Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:
+ wget http://images.cocodataset.org/zips/train2017.zip + + BurstGPT ✅ @@ -616,3 +627,41 @@ python3 benchmarks/benchmark_prioritization.py \ ``` + +## 👁️ Example - Multi-Modal Benchmark + +
+Show more + +
+ +Benchmark the performance of multi-modal requests in vLLM. + +### Images (ShareGPT4V) + +Start vLLM: + +```bash +python -m vllm.entrypoints.openai.api_server \ + --model Qwen/Qwen2.5-VL-7B-Instruct \ + --dtype bfloat16 \ + --limit-mm-per-prompt '{"image": 1}' \ + --allowed-local-media-path /path/to/sharegpt4v/images +``` + +Send requests with images: + +```bash +python benchmarks/benchmark_serving.py \ + --backend openai-chat \ + --model Qwen/Qwen2.5-VL-7B-Instruct \ + --dataset-name sharegpt \ + --dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \ + --num-prompts 100 \ + --save-result \ + --result-dir ~/vllm_benchmark_results \ + --save-detailed \ + --endpoint /v1/chat/completion +``` + +
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index ea684f18a742..572292a5aca4 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -430,14 +430,20 @@ def sample( skip_min_output_len_check=output_len is not None, ): continue + # TODO: Also support ShareGPT4Video. + if image_path := entry.get("image"): + mm_content = process_image(image_path) + else: + mm_content = None if enable_multimodal_chat: - prompt = self.apply_multimodal_chat_transformation(prompt, None) + prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) samples.append( SampleRequest( prompt=prompt, prompt_len=prompt_len, expected_output_len=new_output_len, lora_request=lora_request, + multi_modal_data=mm_content, ) ) self.maybe_oversample_requests(samples, num_requests) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 4e8ac5162542..5299dcf54b39 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -454,15 +454,21 @@ def sample( skip_min_output_len_check=output_len is not None): continue + # TODO: Also support ShareGPT4Video. + if image_path := entry.get("image"): + mm_content = process_image(image_path) + else: + mm_content = None if enable_multimodal_chat: prompt = self.apply_multimodal_chat_transformation( - prompt, None) + prompt, mm_content) samples.append( SampleRequest( prompt=prompt, prompt_len=prompt_len, expected_output_len=new_output_len, lora_request=lora_request, + multi_modal_data=mm_content, )) self.maybe_oversample_requests(samples, num_requests) return samples From 04c52c0cecf94fe9dbd0c251ca161b8f28bcfd0a Mon Sep 17 00:00:00 2001 From: Shanshan Shen <467638484@qq.com> Date: Sat, 16 Aug 2025 02:29:25 +0800 Subject: [PATCH 052/231] [Structured Output] Make the output of structured output example more complete (#22481) Signed-off-by: shen-shanshan <467638484@qq.com> Signed-off-by: Duncan Moss --- examples/offline_inference/structured_outputs.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/examples/offline_inference/structured_outputs.py b/examples/offline_inference/structured_outputs.py index 8ef121ebe848..f46064931dba 100644 --- a/examples/offline_inference/structured_outputs.py +++ b/examples/offline_inference/structured_outputs.py @@ -15,6 +15,8 @@ from vllm import LLM, SamplingParams from vllm.sampling_params import GuidedDecodingParams +MAX_TOKENS = 50 + # Guided decoding by Choice (list of possible options) guided_decoding_params_choice = GuidedDecodingParams(choice=["Positive", "Negative"]) sampling_params_choice = SamplingParams(guided_decoding=guided_decoding_params_choice) @@ -23,7 +25,9 @@ # Guided decoding by Regex guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n") sampling_params_regex = SamplingParams( - guided_decoding=guided_decoding_params_regex, stop=["\n"] + guided_decoding=guided_decoding_params_regex, + stop=["\n"], + max_tokens=MAX_TOKENS, ) prompt_regex = ( "Generate an email address for Alan Turing, who works in Enigma." 
@@ -48,7 +52,10 @@ class CarDescription(BaseModel): json_schema = CarDescription.model_json_schema() guided_decoding_params_json = GuidedDecodingParams(json=json_schema) -sampling_params_json = SamplingParams(guided_decoding=guided_decoding_params_json) +sampling_params_json = SamplingParams( + guided_decoding=guided_decoding_params_json, + max_tokens=MAX_TOKENS, +) prompt_json = ( "Generate a JSON with the brand, model and car_type of" "the most iconic car from the 90's" @@ -64,7 +71,10 @@ class CarDescription(BaseModel): number ::= "1 " | "2 " """ guided_decoding_params_grammar = GuidedDecodingParams(grammar=simplified_sql_grammar) -sampling_params_grammar = SamplingParams(guided_decoding=guided_decoding_params_grammar) +sampling_params_grammar = SamplingParams( + guided_decoding=guided_decoding_params_grammar, + max_tokens=MAX_TOKENS, +) prompt_grammar = ( "Generate an SQL query to show the 'username' and 'email'from the 'users' table." ) From ba6499ceedbd573386f5dede88043d7840f0d886 Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Fri, 15 Aug 2025 14:46:00 -0400 Subject: [PATCH 053/231] [Kernels] Clean up FusedMoeMethodBase and modular kernel setup. Remove extra arguments from modular kernel methods. (#22035) Signed-off-by: Bill Nell Co-authored-by: Michael Goin Signed-off-by: Duncan Moss --- .buildkite/test-pipeline.yaml | 1 + docs/design/fused_moe_modular_kernel.md | 10 +- examples/offline_inference/data_parallel.py | 23 +- .../moe/modular_kernel_tools/common.py | 540 +++++++++--------- .../moe/modular_kernel_tools/mk_objects.py | 461 ++++++++++++++- .../profile_modular_kernel.py | 4 +- .../kernels/moe/modular_kernel_tools/utils.py | 117 ---- tests/kernels/moe/test_batched_moe.py | 4 +- tests/kernels/moe/test_block_fp8.py | 31 +- tests/kernels/moe/test_block_int8.py | 15 +- .../kernels/moe/test_cutlass_grouped_gemm.py | 17 +- tests/kernels/moe/test_deepep_deepgemm_moe.py | 6 +- tests/kernels/moe/test_deepgemm.py | 6 +- tests/kernels/moe/test_flashinfer_moe.py | 147 +++++ .../moe/test_modular_kernel_combinations.py | 141 +++-- tests/kernels/moe/test_nvfp4_moe.py | 64 +-- tests/kernels/moe/test_pplx_cutlass_moe.py | 11 +- tests/kernels/moe/test_pplx_moe.py | 4 +- tests/kernels/moe/utils.py | 75 ++- .../base_device_communicator.py | 7 +- .../layers/fused_moe/__init__.py | 4 +- .../layers/fused_moe/batched_deep_gemm_moe.py | 36 +- .../batched_triton_or_deep_gemm_moe.py | 38 +- .../model_executor/layers/fused_moe/config.py | 11 +- .../layers/fused_moe/cutlass_moe.py | 328 ++++++----- .../layers/fused_moe/deep_gemm_moe.py | 3 +- .../fused_moe/deepep_ht_prepare_finalize.py | 30 +- .../fused_moe/deepep_ll_prepare_finalize.py | 32 +- .../fused_moe/flashinfer_cutlass_moe.py | 59 +- .../flashinfer_cutlass_prepare_finalize.py | 52 +- .../layers/fused_moe/fused_batched_moe.py | 98 ++-- .../layers/fused_moe/fused_moe.py | 7 +- .../fused_moe/gpt_oss_triton_kernels_moe.py | 15 +- vllm/model_executor/layers/fused_moe/layer.py | 91 +-- .../layers/fused_moe/modular_kernel.py | 117 ++-- .../layers/fused_moe/pplx_prepare_finalize.py | 33 +- .../layers/fused_moe/prepare_finalize.py | 43 +- .../layers/fused_moe/triton_deep_gemm_moe.py | 37 +- vllm/model_executor/layers/fused_moe/utils.py | 18 +- .../layers/quantization/auto_round.py | 4 +- .../model_executor/layers/quantization/awq.py | 2 +- .../layers/quantization/awq_marlin.py | 18 +- .../layers/quantization/bitsandbytes.py | 12 +- .../compressed_tensors_moe.py | 168 ++++-- .../layers/quantization/experts_int8.py | 
17 +- .../model_executor/layers/quantization/fp8.py | 43 +- .../layers/quantization/gguf.py | 15 +- .../layers/quantization/gptq_marlin.py | 14 +- .../layers/quantization/modelopt.py | 99 ++-- .../layers/quantization/moe_wna16.py | 16 +- .../layers/quantization/mxfp4.py | 2 +- .../layers/quantization/quark/quark_moe.py | 39 +- .../model_executor/layers/quantization/rtn.py | 13 +- .../quantization/utils/flashinfer_fp4_moe.py | 129 +---- 54 files changed, 2022 insertions(+), 1305 deletions(-) delete mode 100644 tests/kernels/moe/modular_kernel_tools/utils.py create mode 100644 tests/kernels/moe/test_flashinfer_moe.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 04d7cdc3d885..87296a08e207 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -399,6 +399,7 @@ steps: - label: Kernels MoE Test %N mirror_hardwares: [amdexperimental] source_file_dependencies: + - csrc/quantization/cutlass_w8a8/moe/ - csrc/moe/ - tests/kernels/moe - vllm/model_executor/layers/fused_moe/ diff --git a/docs/design/fused_moe_modular_kernel.md b/docs/design/fused_moe_modular_kernel.md index 3ef1232051b0..4b917ab408ee 100644 --- a/docs/design/fused_moe_modular_kernel.md +++ b/docs/design/fused_moe_modular_kernel.md @@ -175,11 +175,19 @@ implementations that input `FusedMoEActivationFormat.Standard` support chunking ### FusedMoEModularKernel Initialization -`FusedMoEMethodBase` class has 2 methods that are collectively responsible in creating the `FusedMoEModularKernel` object. They are, +`FusedMoEMethodBase` class has 3 methods that are collectively responsible in creating the `FusedMoEModularKernel` object. They are, +* maybe_make_prepare_finalize, * select_gemm_impl, and * init_prepare_finalize +#### maybe_make_prepare_finalize + +The `maybe_make_prepare_finalize` method is responsbile for constructing an instance of `FusedMoEPrepareAndFinalize` when appropriate based on the current all2all backend, e.g. when EP + DP is enabled. The base class method currently constructs all the `FusedMoEPrepareAndFinalize` objects for the EP+DP case. Derived classes can override this method to construct prepare/finalize objects for different scenarios, e.g. `ModelOptNvFp4FusedMoE` can construct a `FlashInferCutlassMoEPrepareAndFinalize` for the EP+TP case. +Please refer to the implementations in, + +* `ModelOptNvFp4FusedMoE` + #### select_gemm_impl The `select_gemm_impl` method is undefined in the base class. It is the responsibility of the derived class to implement a method that constructs a valid/appropriate `FusedMoEPermuteExpertsUnpermute` object. 
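Taken together, the hooks documented above are easiest to see in a small sketch. The class below is hypothetical, its hook signatures are simplified assumptions rather than vLLM's exact API, and the required weight-handling methods (`create_weights`, `apply`) are omitted for brevity:

```python
# Hypothetical sketch only: shows how a FusedMoEMethodBase subclass could wire
# up the modular-kernel pieces described above. Signatures are simplified.
from vllm.model_executor.layers.fused_moe.layer import (FusedMoEMethodBase,
                                                        TritonExperts)
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
    MoEPrepareAndFinalizeNoEP)


class SketchMoEMethod(FusedMoEMethodBase):

    def maybe_make_prepare_finalize(self, moe):
        # No all2all backend in this sketch, so the no-op prepare/finalize
        # suffices. An EP+DP (or EP+TP) method would build a backend-specific
        # object here instead, as ModelOptNvFp4FusedMoE does.
        return MoEPrepareAndFinalizeNoEP()

    def select_gemm_impl(self, prepare_finalize, moe):
        # The experts implementation must accept the activation format
        # produced by the prepare/finalize object chosen above
        # (Standard here, since MoEPrepareAndFinalizeNoEP is not batched).
        return TritonExperts()
```

Per the section above, `init_prepare_finalize` then pairs the two objects into a `FusedMoEModularKernel`.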
diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index dbf8ed58cc47..dd7559451c4c 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -70,12 +70,27 @@ def parse_args(): default=64, help=("Maximum number of sequences to be processed in a single iteration."), ) + parser.add_argument( + "--max-model-len", + type=int, + help=("Maximum number of tokens to be processed in a single iteration."), + ) + parser.add_argument( + "--timeout", + type=int, + default=300, + help=("Number of seconds before unresponsive process is killed."), + ) parser.add_argument( "--gpu-memory-utilization", type=float, default=0.8, help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."), ) + parser.add_argument( + "--quantization", + type=str, + ) return parser.parse_args() @@ -90,7 +105,9 @@ def main( enforce_eager, trust_remote_code, max_num_seqs, + max_model_len, gpu_memory_utilization, + quantization, ): os.environ["VLLM_DP_RANK"] = str(global_dp_rank) os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank) @@ -142,7 +159,9 @@ def start(rank): enable_expert_parallel=True, trust_remote_code=trust_remote_code, max_num_seqs=max_num_seqs, + max_model_len=max_model_len, gpu_memory_utilization=gpu_memory_utilization, + quantization=quantization, ) outputs = llm.generate(prompts, sampling_params) # Print the outputs. @@ -198,14 +217,16 @@ def start(rank): args.enforce_eager, args.trust_remote_code, args.max_num_seqs, + args.max_model_len, args.gpu_memory_utilization, + args.quantization, ), ) proc.start() procs.append(proc) exit_code = 0 for proc in procs: - proc.join(timeout=300) + proc.join(timeout=args.timeout) if proc.exitcode is None: print(f"Killing process {proc.pid} that didn't stop within 5 minutes.") proc.kill() diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py index fd99e8dc5c98..a10666b6ec9a 100644 --- a/tests/kernels/moe/modular_kernel_tools/common.py +++ b/tests/kernels/moe/modular_kernel_tools/common.py @@ -7,41 +7,22 @@ import vllm._custom_ops as ops import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from tests.kernels.moe.utils import make_test_weights, per_token_cast_to_fp8 +from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX, + FLOAT8_E4M3_MAX, + dequantize_nvfp4_to_dtype) from tests.kernels.utils import torch_experts from vllm.config import VllmConfig from vllm.distributed import get_dp_group, get_tensor_model_parallel_world_size -# Fused experts and PrepareFinalize imports -from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( - BatchedDeepGemmExperts) -from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501 - BatchedTritonOrDeepGemmExperts) +from vllm.forward_context import set_forward_context from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEParallelConfig, FusedMoEQuantConfig) -from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 -from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts -from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( - BatchedTritonExperts, NaiveBatchedExperts) from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk -from vllm.model_executor.layers.fused_moe.layer import (FusedMoEMethodBase, - TritonExperts) -from vllm.model_executor.layers.fused_moe.prepare_finalize import ( - 
MoEPrepareAndFinalizeNoEP) -from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( - TritonOrDeepGemmExperts) from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx +from .mk_objects import (expert_info, make_fused_experts, + make_prepare_finalize, prepare_finalize_info) from .parallel_utils import ProcessGroupInfo -from .utils import (make_block_quant_fp8_weights, make_non_quant_weights, - make_quant_fp8_weights, per_token_cast_to_fp8) - -if has_pplx(): - from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import ( - PplxPrepareAndFinalize) -if has_deep_ep(): - from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 - DeepEPHTPrepareAndFinalize) - from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 - DeepEPLLPrepareAndFinalize) def _describe_tensor(t: Optional[torch.Tensor], name: str) -> str: @@ -69,24 +50,31 @@ class Config: torch_trace_dir_path: Optional[str] = None + def __post_init__(self): + if self.quant_config is None: + self.quant_config = FusedMoEQuantConfig() + def describe(self) -> str: s = "" - s += "== Config: \n" - s += f" world_size={self.world_size} \n" - s += f" PF={self.prepare_finalize_type.__name__} \n" - s += f" FE={self.fused_experts_type.__name__} \n" - s += f" topk={self.topks} \n" - s += f" dtype={self.dtype} \n" - s += f" fused_moe_chunk_size={self.fused_moe_chunk_size} \n" - s += " Quant: \n" - s += f" fused_moe_chunk_size={self.fused_moe_chunk_size} \n " + s += "== Config:\n" + s += f" world_size={self.world_size}\n" + s += f" PF={self.prepare_finalize_type.__name__}\n" + s += f" FE={self.fused_experts_type.__name__}\n" + s += f" E={self.E}\n" + s += f" Ms={self.Ms}\n" + s += f" N={self.N}\n" + s += f" K={self.K}\n" + s += f" topk={self.topks}\n" + s += f" dtype={self.dtype}\n" + s += f" fused_moe_chunk_size={self.fused_moe_chunk_size}\n" + s += " Quant:\n" if self.quant_config is not None: - s += f" q_dtype={self.quant_dtype} \n" - s += f" q_block_shape={self.quant_block_shape} \n" - s += f" q_per_out_ch_quant={self.is_per_out_ch_quant} \n" - s += f" q_per_act_token={self.is_per_act_token_quant} \n" + s += f" q_dtype={self.quant_dtype}\n" + s += f" q_block_shape={self.quant_block_shape}\n" + s += f" q_per_out_ch_quant={self.is_per_out_ch_quant}\n" + s += f" q_per_act_token={self.is_per_act_token_quant}\n" else: - s += " quant=None \n" + s += " quant=None\n" return s @property @@ -95,34 +83,28 @@ def M(self) -> int: return self.Ms @property - def quant_dtype(self) -> Optional[torch.dtype]: - if self.quant_config is None: - return None + def quant_dtype(self) -> Union[torch.dtype, str, None]: + assert self.quant_config is not None return self.quant_config.quant_dtype @property def is_per_act_token_quant(self) -> bool: - if self.quant_config is None: - return False + assert self.quant_config is not None return self.quant_config.per_act_token_quant @property def is_per_tensor_act_quant(self) -> bool: - if self.quant_config is None: - return False return (not self.is_per_act_token_quant and self.quant_block_shape is None) @property def is_per_out_ch_quant(self) -> bool: - if self.quant_config is None: - return False + assert self.quant_config is not None return self.quant_config.per_out_ch_quant @property def quant_block_shape(self) -> Optional[list[int]]: - if self.quant_config is None: - return None + assert self.quant_config is not None return self.quant_config.block_shape @property @@ -130,36 +112,30 @@ def topk(self) -> int: assert 
isinstance(self.topks, int) return self.topks - @property - def topk_ids_dtype(self) -> Optional[torch.dtype]: - topk_ids_dtype = None - if self.prepare_finalize_type == PplxPrepareAndFinalize: - topk_ids_dtype = torch.uint32 - elif self.prepare_finalize_type in [ - DeepEPHTPrepareAndFinalize, DeepEPLLPrepareAndFinalize - ]: - topk_ids_dtype = torch.int64 - return topk_ids_dtype - @property def num_local_experts(self) -> int: return self.E // self.world_size def make_env_data(self) -> tuple[VllmConfig, dict[Any, Any]]: """ - make env data for vllm launch. + make env data for vllm launch. """ vllm_config = VllmConfig() vllm_config.parallel_config.data_parallel_size = self.world_size vllm_config.parallel_config.enable_expert_parallel = True env_dict = { - "VLLM_ALL2ALL_BACKEND": self.all2all_backend(), "VLLM_USE_DEEP_GEMM": str(int(self.needs_deep_gemm())), } + + backend = self.all2all_backend() + if backend is not None: + env_dict.update({"VLLM_ALL2ALL_BACKEND": backend}) + if self.fused_moe_chunk_size is not None: env_dict.update( {"VLLM_FUSED_MOE_CHUNK_SIZE": str(self.fused_moe_chunk_size)}) + return vllm_config, env_dict def is_fp8_block_quantized(self): @@ -167,85 +143,59 @@ def is_fp8_block_quantized(self): and self.quant_block_shape is not None) def is_batched_prepare_finalize(self): - return self.prepare_finalize_type in [ - PplxPrepareAndFinalize, DeepEPLLPrepareAndFinalize - ] + info = prepare_finalize_info(self.prepare_finalize_type) + return (mk.FusedMoEActivationFormat.BatchedExperts == + info.activation_format) def is_batched_fused_experts(self): - return self.fused_experts_type in [ - CutlassExpertsFp8, BatchedDeepGemmExperts, BatchedTritonExperts, - NaiveBatchedExperts, BatchedTritonOrDeepGemmExperts - ] + info = expert_info(self.fused_experts_type) + return (mk.FusedMoEActivationFormat.BatchedExperts == + info.activation_format) def is_standard_fused_experts(self): - return self.fused_experts_type in [ - CutlassExpertsFp8, DeepGemmExperts, TritonOrDeepGemmExperts, - TritonExperts - ] - - def is_fe_16bit_supported(self): - return self.fused_experts_type in [ - BatchedTritonExperts, BatchedTritonOrDeepGemmExperts, - NaiveBatchedExperts, TritonExperts - ] - - def is_fe_fp8_supported(self): - return self.fused_experts_type in [ - BatchedDeepGemmExperts, - BatchedTritonExperts, - BatchedTritonOrDeepGemmExperts, - CutlassExpertsFp8, - DeepGemmExperts, - TritonExperts, - TritonOrDeepGemmExperts, - NaiveBatchedExperts, - ] - - def is_fe_block_fp8_supported(self): - return self.fused_experts_type in [ - BatchedDeepGemmExperts, - BatchedTritonOrDeepGemmExperts, - DeepGemmExperts, - TritonExperts, - TritonOrDeepGemmExperts, - BatchedTritonExperts, - NaiveBatchedExperts, - ] + info = expert_info(self.fused_experts_type) + return mk.FusedMoEActivationFormat.Standard == info.activation_format + + def fe_supported_types(self): + info = expert_info(self.fused_experts_type) + return info.supported_dtypes + + def pf_supported_types(self): + info = prepare_finalize_info(self.prepare_finalize_type) + return info.supported_dtypes + + def is_block_quant_supported(self): + info = expert_info(self.fused_experts_type) + return info.blocked_quantization_support def is_fe_supports_chunking(self): - return self.fused_experts_type in [ - CutlassExpertsFp8, DeepGemmExperts, TritonOrDeepGemmExperts, - TritonExperts - ] + info = expert_info(self.fused_experts_type) + return info.supports_chunking + + def supports_expert_map(self): + info = expert_info(self.fused_experts_type) + return 
info.supports_expert_map + + def supports_apply_weight_on_input(self): + info = prepare_finalize_info(self.prepare_finalize_type) + return info.supports_apply_weight_on_input def needs_deep_gemm(self): - return self.fused_experts_type in [ - BatchedDeepGemmExperts, - DeepGemmExperts, - ] + info = expert_info(self.fused_experts_type) + return info.needs_deep_gemm def needs_pplx(self): - return self.prepare_finalize_type in [PplxPrepareAndFinalize] + info = prepare_finalize_info(self.prepare_finalize_type) + return info.backend == "pplx" def needs_deep_ep(self): - return self.prepare_finalize_type in [ - DeepEPHTPrepareAndFinalize, DeepEPLLPrepareAndFinalize - ] + info = prepare_finalize_info(self.prepare_finalize_type) + return (info.backend == "deepep_high_throughput" + or info.backend == "deepep_low_latency") def all2all_backend(self): - if self.needs_pplx(): - return "pplx" - if self.prepare_finalize_type == DeepEPHTPrepareAndFinalize: - return "deepep_high_throughput" - if self.prepare_finalize_type == DeepEPLLPrepareAndFinalize: - return "deepep_low_latency" - return "naive" - - def needs_all2all(self): - return self.prepare_finalize_type in [ - PplxPrepareAndFinalize, DeepEPHTPrepareAndFinalize, - DeepEPLLPrepareAndFinalize - ] + info = prepare_finalize_info(self.prepare_finalize_type) + return info.backend def is_valid(self): # Check prepare-finalize and fused-experts compatibility @@ -267,28 +217,28 @@ def is_valid(self): # invalid quant config return False - # check bf16 / fp16 support - is_16bit = (self.dtype.itemsize == 2 and self.quant_dtype is None) - if is_16bit and not self.is_fe_16bit_supported(): - return False - - # Check fp8 support - is_fp8 = self.quant_dtype == torch.float8_e4m3fn - if is_fp8 and not self.is_fe_fp8_supported(): - return False + # check type support + if self.quant_dtype is None: + if (self.dtype not in self.pf_supported_types() + or self.dtype not in self.fe_supported_types()): + return False + else: + if (self.quant_dtype not in self.pf_supported_types() + or self.quant_dtype not in self.fe_supported_types()): + return False - # Check fp8 block quanization support + # Check block quanization support is_block_quatized = self.quant_block_shape is not None - if is_block_quatized and not is_fp8: + if is_block_quatized and self.quant_dtype is None: return False - if is_block_quatized and not self.is_fe_block_fp8_supported(): + if is_block_quatized and not self.is_block_quant_supported(): return False # deep_gemm only works with block-quantized if self.needs_deep_gemm() and not is_block_quatized: return False - # Check dependencies + # Check dependencies (turn into asserts?) if self.needs_deep_ep() and not has_deep_ep(): return False if self.needs_deep_gemm() and not has_deep_gemm(): @@ -305,6 +255,8 @@ class WeightTensors: w2: torch.Tensor w1_scale: Optional[torch.Tensor] w2_scale: Optional[torch.Tensor] + w1_gs: Optional[torch.Tensor] = None + w2_gs: Optional[torch.Tensor] = None def describe(self): s = "" @@ -313,13 +265,20 @@ def describe(self): s += f' - {_describe_tensor(self.w2, "w2")} \n' s += f' - {_describe_tensor(self.w1_scale, "w1_scale")} \n' s += f' - {_describe_tensor(self.w2_scale, "w2_scale")} \n' + s += f' - {_describe_tensor(self.w1_gs, "w1_gs")} \n' + s += f' - {_describe_tensor(self.w2_gs, "w2_gs")} \n' return s + def is_quantized(self) -> bool: + # or w1_scale is not None? 
+ return (self.w1.dtype == torch.float8_e4m3fn + or self.w1.dtype == torch.uint8 or self.w1.dtype == torch.int8) + def to_current_device(self): self.w1 = self.w1.to(device=torch.cuda.current_device()) self.w2 = self.w2.to(device=torch.cuda.current_device()) - is_quantized = self.w1.dtype == torch.float8_e4m3fn - if is_quantized: + + if self.is_quantized(): assert self.w1_scale is not None assert self.w2_scale is not None self.w1_scale = self.w1_scale.to( @@ -327,56 +286,51 @@ def to_current_device(self): self.w2_scale = self.w2_scale.to( device=torch.cuda.current_device()) + if self.w1_gs is not None: + assert self.w2_gs is not None + self.w1_gs = self.w1_gs.to(device=torch.cuda.current_device()) + self.w2_gs = self.w2_gs.to(device=torch.cuda.current_device()) + def slice_weights(self, rank: int, num_local_experts: int) -> "WeightTensors": s = rank * num_local_experts e = s + num_local_experts w1 = self.w1[s:e, :, :] w2 = self.w2[s:e, :, :] - is_quantized = self.w1.dtype == torch.float8_e4m3fn + w1_scale, w2_scale = (None, None) - if is_quantized: + if self.is_quantized(): assert self.w1_scale is not None assert self.w2_scale is not None w1_scale = self.w1_scale[s:e, :, :] w2_scale = self.w2_scale[s:e, :, :] - return WeightTensors(w1, w2, w1_scale, w2_scale) - @staticmethod - def make(config: Config) -> "WeightTensors": + w1_gs = self.w1_gs + w2_gs = self.w2_gs + if w1_gs is not None: + assert w2_gs is not None + w1_gs = w1_gs[s:e] + w2_gs = w2_gs[s:e] - if config.quant_dtype is None: - # just make normal dtype weights - w1, w2 = make_non_quant_weights(e=config.E, - n=config.N, - k=config.K, - dtype=config.dtype) - return WeightTensors(w1=w1, w2=w2, w1_scale=None, w2_scale=None) - - assert config.quant_dtype == torch.float8_e4m3fn - if not config.is_fp8_block_quantized(): - w1, w2, w1_scale, w2_scale = make_quant_fp8_weights( - e=config.E, - n=config.N, - k=config.K, - per_out_channel_quant=config.is_per_out_ch_quant, - ) - return WeightTensors(w1=w1, - w2=w2, - w1_scale=w1_scale, - w2_scale=w2_scale) + return WeightTensors(w1, w2, w1_scale, w2_scale, w1_gs, w2_gs) - assert config.quant_block_shape is not None - w1, w2, w1_scale, w2_scale = make_block_quant_fp8_weights( + @staticmethod + def make(config: Config) -> "WeightTensors": + (_, w1, w1_scale, w1_gs), (_, w2, w2_scale, w2_gs) = make_test_weights( e=config.E, n=config.N, k=config.K, - block_size=config.quant_block_shape, + in_dtype=config.dtype, + quant_dtype=config.quant_dtype, + block_shape=config.quant_block_shape, + per_act_token_quant=config.is_per_out_ch_quant, ) return WeightTensors(w1=w1, w2=w2, w1_scale=w1_scale, - w2_scale=w2_scale) + w2_scale=w2_scale, + w1_gs=w1_gs, + w2_gs=w2_gs) @dataclass @@ -449,7 +403,6 @@ def make(config: Config, pgi: ProcessGroupInfo): dtype=dtype) topk_weights, topk_ids, _ = fused_topk(hidden_states, score, topk, False) - topk_ids = topk_ids.to(config.topk_ids_dtype) # distribute topk_ids evenly for mi in range(m): @@ -457,7 +410,7 @@ def make(config: Config, pgi: ProcessGroupInfo): topk_ids = topk_ids.to(device=torch.cuda.current_device()) expert_map = None - if config.world_size > 1: + if config.world_size > 1 and config.supports_expert_map(): expert_map = torch.full((global_num_experts, ), fill_value=-1, dtype=torch.int32) @@ -480,92 +433,100 @@ def make(config: Config, pgi: ProcessGroupInfo): def reference_moe_impl(config: Config, weights: WeightTensors, rank_tensors: RankTensors) -> torch.Tensor: - return torch_experts(a=rank_tensors.hidden_states, - w1=weights.w1, - w2=weights.w2, + if 
config.quant_dtype == "nvfp4": + quant_blocksize = 16 + dtype = config.dtype + + w1_q = weights.w1 + w1_blockscale = weights.w1_scale + w1_gs = weights.w1_gs + + w2_q = weights.w2 + w2_blockscale = weights.w2_scale + w2_gs = weights.w2_gs + + a_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax( + rank_tensors.hidden_states.flatten(), dim=-1)).to(torch.float32) + + assert w1_gs is not None + assert w2_gs is not None + assert w1_blockscale is not None + assert w2_blockscale is not None + + assert w1_blockscale.shape[1] % 128 == 0 + assert w1_blockscale.shape[2] % 4 == 0 + assert w2_blockscale.shape[1] % 128 == 0 + assert w2_blockscale.shape[2] % 4 == 0 + + a_fp4, a_scale_interleaved = ops.scaled_fp4_quant( + rank_tensors.hidden_states, a_global_scale) + + a = dequantize_nvfp4_to_dtype(a_fp4, + a_scale_interleaved, + a_global_scale, + dtype=dtype, + device=a_fp4.device, + block_size=quant_blocksize) + + e = w1_q.shape[0] + n = w1_q.shape[1] // 2 + k = w2_q.shape[1] + + w1 = torch.zeros((e, 2 * n, k), device="cuda", dtype=dtype) + w2 = torch.zeros((e, k, n), device="cuda", dtype=dtype) + + for idx in range(0, e): + w1[idx] = dequantize_nvfp4_to_dtype(w1_q[idx], + w1_blockscale[idx], + w1_gs[idx], + dtype=dtype, + device=w1_q.device, + block_size=quant_blocksize) + w2[idx] = dequantize_nvfp4_to_dtype(w2_q[idx], + w2_blockscale[idx], + w2_gs[idx], + dtype=dtype, + device=w2_q.device, + block_size=quant_blocksize) + a_scale = None + w1_scale = None + w2_scale = None + quant_dtype = None + per_act_token_quant = False + block_shape = None + else: + a = rank_tensors.hidden_states + a_scale = rank_tensors.hidden_states_scale + w1 = weights.w1 + w1_scale = weights.w1_scale + w2 = weights.w2 + w2_scale = weights.w2_scale + quant_dtype = config.quant_dtype + per_act_token_quant = config.is_per_act_token_quant + block_shape = config.quant_block_shape + + return torch_experts(a=a, + w1=w1, + w2=w2, topk_weight=rank_tensors.topk_weights, topk_ids=rank_tensors.topk_ids, global_num_experts=config.E, expert_map=None, - w1_scale=weights.w1_scale, - w2_scale=weights.w2_scale, - a1_scale=rank_tensors.hidden_states_scale, - quant_dtype=config.quant_dtype, - per_act_token_quant=config.is_per_act_token_quant, - block_shape=config.quant_block_shape, - apply_router_weights_on_input=config.topk == 1) - - -def make_fused_experts( - config: Config, moe: FusedMoEConfig, - num_dispatchers: int) -> mk.FusedMoEPermuteExpertsUnpermute: - - use_fp8 = config.quant_dtype == torch.float8_e4m3fn - batch_kwargs = { - "max_num_tokens": moe.max_num_tokens, - "num_dispatchers": num_dispatchers, - } - quant_kwargs = { - "use_fp8_w8a8": use_fp8, - "use_int8_w8a8": False, - "use_int8_w8a16": False, - "use_int4_w4a16": False, - "block_shape": config.quant_block_shape, - "per_act_token_quant": config.is_per_act_token_quant, - } - deepgemm_kwargs = {"allow_deep_gemm": has_deep_gemm()} - - if config.fused_experts_type == BatchedDeepGemmExperts: - kwargs = batch_kwargs | { - "block_shape": config.quant_block_shape, - "per_act_token_quant": config.is_per_act_token_quant, - } - print(f"Making BatchedDeepGemmExperts {kwargs} ...") - experts = BatchedDeepGemmExperts(**kwargs) - elif config.fused_experts_type == BatchedTritonExperts: - kwargs = batch_kwargs | quant_kwargs - print(f"Making BatchedTritonExperts {kwargs} ...") - experts = BatchedTritonExperts(**kwargs) - elif config.fused_experts_type == BatchedTritonOrDeepGemmExperts: - kwargs = batch_kwargs | quant_kwargs | deepgemm_kwargs - print(f"Making 
BatchedTritonOrDeepGemmExperts {kwargs} ...") - experts = BatchedTritonOrDeepGemmExperts(**kwargs) - elif config.fused_experts_type == DeepGemmExperts: - print("Making DeepGemmExperts () ...") - experts = DeepGemmExperts() - elif config.fused_experts_type == TritonExperts: - kwargs = quant_kwargs - print(f"Making TritonExperts {kwargs} ...") - experts = TritonExperts(**kwargs) - elif config.fused_experts_type == TritonOrDeepGemmExperts: - kwargs = quant_kwargs | deepgemm_kwargs - print(f"Making TritonOrDeepGemmExperts {kwargs} ...") - experts = TritonOrDeepGemmExperts(**kwargs) - elif config.fused_experts_type == NaiveBatchedExperts: - kwargs = batch_kwargs | quant_kwargs - print(f"Making NaiveBatchedExperts {kwargs} ...") - experts = NaiveBatchedExperts(**kwargs) - elif config.fused_experts_type == CutlassExpertsFp8: - use_batched_format = config.is_batched_prepare_finalize() - num_experts = (moe.num_local_experts - if use_batched_format else moe.num_experts) - kwargs = { - "max_experts_per_worker": num_experts, - "out_dtype": moe.in_dtype, - "per_act_token_quant": config.is_per_act_token_quant, - "per_out_ch_quant": config.is_per_out_ch_quant, - "block_shape": config.quant_block_shape, - "num_dispatchers": num_dispatchers, - "use_batched_format": use_batched_format - } - print(f"Making CutlassExpertsFp8 {kwargs} ...") - experts = CutlassExpertsFp8(**kwargs) + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_scale, + quant_dtype=quant_dtype, + per_act_token_quant=per_act_token_quant, + block_shape=block_shape, + apply_router_weights_on_input=config.topk == 1 + and config.supports_apply_weight_on_input()) - return experts - -def make_modular_kernel(config: Config, - vllm_config: VllmConfig) -> mk.FusedMoEModularKernel: +def make_modular_kernel( + config: Config, + vllm_config: VllmConfig, + weights: WeightTensors, +) -> mk.FusedMoEModularKernel: def next_power_of_2(x): import math @@ -579,6 +540,7 @@ def next_power_of_2(x): dp_size_=get_dp_group().world_size, vllm_parallel_config=vllm_config.parallel_config, ) + moe = FusedMoEConfig( num_experts=config.E, experts_per_token=config.topk, @@ -591,15 +553,16 @@ def next_power_of_2(x): ) # make modular kernel - prepare_finalize = None - if config.needs_all2all(): - prepare_finalize = FusedMoEMethodBase.maybe_make_prepare_finalize(moe) - assert prepare_finalize is not None - else: - prepare_finalize = MoEPrepareAndFinalizeNoEP() - - fused_experts = make_fused_experts(config, moe, - prepare_finalize.num_dispatchers()) + prepare_finalize = make_prepare_finalize(config.prepare_finalize_type, + config.all2all_backend(), moe) + + fused_experts = make_fused_experts( + config.fused_experts_type, + moe, + prepare_finalize.num_dispatchers(), + weights.w1_gs, + weights.w2_gs, + ) modular_kernel = mk.FusedMoEModularKernel( prepare_finalize=prepare_finalize, fused_experts=fused_experts) @@ -620,22 +583,45 @@ def run_modular_kernel( # weights for rank rank_weights = weights.slice_weights(pgi.rank, config.num_local_experts) - mk = make_modular_kernel(config, vllm_config) + mk = make_modular_kernel(config, vllm_config, weights) mk_kwargs = { - "hidden_states": rank_tensors.hidden_states.clone( + "hidden_states": + rank_tensors.hidden_states.clone( ), # impls might update the tensor in place - "w1": rank_weights.w1, - "w2": rank_weights.w2, - "topk_weights": rank_tensors.topk_weights, - "topk_ids": rank_tensors.topk_ids, - "expert_map": rank_tensors.expert_map, - "w1_scale": rank_weights.w1_scale, - "w2_scale": rank_weights.w2_scale, - "a1_scale": 
rank_tensors.hidden_states_scale, - "global_num_experts": config.E, - "apply_router_weight_on_input": config.topk == 1, + "w1": + rank_weights.w1, + "w2": + rank_weights.w2, + "topk_weights": + rank_tensors.topk_weights, + "topk_ids": + rank_tensors.topk_ids.to(mk.prepare_finalize.topk_indices_dtype()), + "expert_map": + rank_tensors.expert_map, + "w1_scale": + rank_weights.w1_scale, + "w2_scale": + rank_weights.w2_scale, + "a1_scale": + rank_tensors.hidden_states_scale, + "global_num_experts": + config.E, + "apply_router_weight_on_input": + config.topk == 1 and config.supports_apply_weight_on_input(), } - out = mk.forward(**mk_kwargs) + + num_tokens = rank_tensors.hidden_states.shape[0] + num_tokens_across_dp = torch.tensor([num_tokens] * config.world_size, + device="cuda", + dtype=torch.int) + + with set_forward_context( + None, + vllm_config, + num_tokens=num_tokens, + num_tokens_across_dp=num_tokens_across_dp, + ): + out = mk.forward(**mk_kwargs) return out diff --git a/tests/kernels/moe/modular_kernel_tools/mk_objects.py b/tests/kernels/moe/modular_kernel_tools/mk_objects.py index 73214066f7ea..aecffae36ae5 100644 --- a/tests/kernels/moe/modular_kernel_tools/mk_objects.py +++ b/tests/kernels/moe/modular_kernel_tools/mk_objects.py @@ -1,58 +1,316 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass +from typing import Optional, Union import torch # Fused experts and PrepareFinalize imports +import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( BatchedDeepGemmExperts) from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501 BatchedTritonOrDeepGemmExperts) -from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig -from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 +from vllm.model_executor.layers.fused_moe.config import (FusedMoEConfig, + FusedMoEQuantConfig) from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( BatchedTritonExperts, NaiveBatchedExperts) -from vllm.model_executor.layers.fused_moe.layer import TritonExperts +from vllm.model_executor.layers.fused_moe.layer import (FusedMoEMethodBase, + TritonExperts) from vllm.model_executor.layers.fused_moe.prepare_finalize import ( MoEPrepareAndFinalizeNoEP) from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( TritonOrDeepGemmExperts) -from vllm.utils import has_deep_ep, has_pplx +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + cutlass_fp4_supported) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + cutlass_fp8_supported) +from vllm.platforms import current_platform +from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx +from vllm.utils.deep_gemm import is_deep_gemm_supported +from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe -if has_deep_ep(): + +@dataclass +class PrepareFinalizeInfo: + activation_format: mk.FusedMoEActivationFormat + supported_dtypes: list[Union[torch.dtype, str]] + blocked_quantization_support: bool + backend: Optional[str] + supports_apply_weight_on_input: bool = True + + +@dataclass +class ExpertInfo: + activation_format: mk.FusedMoEActivationFormat + supported_dtypes: list[Union[torch.dtype, str]] + blocked_quantization_support: bool + supports_chunking: bool + 
supports_expert_map: bool + needs_matching_quant: bool = False + needs_deep_gemm: bool = False + + +PREPARE_FINALIZE_INFO: dict[mk.FusedMoEPrepareAndFinalize, + PrepareFinalizeInfo] = {} +EXPERT_INFO: dict[mk.FusedMoEPermuteExpertsUnpermute, ExpertInfo] = {} +MK_ALL_PREPARE_FINALIZE_TYPES: list[mk.FusedMoEPrepareAndFinalize] = [] +MK_MULTI_GPU_PREPARE_FINALIZE_TYPES: list[mk.FusedMoEPrepareAndFinalize] = [] +MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES: list[mk.FusedMoEPrepareAndFinalize] = [] +MK_FUSED_EXPERT_TYPES: list[mk.FusedMoEPermuteExpertsUnpermute] = [] + +standard_format = mk.FusedMoEActivationFormat.Standard +batched_format = mk.FusedMoEActivationFormat.BatchedExperts +common_float_types: list[Union[torch.dtype, str]] = [ + torch.float8_e4m3fn, torch.bfloat16, torch.float16, torch.float32 +] +common_float_and_int_types = common_float_types + [torch.int8] +nv_fp4_types = ["nvfp4"] +fp8_types = [torch.float8_e4m3fn] + + +def register_prepare_and_finalize( + kind, + activation_format: mk.FusedMoEActivationFormat, + supported_dtypes: list[Union[torch.dtype, str]], + blocked_quantization_support: bool, + backend: Optional[str], + force_multigpu: bool = False, + supports_apply_weight_on_input: bool = True, +): + global PREPARE_FINALIZE_INFO + global MK_ALL_PREPARE_FINALIZE_TYPES + global MK_MULTI_GPU_PREPARE_FINALIZE_TYPES + global MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES + assert kind not in PREPARE_FINALIZE_INFO + + PREPARE_FINALIZE_INFO[kind] = PrepareFinalizeInfo( + activation_format, + supported_dtypes, + blocked_quantization_support, + backend, + supports_apply_weight_on_input, + ) + MK_ALL_PREPARE_FINALIZE_TYPES.append(kind) + if backend is not None or force_multigpu: + MK_MULTI_GPU_PREPARE_FINALIZE_TYPES.append(kind) + else: + MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES.append(kind) + + +def register_experts( + kind, + activation_format: mk.FusedMoEActivationFormat, + supported_dtypes: list[Union[torch.dtype, str]], + blocked_quantization_support: bool, + supports_chunking: bool, + supports_expert_map: bool, + needs_matching_quant: bool = False, + needs_deep_gemm: bool = False, +): + global EXPERT_INFO + global MK_FUSED_EXPERT_TYPES + assert kind not in EXPERT_INFO + + EXPERT_INFO[kind] = ExpertInfo( + activation_format, + supported_dtypes, + blocked_quantization_support, + supports_chunking, + supports_expert_map, + needs_matching_quant, + needs_deep_gemm, + ) + + MK_FUSED_EXPERT_TYPES.append(kind) + + +def prepare_finalize_info(kind) -> PrepareFinalizeInfo: + info = PREPARE_FINALIZE_INFO.get(kind) + assert info is not None + return info + + +def expert_info(kind) -> ExpertInfo: + info = EXPERT_INFO.get(kind) + assert info is not None + return info + + +register_prepare_and_finalize( + MoEPrepareAndFinalizeNoEP, + standard_format, + common_float_types, + blocked_quantization_support=True, + backend=None, +) + +register_experts( + BatchedTritonExperts, + batched_format, + common_float_types, + blocked_quantization_support=True, + supports_chunking=False, + supports_expert_map=False, + needs_matching_quant=True, +) + +register_experts( + TritonExperts, + standard_format, + common_float_and_int_types, + blocked_quantization_support=True, + supports_chunking=True, + supports_expert_map=True, + needs_matching_quant=True, +) + +register_experts( + NaiveBatchedExperts, + batched_format, + common_float_and_int_types, + blocked_quantization_support=True, + supports_chunking=False, + supports_expert_map=True, +) + +# Disable on blackwell for now +if has_deep_ep() and not 
current_platform.has_device_capability(100): from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 DeepEPHTPrepareAndFinalize) from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 DeepEPLLPrepareAndFinalize) + register_prepare_and_finalize( + DeepEPHTPrepareAndFinalize, + standard_format, + common_float_types, + blocked_quantization_support=True, + backend="deepep_high_throughput", + ) + + register_prepare_and_finalize( + DeepEPLLPrepareAndFinalize, + batched_format, + common_float_types, + blocked_quantization_support=True, + backend="deepep_low_latency", + ) + if has_pplx(): from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import ( PplxPrepareAndFinalize) + register_prepare_and_finalize( + PplxPrepareAndFinalize, + batched_format, + common_float_and_int_types, + blocked_quantization_support=True, + backend="pplx", + ) -MK_MULTI_GPU_PREPARE_FINALIZE_TYPES = [] -if has_pplx(): - MK_MULTI_GPU_PREPARE_FINALIZE_TYPES += [PplxPrepareAndFinalize] -if has_deep_ep(): - MK_MULTI_GPU_PREPARE_FINALIZE_TYPES += [ - DeepEPHTPrepareAndFinalize, DeepEPLLPrepareAndFinalize - ] +if (has_flashinfer_cutlass_fused_moe() + and current_platform.has_device_capability(100)): + from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 + FlashInferExperts) + from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 + FlashInferCutlassMoEPrepareAndFinalize) -MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES = [MoEPrepareAndFinalizeNoEP] + register_prepare_and_finalize( + FlashInferCutlassMoEPrepareAndFinalize, + standard_format, + nv_fp4_types, + blocked_quantization_support=True, + backend=None, + force_multigpu=True, + supports_apply_weight_on_input=False, + ) -MK_ALL_PREPARE_FINALIZE_TYPES = (MK_MULTI_GPU_PREPARE_FINALIZE_TYPES + - MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES) + register_experts( + FlashInferExperts, + standard_format, + nv_fp4_types, + blocked_quantization_support=True, + supports_chunking=True, + # Note: this is a hack to get it to run for now + supports_expert_map=True, + ) +else: + FlashInferCutlassMoEPrepareAndFinalize = None -MK_FUSED_EXPERT_TYPES = [ - BatchedDeepGemmExperts, - BatchedTritonExperts, - NaiveBatchedExperts, - BatchedTritonOrDeepGemmExperts, - CutlassExpertsFp8, - DeepGemmExperts, - TritonOrDeepGemmExperts, - TritonExperts, -] +if has_deep_gemm() and is_deep_gemm_supported(): + register_experts( + BatchedDeepGemmExperts, + batched_format, + fp8_types, + blocked_quantization_support=True, + supports_chunking=False, + supports_expert_map=False, + needs_matching_quant=False, + needs_deep_gemm=True, + ) + register_experts( + DeepGemmExperts, + standard_format, + fp8_types, + blocked_quantization_support=True, + supports_chunking=True, + supports_expert_map=True, + needs_matching_quant=False, + needs_deep_gemm=True, + ), + register_experts( + BatchedTritonOrDeepGemmExperts, + batched_format, + common_float_and_int_types, + blocked_quantization_support=True, + supports_chunking=False, + supports_expert_map=False, + needs_matching_quant=True, + needs_deep_gemm=True, + ) + register_experts( + TritonOrDeepGemmExperts, + standard_format, + common_float_and_int_types, + blocked_quantization_support=True, + supports_chunking=True, + supports_expert_map=True, + needs_matching_quant=True, + needs_deep_gemm=True, + ) + +if cutlass_fp8_supported(): + from vllm.model_executor.layers.fused_moe import (CutlassBatchedExpertsFp8, + CutlassExpertsFp8) + 
register_experts( + CutlassExpertsFp8, + standard_format, + fp8_types, + blocked_quantization_support=False, + supports_chunking=True, + supports_expert_map=False, + ) + register_experts( + CutlassBatchedExpertsFp8, + batched_format, + fp8_types, + blocked_quantization_support=False, + supports_chunking=False, + supports_expert_map=False, + ) + +if cutlass_fp4_supported(): + from vllm.model_executor.layers.fused_moe.cutlass_moe import ( + CutlassExpertsFp4) + register_experts( + CutlassExpertsFp4, + standard_format, + nv_fp4_types, + blocked_quantization_support=True, + supports_chunking=True, + supports_expert_map=False, + ) MK_QUANT_CONFIGS = [ None, @@ -85,3 +343,156 @@ # block-quantized weights and per-token activations # block-quantized weights and per-tensor activations ] + +if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe(): + MK_QUANT_CONFIGS += [ + FusedMoEQuantConfig(quant_dtype="nvfp4", + per_out_ch_quant=False, + per_act_token_quant=False, + block_shape=None), + ] + + +def _make_gscale(num_experts: int) -> torch.Tensor: + return torch.ones((num_experts, ), + device=torch.cuda.current_device(), + dtype=torch.float32) + + +def make_prepare_finalize( + prepare_finalize_type: mk.FusedMoEPrepareAndFinalize, + backend: Optional[str], + moe: FusedMoEConfig, +) -> mk.FusedMoEPrepareAndFinalize: + if backend != "naive" and backend is not None: + prepare_finalize = FusedMoEMethodBase._maybe_make_prepare_finalize(moe) + assert prepare_finalize is not None + return prepare_finalize + elif prepare_finalize_type == FlashInferCutlassMoEPrepareAndFinalize: + return FlashInferCutlassMoEPrepareAndFinalize( + use_dp=moe.moe_parallel_config.dp_size > 1, + a1_gscale=_make_gscale(moe.num_local_experts), + ) + else: + return MoEPrepareAndFinalizeNoEP() + + +def _slice(rank: int, num_local_experts: int, t: torch.Tensor) -> torch.Tensor: + s = rank * num_local_experts + e = s + num_local_experts + return t[s:e] + + +def make_fused_experts( + fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute, + moe: FusedMoEConfig, + num_dispatchers: int, + w1_gs: Optional[torch.Tensor], + w2_gs: Optional[torch.Tensor], +) -> mk.FusedMoEPermuteExpertsUnpermute: + + use_fp8 = moe.quant_dtype == torch.float8_e4m3fn + batch_kwargs = { + "max_num_tokens": moe.max_num_tokens, + "num_dispatchers": num_dispatchers, + } + quant_kwargs = { + "use_fp8_w8a8": use_fp8, + "use_int8_w8a8": False, + "use_int8_w8a16": False, + "use_int4_w4a16": False, + "block_shape": moe.block_shape, + "per_act_token_quant": moe.per_act_token_quant, + } + deepgemm_kwargs = {"allow_deep_gemm": has_deep_gemm()} + + if fused_experts_type == BatchedDeepGemmExperts: + kwargs = batch_kwargs | { + "block_shape": moe.block_shape, + "per_act_token_quant": moe.per_act_token_quant, + } + print(f"Making BatchedDeepGemmExperts {kwargs} ...") + experts = BatchedDeepGemmExperts(**kwargs) + elif fused_experts_type == BatchedTritonExperts: + kwargs = batch_kwargs | quant_kwargs + print(f"Making BatchedTritonExperts {kwargs} ...") + experts = BatchedTritonExperts(**kwargs) + elif fused_experts_type == BatchedTritonOrDeepGemmExperts: + kwargs = batch_kwargs | quant_kwargs | deepgemm_kwargs + print(f"Making BatchedTritonOrDeepGemmExperts {kwargs} ...") + experts = BatchedTritonOrDeepGemmExperts(**kwargs) + elif fused_experts_type == DeepGemmExperts: + print("Making DeepGemmExperts () ...") + experts = DeepGemmExperts() + elif fused_experts_type == TritonExperts: + kwargs = quant_kwargs + print(f"Making TritonExperts {kwargs} ...") + experts = 
TritonExperts(**kwargs) + elif fused_experts_type == TritonOrDeepGemmExperts: + kwargs = quant_kwargs | deepgemm_kwargs + print(f"Making TritonOrDeepGemmExperts {kwargs} ...") + experts = TritonOrDeepGemmExperts(**kwargs) + elif fused_experts_type == NaiveBatchedExperts: + kwargs = batch_kwargs | quant_kwargs + print(f"Making NaiveBatchedExperts {kwargs} ...") + experts = NaiveBatchedExperts(**kwargs) + elif fused_experts_type == CutlassExpertsFp8: + kwargs = { + "out_dtype": moe.in_dtype, + "per_act_token_quant": moe.per_act_token_quant, + "per_out_ch_quant": moe.per_out_ch_quant, + "block_shape": moe.block_shape, + } + print(f"Making CutlassExpertsFp8 {kwargs} ...") + experts = CutlassExpertsFp8(**kwargs) + elif fused_experts_type == CutlassBatchedExpertsFp8: + kwargs = { + "max_experts_per_worker": moe.num_local_experts, + "num_dispatchers": num_dispatchers, + "out_dtype": moe.in_dtype, + "per_act_token_quant": moe.per_act_token_quant, + "per_out_ch_quant": moe.per_out_ch_quant, + "block_shape": moe.block_shape, + } + print(f"Making CutlassBatchedExpertsFp8 {kwargs} ...") + experts = CutlassBatchedExpertsFp8(**kwargs) + elif fused_experts_type == CutlassExpertsFp4: + assert w1_gs is not None and w2_gs is not None + num_experts = moe.num_local_experts + rank = moe.moe_parallel_config.dp_rank + kwargs = { + "g1_alphas": _slice(rank, num_experts, (1 / w1_gs)), + "g2_alphas": _slice(rank, num_experts, (1 / w2_gs)), + "a1_gscale": _make_gscale(num_experts), + "a2_gscale": _make_gscale(num_experts), + "max_experts_per_worker": num_experts, + "out_dtype": moe.in_dtype, + "per_act_token_quant": moe.per_act_token_quant, + "per_out_ch_quant": moe.per_out_ch_quant, + "block_shape": moe.block_shape, + "num_dispatchers": num_dispatchers, + } + print(f"Making CutlassExpertsFp4 {kwargs} ...") + experts = CutlassExpertsFp4(**kwargs) + elif fused_experts_type == FlashInferExperts: + assert w1_gs is not None and w2_gs is not None + num_experts = moe.num_local_experts + rank = moe.moe_parallel_config.dp_rank + kwargs = { + "g1_alphas": _slice(rank, num_experts, (1 / w1_gs)), + "g2_alphas": _slice(rank, num_experts, (1 / w2_gs)), + "a1_gscale": _make_gscale(num_experts), + "a2_gscale": _make_gscale(num_experts), + "out_dtype": moe.in_dtype, + "quant_dtype": "nvfp4", + "ep_rank": moe.ep_rank, + "ep_size": moe.ep_size, + "tp_rank": moe.tp_rank, + "tp_size": moe.tp_size, + } + print(f"Making FlashInferExperts {kwargs} ...") + experts = FlashInferExperts(**kwargs) + else: + raise RuntimeError(f"Unknown fused experts type: {fused_experts_type}") + + return experts diff --git a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py index dd16ffb2eabe..0da6ee354352 100644 --- a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py +++ b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py @@ -52,7 +52,7 @@ def profile_modular_kernel( rank_weights = weights.slice_weights(pgi.rank, config.num_local_experts) # make modular kernel - mk = make_modular_kernel(config, vllm_config) + mk = make_modular_kernel(config, vllm_config, weights) mk_kwargs = { "hidden_states": rank_tensors.hidden_states, @@ -83,7 +83,7 @@ def rank_worker( # sanity check from vllm import envs if config.fused_moe_chunk_size is not None: - assert (config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE) + assert config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE # get weights to this device weights.to_current_device() diff --git 
a/tests/kernels/moe/modular_kernel_tools/utils.py b/tests/kernels/moe/modular_kernel_tools/utils.py deleted file mode 100644 index 866f52882bee..000000000000 --- a/tests/kernels/moe/modular_kernel_tools/utils.py +++ /dev/null @@ -1,117 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -import vllm._custom_ops as ops -from vllm.utils.deep_gemm import per_block_cast_to_fp8 - - -def per_token_cast_to_fp8( - x: torch.Tensor, block_size: int) -> tuple[torch.Tensor, torch.Tensor]: - assert x.dim() == 2 - m, n = x.shape - pad_size = (block_size - (n % block_size)) % block_size - x = torch.nn.functional.pad(x, - (0, pad_size), value=0) if pad_size > 0 else x - x_view = x.view(m, -1, block_size) - x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) - fp8_data = (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn) - return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1) - - -def make_non_quant_weights( - e: int, - n: int, - k: int, - dtype: torch.dtype, -) -> tuple[torch.Tensor, torch.Tensor]: - """ - Return weights w1, w2 - """ - device = torch.cuda.current_device() - w1 = torch.randn((e, 2 * n, k), device=device, dtype=dtype) / 15 - w2 = torch.randn((e, k, n), device=device, dtype=dtype) / 15 - return w1, w2 - - -def make_block_quant_fp8_weights( - e: int, - n: int, - k: int, - block_size: list[int], -) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Return weights w1, w2, w1_scale, w2_scale - """ - dtype = torch.bfloat16 - device = torch.cuda.current_device() - - fp8_info = torch.finfo(torch.float8_e4m3fn) - fp8_max, fp8_min = fp8_info.max, fp8_info.min - - w1_bf16, w2_bf16 = make_non_quant_weights(e, n, k, dtype) - w1_bf16 = w1_bf16.clamp(min=fp8_min, max=fp8_max).to(dtype=dtype) - w2_bf16 = w2_bf16.clamp(min=fp8_min, max=fp8_max).to(dtype=dtype) - - block_n, block_k = block_size[0], block_size[1] - n_tiles_w1 = ((2 * n) + block_n - 1) // block_n - k_tiles_w1 = (k + block_k - 1) // block_k - n_tiles_w2 = (k + block_n - 1) // block_n - k_tiles_w2 = (n + block_k - 1) // block_k - - w1 = torch.empty_like(w1_bf16, dtype=torch.float8_e4m3fn, device=device) - w2 = torch.empty_like(w2_bf16, dtype=torch.float8_e4m3fn, device=device) - - w1_s = torch.empty((e, n_tiles_w1, k_tiles_w1), - device=device, - dtype=torch.float32) - w2_s = torch.empty((e, n_tiles_w2, k_tiles_w2), - device=device, - dtype=torch.float32) - - assert w1_s.shape == (e, (2 * n + (block_n - 1)) // block_n, - (k + (block_k - 1)) // block_k) - assert (w2.shape[-2] + block_n - 1) // block_n == w2_s.shape[-2] - - for i in range(e): - w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i], - block_size=[block_k, block_n]) - w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i], - block_size=[block_k, block_n]) - - return w1, w2, w1_s, w2_s - - -def make_quant_fp8_weights( - e: int, - n: int, - k: int, - per_out_channel_quant: bool, -) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Return w1, w2, w1_scale, w2_scale - """ - q_dtype = torch.float8_e4m3fn - - w1, w2 = make_non_quant_weights(e, n, k, dtype=torch.bfloat16) - - # w1 -> w1_q, w2 -> w2_q - w1_q = torch.empty((e, 2 * n, k), device="cuda", dtype=q_dtype) - w2_q = torch.empty((e, k, n), device="cuda", dtype=q_dtype) - - n_b_scales = 2 * n if per_out_channel_quant else 1 - k_b_scales = k if per_out_channel_quant else 1 - w1_scale = torch.empty((e, n_b_scales, 1), - device="cuda", - dtype=torch.float32) - w2_scale 
= torch.empty((e, k_b_scales, 1), - device="cuda", - dtype=torch.float32) - - for expert in range(e): - w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant( - w1[expert], use_per_token_if_dynamic=per_out_channel_quant) - w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant( - w2[expert], use_per_token_if_dynamic=per_out_channel_quant) - return w1_q, w2_q, w1_scale, w2_scale diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py index edf3e6189243..00b2d780e66f 100644 --- a/tests/kernels/moe/test_batched_moe.py +++ b/tests/kernels/moe/test_batched_moe.py @@ -133,7 +133,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int, per_act_token_quant=per_act_token_quant, ) - B, B_q, B_scale, _, _, _ = make_test_weights( + (B, B_q, B_scale, _), _ = make_test_weights( num_experts, N // 2, K, @@ -243,7 +243,7 @@ def test_fused_moe_batched_experts( act_dtype = dtype quant_dtype = None - w1_16, w1, w1_s, w2_16, w2, w2_s = make_test_weights( + (w1_16, w1, w1_s, _), (w2_16, w2, w2_s, _) = make_test_weights( e, n, k, diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py index 75b2e9f79178..9e4eaf221f24 100644 --- a/tests/kernels/moe/test_block_fp8.py +++ b/tests/kernels/moe/test_block_fp8.py @@ -161,18 +161,20 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed, a = torch.randn((M, K), dtype=dtype) / 10 score = torch.randn((M, E), dtype=dtype) - _, w1, w1_s, _, w2, w2_s = make_test_weights(E, - N, - K, - dtype, - torch.float8_e4m3fn, - per_act_token_quant=False, - block_shape=block_size) + (_, w1, w1_s, _), (_, w2, w2_s, + _) = make_test_weights(E, + N, + K, + dtype, + torch.float8_e4m3fn, + per_act_token_quant=False, + block_shape=block_size) m_fused_moe = modular_triton_fused_moe(use_fp8_w8a8=True, use_int8_w8a8=False, use_int8_w8a16=False, use_int4_w4a16=False, + use_mxfp4_w4a4=False, per_act_token_quant=False, block_shape=block_size) @@ -247,13 +249,14 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed, a = torch.randn((M, K), dtype=dtype) / 10 score = torch.randn((M, E), dtype=dtype) - _, w1, w1_s, _, w2, w2_s = make_test_weights(E, - N, - K, - dtype, - torch.float8_e4m3fn, - per_act_token_quant=False, - block_shape=block_size) + (_, w1, w1_s, _), (_, w2, w2_s, + _) = make_test_weights(E, + N, + K, + dtype, + torch.float8_e4m3fn, + per_act_token_quant=False, + block_shape=block_size) # Note: for now use_compile will error out if the problem size is # large enough to trigger chunking. I'm leaving the flag and diff --git a/tests/kernels/moe/test_block_int8.py b/tests/kernels/moe/test_block_int8.py index 8e680c722935..5e4a93963f8e 100644 --- a/tests/kernels/moe/test_block_int8.py +++ b/tests/kernels/moe/test_block_int8.py @@ -118,13 +118,14 @@ def test_w8a8_block_int8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): a = torch.randn((M, K), dtype=dtype) / 10 score = torch.randn((M, E), dtype=dtype) - _, w1, w1_s, _, w2, w2_s = make_test_weights(E, - N, - K, - dtype, - torch.int8, - per_act_token_quant=False, - block_shape=block_size) + (_, w1, w1_s, _), (_, w2, w2_s, + _) = make_test_weights(E, + N, + K, + dtype, + torch.int8, + per_act_token_quant=False, + block_shape=block_size) # Set the context to avoid lots of warning spam. 
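
A note on the updated call sites above: make_test_weights() now returns one
(reference, quantized, scale, global_scale) tuple per weight matrix instead of a
flat six-tuple, which is why these tests unpack a pair of 4-tuples. A minimal
sketch of the new convention, assuming the fp8 settings used in these tests
(the sizes and the w1_*/w2_* names are illustrative only):

    import torch
    from tests.kernels.moe.utils import make_test_weights

    e, n, k = 32, 2048, 2048  # illustrative sizes
    (w1_ref, w1_q, w1_scale, w1_gs), (w2_ref, w2_q, w2_scale, w2_gs) = make_test_weights(
        e, n, k,
        in_dtype=torch.bfloat16,
        quant_dtype=torch.float8_e4m3fn,
        per_act_token_quant=False,
        block_shape=[128, 128])
    # w1_ref/w1_q have shape (e, 2 * n, k); w2_ref/w2_q have shape (e, k, n).
    # The trailing global-scale entries are None for fp8/int8 and are only
    # populated when quant_dtype == "nvfp4".
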
with set_current_vllm_config(vllm_config): diff --git a/tests/kernels/moe/test_cutlass_grouped_gemm.py b/tests/kernels/moe/test_cutlass_grouped_gemm.py index 1aee1ed8c376..3b1618dacac7 100644 --- a/tests/kernels/moe/test_cutlass_grouped_gemm.py +++ b/tests/kernels/moe/test_cutlass_grouped_gemm.py @@ -9,6 +9,7 @@ import pytest import torch +from tests.kernels.moe.utils import per_token_cast_to_fp8 from tests.kernels.utils import baseline_scaled_mm from vllm import _custom_ops as ops from vllm.platforms import current_platform @@ -16,20 +17,6 @@ from vllm.utils.deep_gemm import per_block_cast_to_fp8 -def per_token_cast_to_fp8( - x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - assert x.dim() == 2 - m, n = x.shape - pad_size = (128 - (n % 128)) % 128 - x = torch.nn.functional.pad(x, - (0, pad_size), value=0) if pad_size > 0 else x - x_view = x.view(m, -1, 128) - x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) - fp8_data = (x_view * - (448.0 / x_amax.unsqueeze(2))).to(dtype=torch.float8_e4m3fn) - return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1) - - @pytest.mark.parametrize("num_groups, expected_m_per_group, k, n", [ (4, 8192, 7168, 4096), (4, 8192, 2048, 7168), @@ -76,7 +63,7 @@ def test_cutlass_grouped_gemm( device=device, dtype=torch.float)) for i in range(num_groups): - y_fp8[0][i], y_fp8[1][i] = per_block_cast_to_fp8(y[i]) + y_fp8[0][i], y_fp8[1][i] = per_block_cast_to_fp8(y[i], [128, 128]) for i in range(num_groups): a = x_fp8[0][ep_offset[i]:ep_offset[i + 1]] diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 9b064db973dd..6f95581a5e60 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -70,8 +70,10 @@ def make_block_quant_fp8_weights( """ Return weights w1q, w2q, w1_scale, w2_scale """ - w1, w1q, w1_scale, w2, w2q, w2_scale = make_test_weights( - e, n, k, torch.bfloat16, torch.float8_e4m3fn, block_size) + (_, w1q, w1_scale, _), (_, w2q, w2_scale, + _) = make_test_weights(e, n, k, torch.bfloat16, + torch.float8_e4m3fn, + block_size) return w1q, w2q, w1_scale, w2_scale diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py index b2b78662c9de..4472f34a6291 100644 --- a/tests/kernels/moe/test_deepgemm.py +++ b/tests/kernels/moe/test_deepgemm.py @@ -132,9 +132,9 @@ def run_single_case(m, n, k, topk, num_experts, block_size): # Note: W1 has shape (E, 2N, K), so N = 512 # can trigger the deepgemm path. 
MNKs = [ - (1024, 512, 128), - (1024, 512, 512), - (2048, 512, 512), + (1024, 768, 128), + (1024, 768, 512), + (2048, 768, 512), (512, 1024, 1024), (512, 2048, 2048), (4096, 4096, 1024), diff --git a/tests/kernels/moe/test_flashinfer_moe.py b/tests/kernels/moe/test_flashinfer_moe.py new file mode 100644 index 000000000000..1c14df2b914a --- /dev/null +++ b/tests/kernels/moe/test_flashinfer_moe.py @@ -0,0 +1,147 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch + +from tests.kernels.moe.utils import make_test_weights +from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX, + FLOAT8_E4M3_MAX, + dequantize_nvfp4_to_dtype) +from tests.kernels.utils import torch_moe +from vllm import _custom_ops as ops +from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( + FlashInferExperts, is_valid_flashinfer_cutlass_fused_moe) +from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk +from vllm.model_executor.layers.fused_moe.modular_kernel import ( + FusedMoEModularKernel) +from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP) +from vllm.platforms import current_platform +from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe + +if not has_flashinfer_cutlass_fused_moe( +) or not current_platform.has_device_capability(100): + pytest.skip("Requires flashinfer_cutlass_fused_moe and nvfp4 support", + allow_module_level=True) + +MNK_FACTORS = [ + (2, 1024, 1024), + (2, 1024, 1536), + (2, 3072, 1024), + (2, 3072, 1536), + (64, 1024, 1024), + (64, 1024, 1536), + (64, 3072, 1024), + (64, 2048, 1536), + (224, 1024, 1024), + (224, 1024, 1536), +] + + +@pytest.mark.parametrize("m,n,k", MNK_FACTORS) +@pytest.mark.parametrize("e", [40, 64, 256]) +#@pytest.mark.parametrize("e", [128, 256]) +@pytest.mark.parametrize("topk", [1, 6, 8]) +@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16]) +@torch.inference_mode() +def test_flashinfer_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int, + dtype: torch.dtype): + current_platform.seed_everything(7) + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig( + pipeline_parallel_size=1))): + + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + + quant_blocksize = 16 + + (_, w1_q, w1_blockscale, + w1_gs), (_, w2_q, w2_blockscale, w2_gs) = make_test_weights( + e, + n, + k, + in_dtype=dtype, + quant_dtype="nvfp4", + block_shape=None, # use quant_blocksize? 
+ per_act_token_quant=False, + ) + + score = torch.randn((m, e), device="cuda", dtype=dtype) + topk_weights, topk_ids, _ = fused_topk(a, + score, + topk, + renormalize=False) + + a1_gs = torch.ones((e, ), device="cuda", dtype=torch.float32) + a2_gs = torch.ones((e, ), device="cuda", dtype=torch.float32) + + assert is_valid_flashinfer_cutlass_fused_moe(a, w1_q, w2_q) + + assert w1_gs is not None + assert w2_gs is not None + assert w1_blockscale is not None + assert w2_blockscale is not None + + flashinfer_experts = FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(), + FlashInferExperts( + a1_gscale=a1_gs, + g1_alphas=(1 / w1_gs), + a2_gscale=a2_gs, + g2_alphas=(1 / w2_gs), + out_dtype=dtype, + quant_dtype="nvfp4", + )) + + flashinfer_output = flashinfer_experts( + hidden_states=a, + w1=w1_q, + w1_scale=w1_blockscale, + w2=w2_q, + w2_scale=w2_blockscale, + a1_scale=a1_gs, + a2_scale=a2_gs, + topk_weights=topk_weights, + topk_ids=topk_ids, + ) + + # Reference check: + a_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / + torch.amax(a.flatten(), dim=-1)).to(torch.float32) + a_fp4, a_scale_interleaved = ops.scaled_fp4_quant(a, a_global_scale) + _, m_k = a_fp4.shape + a_in_dtype = dequantize_nvfp4_to_dtype(a_fp4, + a_scale_interleaved, + a_global_scale, + dtype=a.dtype, + device=a.device, + block_size=quant_blocksize) + + w1_d = torch.empty((e, 2 * n, k), device="cuda", dtype=dtype) + w2_d = torch.empty((e, k, n), device="cuda", dtype=dtype) + + for idx in range(0, e): + w1_d[idx] = dequantize_nvfp4_to_dtype(w1_q[idx], + w1_blockscale[idx], + w1_gs[idx], + dtype=dtype, + device=w1_q.device, + block_size=quant_blocksize) + w2_d[idx] = dequantize_nvfp4_to_dtype(w2_q[idx], + w2_blockscale[idx], + w2_gs[idx], + dtype=dtype, + device=w2_q.device, + block_size=quant_blocksize) + + torch_output = torch_moe(a_in_dtype, w1_d, w2_d, score, topk) + + torch.testing.assert_close(torch_output, + flashinfer_output, + atol=1e-1, + rtol=1e-1) + + +if __name__ == "__main__": + test_flashinfer_fp4_moe_no_graph((2, 1024, 1024), 40, 1, torch.half) diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index 6f2869c3a61d..d45982384eb3 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy +import textwrap +import traceback from itertools import product from typing import Optional @@ -10,41 +12,51 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.config import VllmConfig, current_platform, set_current_vllm_config -from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501 - BatchedTritonOrDeepGemmExperts) from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig -from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 -from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( - BatchedTritonExperts) -from vllm.model_executor.layers.fused_moe.layer import TritonExperts -from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( - TritonOrDeepGemmExperts) from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx +from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe from .modular_kernel_tools.common import (Config, RankTensors, WeightTensors, reference_moe_impl, run_modular_kernel) from .modular_kernel_tools.mk_objects import ( 
MK_FUSED_EXPERT_TYPES, MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, - MK_QUANT_CONFIGS, MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES) + MK_QUANT_CONFIGS, MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES, expert_info) from .modular_kernel_tools.parallel_utils import (ProcessGroupInfo, parallel_launch_with_config) -# TODO (varun): These requirements are very strict and could be relaxed. -has_all_packages = (has_deep_ep() and has_deep_gemm() and has_pplx()) +has_any_multi_gpu_package = (has_deep_ep() or has_deep_gemm() or has_pplx() + or has_flashinfer_cutlass_fused_moe()) -meets_package_requirements = pytest.mark.skipif( - not has_all_packages, - reason="Requires deep_ep & deep_gemm & pplx packages", +meets_multi_gpu_requirements = pytest.mark.skipif( + not has_any_multi_gpu_package, + reason="Requires deep_ep or deep_gemm or pplx or flashinfer packages", ) +def format_result(verbose, msg, ex=None): + if ex is not None: + x = str(ex) + newx = x.strip(" \n\t")[:16] + if len(newx) < len(x): + newx = newx + " ..." + + prefix = "E\t" + print(f"{textwrap.indent(traceback.format_exc(), prefix)}") + print(f"FAILED {msg} - {newx}\n") + elif verbose: + print(f"PASSED {msg}") + else: + print(".", end="") + + def rank_worker( pgi: ProcessGroupInfo, vllm_config: VllmConfig, cpu_group, config: Config, weights: WeightTensors, + verbose: bool, ): current_platform.seed_everything(pgi.rank) @@ -61,39 +73,64 @@ def rank_worker( TOPKs = config.topks assert isinstance(TOPKs, list) - for m, topk in product(Ms, TOPKs): - print(f"Running m={m}, topk={topk} ...") - # override m and topk - cfgx = copy.deepcopy(config) - cfgx.Ms = m - cfgx.topks = topk - - # inputs for rank - rank_tensors = RankTensors.make(cfgx, pgi) - - # modular kernel out - mk_out = run_modular_kernel(pgi, vllm_config, cfgx, weights, - rank_tensors) + exceptions = [] + count = 0 - with set_current_vllm_config(vllm_config): - ref_out = reference_moe_impl(cfgx, weights, rank_tensors) - - torch.testing.assert_close(ref_out, mk_out, atol=3e-2, rtol=3e-2) - - -def run(config: Config): + for m, topk in product(Ms, TOPKs): + try: + print(f"Running[{pgi.rank}]: m={m}, topk={topk} ...") + count = count + 1 + # override m and topk + cfgx = copy.deepcopy(config) + cfgx.Ms = m + cfgx.topks = topk + + # inputs for rank + rank_tensors = RankTensors.make(cfgx, pgi) + + # modular kernel out + mk_out = run_modular_kernel(pgi, vllm_config, cfgx, weights, + rank_tensors) + + with set_current_vllm_config(vllm_config): + ref_out = reference_moe_impl(cfgx, weights, rank_tensors) + + if config.quant_dtype == "nvfp4": + atol = 1e-1 + rtol = 1e-1 + else: + atol = 3e-2 + rtol = 3e-2 + + torch.testing.assert_close(ref_out, mk_out, atol=atol, rtol=rtol) + format_result(verbose, config.describe()) + except Exception as ex: + format_result(verbose, config.describe(), ex) + exceptions.append(ex) + + if len(exceptions) > 0: + raise RuntimeError( + f"{len(exceptions)} of {count} tests failed in child process, " + f"rank={pgi.rank}.") + else: + print(f"{count} of {count} tests passed in child process, " + f"rank={pgi.rank}.") + + +def run(config: Config, verbose: bool): assert config.is_valid() - print(f"Testing config \n{config.describe()} ...") weights: WeightTensors = WeightTensors.make(config) vllm_config, env_dict = config.make_env_data() parallel_launch_with_config(config.world_size, rank_worker, vllm_config, - env_dict, config, weights) + env_dict, config, weights, verbose) Ms = [32, 64] -Ks = [7168] # hidden sizes +# hidden sizes, making this too large will cause fp4 tests to fail. 
+# Also needs to be a multiple of 1024 for deep_gemm. +Ks = [2048] Ns = [2048] TOPKs = [4, 1] Es = [32] @@ -103,19 +140,16 @@ def run(config: Config): def is_nyi_config(config: Config) -> bool: # We know these configs to be legitimate. but still fail. + info = expert_info(config.fused_experts_type) - if (config.fused_experts_type in [ - BatchedTritonExperts, BatchedTritonOrDeepGemmExperts, - TritonExperts, TritonOrDeepGemmExperts - ]): + if info.needs_matching_quant: # The triton kernels expect both per-act-token-quant and # per-out-ch-quant or neither. unsupported_quant_config = ((config.is_per_act_token_quant + config.is_per_out_ch_quant) == 1) return unsupported_quant_config - # cutlass kernels dont support expert_maps yet. - return config.fused_experts_type == CutlassExpertsFp8 + return not info.supports_expert_map @pytest.mark.parametrize("k", Ks) @@ -128,13 +162,13 @@ def is_nyi_config(config: Config) -> bool: product(MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES)) @pytest.mark.parametrize("fused_moe_chunk_size", FUSED_MOE_CHUNK_SIZEs) @pytest.mark.parametrize("world_size", [2]) -@meets_package_requirements +@meets_multi_gpu_requirements def test_modular_kernel_combinations_multigpu( k: int, n: int, e: int, dtype: torch.dtype, - quant_config: FusedMoEQuantConfig, + quant_config: Optional[FusedMoEQuantConfig], combination: tuple[mk.FusedMoEPrepareAndFinalize, mk.FusedMoEPermuteExpertsUnpermute], - fused_moe_chunk_size: Optional[int], world_size: int): + fused_moe_chunk_size: Optional[int], world_size: int, pytestconfig): config = Config( Ms=Ms, @@ -149,14 +183,15 @@ def test_modular_kernel_combinations_multigpu( fused_moe_chunk_size=fused_moe_chunk_size, world_size=world_size, ) + if not config.is_valid(): pytest.skip(f"Tests config {config} is not valid. Skipping ...") if is_nyi_config(config): pytest.skip(f"Tests config {config} is nyi. Skipping ...") - print(f"{config.describe()}") - run(config) + verbosity = pytestconfig.getoption('verbose') + run(config, verbosity > 0) @pytest.mark.parametrize("k", Ks) @@ -169,13 +204,12 @@ def test_modular_kernel_combinations_multigpu( product(MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES)) @pytest.mark.parametrize("fused_moe_chunk_size", FUSED_MOE_CHUNK_SIZEs) @pytest.mark.parametrize("world_size", [1]) -@meets_package_requirements def test_modular_kernel_combinations_singlegpu( k: int, n: int, e: int, dtype: torch.dtype, - quant_config: FusedMoEQuantConfig, + quant_config: Optional[FusedMoEQuantConfig], combination: tuple[mk.FusedMoEPrepareAndFinalize, mk.FusedMoEPermuteExpertsUnpermute], - fused_moe_chunk_size: Optional[int], world_size: int): + fused_moe_chunk_size: Optional[int], world_size: int, pytestconfig): config = Config( Ms=Ms, K=k, @@ -196,7 +230,8 @@ def test_modular_kernel_combinations_singlegpu( if is_nyi_config(config): pytest.skip(f"Tests config {config} is nyi. 
Skipping ...") - run(config) + verbosity = pytestconfig.getoption('verbose') + run(config, verbosity > 0) if __name__ == '__main__': @@ -211,4 +246,4 @@ def test_modular_kernel_combinations_singlegpu( args = parser.parse_args() config = make_config(args) - run(config) + run(config, True) diff --git a/tests/kernels/moe/test_nvfp4_moe.py b/tests/kernels/moe/test_nvfp4_moe.py index 3ff385360299..30388ef9375d 100644 --- a/tests/kernels/moe/test_nvfp4_moe.py +++ b/tests/kernels/moe/test_nvfp4_moe.py @@ -3,6 +3,7 @@ import pytest import torch +from tests.kernels.moe.utils import make_test_weights from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, dequantize_nvfp4_to_dtype) @@ -43,41 +44,20 @@ def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int, VllmConfig(parallel_config=ParallelConfig( pipeline_parallel_size=1))): - a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 - w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 quant_blocksize = 16 - round_up = lambda x, y: (x + y - 1) // y * y - sf_w1_2n = round_up(2 * n, 128) - sf_w1_k = round_up(k // quant_blocksize, 4) - w1_blockscale = torch.empty((e, sf_w1_2n, sf_w1_k), - device="cuda", - dtype=torch.float8_e4m3fn) - - w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 - sf_w2_k = round_up(k, 128) - sf_w2_n = round_up(n // quant_blocksize, 4) - w2_blockscale = torch.empty((e, sf_w2_k, sf_w2_n), - device="cuda", - dtype=torch.float8_e4m3fn) - - w1_q = torch.empty((e, 2 * n, k // 2), - device="cuda", - dtype=torch.uint8) - w2_q = torch.empty((e, k, n // 2), device="cuda", dtype=torch.uint8) - w1_gs = torch.empty((e, ), device="cuda", dtype=torch.float32) - w2_gs = torch.empty((e, ), device="cuda", dtype=torch.float32) - - for expert in range(e): - w1_amax = torch.abs(w1).max().to(torch.float32) - w2_amax = torch.abs(w2).max().to(torch.float32) - w1_gs[expert] = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w1_amax - w2_gs[expert] = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w2_amax - - w1_q[expert], w1_blockscale[expert] = ops.scaled_fp4_quant( - w1[expert], w1_gs[expert]) - - w2_q[expert], w2_blockscale[expert] = ops.scaled_fp4_quant( - w2[expert], w2_gs[expert]) + + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + + (_, w1_q, w1_blockscale, + w1_gs), (_, w2_q, w2_blockscale, w2_gs) = make_test_weights( + e, + n, + k, + in_dtype=dtype, + quant_dtype="nvfp4", + block_shape=None, # use quant_blocksize? 
+ per_act_token_quant=False, + ) score = torch.randn((m, e), device="cuda", dtype=dtype) topk_weights, topk_ids, _ = fused_topk(a, @@ -88,6 +68,11 @@ def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int, a1_gs = torch.ones((e, ), device="cuda", dtype=torch.float32) a2_gs = torch.ones((e, ), device="cuda", dtype=torch.float32) + assert w1_gs is not None + assert w2_gs is not None + assert w1_blockscale is not None + assert w2_blockscale is not None + cutlass_output = cutlass_moe_fp4( a=a, a1_gscale=a1_gs, @@ -104,14 +89,13 @@ def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int, n=n, k=k, e=e, - device=a.device, ) # Reference check: a_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(a.flatten(), dim=-1)).to(torch.float32) a_fp4, a_scale_interleaved = ops.scaled_fp4_quant(a, a_global_scale) - _, m_k = a_fp4.shape + a_in_dtype = dequantize_nvfp4_to_dtype(a_fp4, a_scale_interleaved, a_global_scale, @@ -126,14 +110,14 @@ def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int, w1_d[idx] = dequantize_nvfp4_to_dtype(w1_q[idx], w1_blockscale[idx], w1_gs[idx], - dtype=w1.dtype, - device=w1.device, + dtype=dtype, + device=w1_q.device, block_size=quant_blocksize) w2_d[idx] = dequantize_nvfp4_to_dtype(w2_q[idx], w2_blockscale[idx], w2_gs[idx], - dtype=w2.dtype, - device=w2.device, + dtype=dtype, + device=w2_q.device, block_size=quant_blocksize) torch_output = torch_moe(a_in_dtype, w1_d, w2_d, score, topk) diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index e4f4a393dfd5..f98937ee6c52 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -9,7 +9,8 @@ from tests.kernels.utils import torch_experts from vllm import _custom_ops as ops from vllm.config import VllmConfig, set_current_vllm_config -from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 +from vllm.model_executor.layers.fused_moe.cutlass_moe import ( + CutlassBatchedExpertsFp8) from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEModularKernel) @@ -123,12 +124,8 @@ def pplx_cutlass_moe( num_local_experts=num_local_experts, num_dispatchers=num_dispatchers) - experts = CutlassExpertsFp8(num_local_experts, - out_dtype, - per_act_token, - per_out_ch, - num_dispatchers=num_dispatchers, - use_batched_format=True) + experts = CutlassBatchedExpertsFp8(num_local_experts, num_dispatchers, + out_dtype, per_act_token, per_out_ch) fused_cutlass_experts = FusedMoEModularKernel( prepare_finalize, diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index fbef6706beaf..c2064de97358 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -770,7 +770,7 @@ def test_pplx_moe_slow( a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10 score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16) - _, w1, w1_s, _, w2, w2_s = make_test_weights( + (_, w1, w1_s, _), (_, w2, w2_s, _) = make_test_weights( e, n, k, @@ -836,7 +836,7 @@ def format_result(msg, ex=None): args = dict() if make_weights: - _, w1, w1_s, _, w2, w2_s = make_test_weights( + (_, w1, w1_s, _), (_, w2, w2_s, _) = make_test_weights( e, n, k, diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index c33134981acc..82960bd57345 100644 --- a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -1,11 
+1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional +from typing import Optional, Union import torch import vllm._custom_ops as ops from tests.kernels.quant_utils import per_block_cast_to_int8 +from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX, + FLOAT8_E4M3_MAX) from vllm.model_executor.layers.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts) @@ -169,28 +171,41 @@ def make_quantized_test_activations( def moe_quantize_weights( w: torch.Tensor, w_s: Optional[torch.Tensor], - quant_dtype: Optional[torch.dtype], + quant_dtype: Union[torch.dtype, str, None], per_token_quant: bool, block_shape: Optional[list[int]], -) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - assert (quant_dtype == torch.float8_e4m3fn - or quant_dtype == torch.int8), "only fp8/int8 supported" +) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + assert (quant_dtype == torch.float8_e4m3fn or quant_dtype == torch.int8 + or quant_dtype == "nvfp4"), "only fp8/int8/nvfp4 supported" + + w_gs = None if block_shape is not None: assert not per_token_quant if quant_dtype == torch.int8: w, w_s = per_block_cast_to_int8(w, block_shape) - else: + elif quant_dtype == torch.float8_e4m3fn: w, w_s = per_block_cast_to_fp8(w, block_shape) + elif quant_dtype == "nvfp4": + raise RuntimeError("blocked quantization not supported for nvfp4") + else: + raise RuntimeError(f"Unsupported quant type {quant_dtype}") else: if quant_dtype == torch.int8: w, w_s = ops.scaled_int8_quant( w, w_s, use_per_token_if_dynamic=per_token_quant) - else: + elif quant_dtype == torch.float8_e4m3fn: w, w_s = ops.scaled_fp8_quant( w, w_s, use_per_token_if_dynamic=per_token_quant) + elif quant_dtype == "nvfp4": + assert not per_token_quant + w_amax = torch.abs(w).max().to(torch.float32) + w_gs = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w_amax + w, w_s = ops.scaled_fp4_quant(w, w_gs) + else: + raise RuntimeError(f"Unsupported quant type {quant_dtype}") - return w, w_s + return w, w_s, w_gs def make_test_weight( @@ -198,21 +213,26 @@ def make_test_weight( rows: int, cols: int, in_dtype: torch.dtype = torch.bfloat16, - quant_dtype: Optional[torch.dtype] = None, + quant_dtype: Union[torch.dtype, str, None] = None, block_shape: Optional[list[int]] = None, per_act_token_quant: bool = False, -) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: +) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], + Optional[torch.Tensor]]: w_16 = torch.randn((e, rows, cols), device="cuda", dtype=in_dtype) / 15 + w_gs = None if quant_dtype is not None: w_l = [None] * e w_s_l = [None] * e + w_gs_l = [None] * e for idx in range(e): - w_l[idx], w_s_l[idx] = moe_quantize_weights( + w_l[idx], w_s_l[idx], w_gs_l[idx] = moe_quantize_weights( w_16[idx], None, quant_dtype, per_act_token_quant, block_shape) w = torch.stack(w_l) w_s = torch.stack(w_s_l) + if e > 0 and w_gs_l[0] is not None: + w_gs = torch.stack(w_gs_l) if w_s.ndim == 2: assert w_s.shape[-1] == 1 w_s = w_s.view(-1, 1, 1) @@ -225,8 +245,9 @@ def make_test_weight( else: w = w_16 w_s = None + w_gs = None - return w_16, w, w_s + return w_16, w, w_s, w_gs def make_test_weights( @@ -234,14 +255,30 @@ def make_test_weights( n: int, k: int, in_dtype: torch.dtype = torch.bfloat16, - quant_dtype: Optional[torch.dtype] = None, + quant_dtype: Union[torch.dtype, str, None] = None, 
block_shape: Optional[list[int]] = None, per_act_token_quant: bool = False, -) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], torch.Tensor, - torch.Tensor, Optional[torch.Tensor]]: +) -> tuple[tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], + Optional[torch.Tensor]], + tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], + Optional[torch.Tensor]]]: return ( - *make_test_weight(e, 2 * n, k, in_dtype, quant_dtype, block_shape, - per_act_token_quant), - *make_test_weight(e, k, n, in_dtype, quant_dtype, block_shape, - per_act_token_quant), + make_test_weight(e, 2 * n, k, in_dtype, quant_dtype, block_shape, + per_act_token_quant), + make_test_weight(e, k, n, in_dtype, quant_dtype, block_shape, + per_act_token_quant), ) + + +def per_token_cast_to_fp8( + x: torch.Tensor, + block_size: int = 128) -> tuple[torch.Tensor, torch.Tensor]: + assert x.dim() == 2 + m, n = x.shape + pad_size = (block_size - (n % block_size)) % block_size + x = torch.nn.functional.pad(x, + (0, pad_size), value=0) if pad_size > 0 else x + x_view = x.view(m, -1, block_size) + x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) + fp8_data = (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn) + return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1) diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py index 127a340fc6c6..9e5aa4e4c2a8 100644 --- a/vllm/distributed/device_communicators/base_device_communicator.py +++ b/vllm/distributed/device_communicators/base_device_communicator.py @@ -105,7 +105,8 @@ def __init__(self, # we initialize the all2all manager used in expert parallel. use_ep = config.parallel_config.data_parallel_size > 1 - self.use_all2all = "ep" in unique_name and use_ep + self.is_ep_communicator = "ep" in unique_name + self.use_all2all = self.is_ep_communicator and use_ep self.all2all_manager: Optional[All2AllManagerBase] = None def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: @@ -246,7 +247,7 @@ def prepare_communication_buffer_for_model(self, """ Prepare the communication buffer for the model. 
""" - if not self.use_all2all: + if not self.is_ep_communicator: return moe_modules = [ @@ -254,7 +255,7 @@ def prepare_communication_buffer_for_model(self, if module.__class__.__name__ == "FusedMoE" ] for module in moe_modules: - module.quant_method.init_prepare_finalize(module.moe_config) + module.quant_method.init_prepare_finalize() def dispatch( self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 3d40879b4ccb..3007643d7a28 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -49,7 +49,8 @@ def get_config() -> Optional[dict[str, Any]]: from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501 BatchedTritonOrDeepGemmExperts) from vllm.model_executor.layers.fused_moe.cutlass_moe import ( - CutlassExpertsFp8, cutlass_moe_fp4, cutlass_moe_fp8) + CutlassBatchedExpertsFp8, CutlassExpertsFp8, cutlass_moe_fp4, + cutlass_moe_fp8) from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( DeepGemmExperts) from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( @@ -69,6 +70,7 @@ def get_config() -> Optional[dict[str, Any]]: "cutlass_moe_fp8", "cutlass_moe_fp4", "CutlassExpertsFp8", + "CutlassBatchedExpertsFp8", "TritonExperts", "BatchedTritonExperts", "DeepGemmExperts", diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py index c48a0137c306..d9cfe96f7a03 100644 --- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Optional import torch @@ -254,18 +254,28 @@ def workspace_shapes( output = (num_experts, max_num_tokens * num_dispatchers, K) return (workspace13, workspace2, output, a.dtype) - def apply(self, output: torch.Tensor, hidden_states: torch.Tensor, - w1: torch.Tensor, w2: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.Tensor, activation: str, global_num_experts: int, - expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, - workspace2: torch.Tensor, - expert_tokens_meta: Optional[mk.ExpertTokensMetadata], - apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]]): + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: Optional[mk.ExpertTokensMetadata], + apply_router_weight_on_input: bool, + ): assert expert_tokens_meta is not None expert_num_tokens = expert_tokens_meta.expert_num_tokens diff --git a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py 
b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py index fc30e84e6656..89d7412ee223 100644 --- a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Optional import torch @@ -132,18 +132,28 @@ def workspace_shapes( a, aq, M, N, K, topk, global_num_experts, local_num_experts, expert_tokens_metadata) - def apply(self, output: torch.Tensor, hidden_states: torch.Tensor, - w1: torch.Tensor, w2: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.Tensor, activation: str, global_num_experts: int, - expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, - workspace2: torch.Tensor, - expert_tokens_meta: Optional[mk.ExpertTokensMetadata], - apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]]): + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: Optional[mk.ExpertTokensMetadata], + apply_router_weight_on_input: bool, + ): experts = (self.batched_deep_gemm_experts if self.allow_deep_gemm else self.batched_triton_experts) assert experts is not None @@ -151,4 +161,4 @@ def apply(self, output: torch.Tensor, hidden_states: torch.Tensor, activation, global_num_experts, expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1q_scale, a2_scale, workspace13, workspace2, expert_tokens_meta, - apply_router_weight_on_input, extra_expert_args) + apply_router_weight_on_input) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 31ea826f1f97..7c1a7b636a9c 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -45,7 +45,6 @@ def get_quant_config_weight_quant( return _get_quant_config_quantization_args(quant_config, "weights") -# TODO (bnell): use scalar_type instead of bools? def get_config_quant_dtype( use_fp8_w8a8: bool, use_int8_w8a8: bool, @@ -65,7 +64,8 @@ def get_config_quant_dtype( @dataclass class FusedMoEQuantConfig: # The post quantization activation type. - quant_dtype: Optional[torch.dtype] = None + # TODO (bnell): use scalar_type instead of Union. + quant_dtype: Union[torch.dtype, str, None] = None per_act_token_quant: bool = False per_out_ch_quant: bool = False block_shape: Optional[list[int]] = None @@ -141,6 +141,7 @@ def make( use_int8_w8a8, use_int8_w8a16, use_int4_w4a16, + use_mxfp4_w4a4, ] ]) <= 1, "Quantization flags are mutually exclusive." 
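
Since quant_dtype is widened above to Union[torch.dtype, str, None], the string
"nvfp4" flows through the same field that previously only held torch dtypes,
and equality checks against a torch dtype simply evaluate to False for it.
A minimal sketch of consuming the widened field; describe_quant is a
hypothetical helper, not part of this patch:

    import torch

    from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig

    def describe_quant(cfg: FusedMoEQuantConfig) -> str:
        # Branch on the widened quant_dtype field.
        if cfg.quant_dtype is None:
            return "unquantized"
        if cfg.quant_dtype == "nvfp4":
            # nvfp4 carries per-block scales plus a per-expert global scale.
            return "nvfp4"
        if cfg.quant_dtype == torch.float8_e4m3fn:
            return "fp8 (blocked)" if cfg.block_shape is not None else "fp8"
        return str(cfg.quant_dtype)

    nvfp4_cfg = FusedMoEQuantConfig(quant_dtype="nvfp4",
                                    per_out_ch_quant=False,
                                    per_act_token_quant=False,
                                    block_shape=None)
    assert describe_quant(nvfp4_cfg) == "nvfp4"
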
@@ -334,7 +335,7 @@ def __post_init__(self): assert self.max_num_tokens > 0 @property - def quant_dtype(self) -> Optional[torch.dtype]: + def quant_dtype(self) -> Union[torch.dtype, str, None]: if self.quant_config is not None: return self.quant_config.quant_dtype else: @@ -429,7 +430,7 @@ def make( block_shape = None per_act_token_quant = False per_out_ch_quant = False - quant_dtype: Optional[torch.dtype] = None + quant_dtype: Union[torch.dtype, str, None] = None input_quant = get_quant_config_input_quant(quant_config) weight_quant = get_quant_config_weight_quant(quant_config) @@ -453,7 +454,7 @@ def make( ModelOptNvFp4Config) if quant_dtype is None and isinstance(quant_config, ModelOptNvFp4Config): - quant_dtype = torch.uint8 + quant_dtype = "nvfp4" if weight_quant is not None: per_out_ch_quant = ( diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 2585a2953c9d..0a02b558d09e 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ CUTLASS based Fused MoE kernels.""" -from typing import Any, Callable, Optional +from typing import Callable, Optional import torch @@ -12,11 +12,10 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import ( MoEPrepareAndFinalizeNoEP) from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( - TopKWeightAndReduceDelegate) + TopKWeightAndReduceDelegate, TopKWeightAndReduceNoOP) from vllm.model_executor.layers.fused_moe.utils import (_fp8_perm, _fp8_quantize, - _resize_cache, - extract_required_args) + _resize_cache) from vllm.scalar_type import scalar_types logger = init_logger(__name__) @@ -213,19 +212,14 @@ def run_cutlass_moe_fp8( output.copy_(c3[c_map].view(M * topk, K), non_blocking=True) -# TODO (bnell): split class batched vs. non-batched? -# maybe remove need for passing aq to workspace_shapes -class CutlassExpertsFp8(mk.FusedMoEPermuteExpertsUnpermute): +class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): def __init__( self, - max_experts_per_worker: int, out_dtype: Optional[torch.dtype], per_act_token_quant: bool, per_out_ch_quant: bool, block_shape: Optional[list[int]] = None, - num_dispatchers: Optional[int] = None, - use_batched_format: bool = False, ): super().__init__( FusedMoEQuantConfig( @@ -234,33 +228,84 @@ def __init__( per_out_ch_quant=per_out_ch_quant, block_shape=block_shape, )) - assert max_experts_per_worker > 0 - assert not use_batched_format or num_dispatchers is not None - self.max_experts_per_worker = max_experts_per_worker - self.num_dispatchers = num_dispatchers self.out_dtype = out_dtype - self.use_batched_format = use_batched_format + + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + # Let PrepareAndFinalize::finalize() decide the impl. 
+ return TopKWeightAndReduceDelegate() + + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: Optional[mk.ExpertTokensMetadata], + apply_router_weight_on_input: bool, + ): + assert w1_zp is None, "w1_zp is not supported in CUTLASS MoE" + assert w2_zp is None, "w2_zp is not supported in CUTLASS MoE" + + expert_num_tokens = None + if expert_tokens_meta is not None: + expert_num_tokens = expert_tokens_meta.expert_num_tokens + + activation_callable = lambda o, i: self.activation(activation, o, i) + + use_batched_format = self.activation_formats[ + 0] == mk.FusedMoEActivationFormat.BatchedExperts + + in_dtype = hidden_states.dtype + run_cutlass_moe_fp8( + output, hidden_states, w1, w2, topk_ids, activation_callable, + global_num_experts, expert_map, w1_scale, w2_scale, a1q_scale, + a2_scale, workspace13, workspace2, expert_num_tokens, + self.out_dtype if self.out_dtype is not None else in_dtype, + self.per_act_token_quant, self.per_out_ch_quant, + use_batched_format) + + +class CutlassExpertsFp8(CutlassExpertsFp8Base): + + def __init__( + self, + out_dtype: Optional[torch.dtype], + per_act_token_quant: bool, + per_out_ch_quant: bool, + block_shape: Optional[list[int]] = None, + ): + super().__init__( + out_dtype, + per_act_token_quant, + per_out_ch_quant, + block_shape, + ) @property def activation_formats( self ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: - if self.use_batched_format: - return (mk.FusedMoEActivationFormat.BatchedExperts, - mk.FusedMoEActivationFormat.BatchedExperts) - else: - return (mk.FusedMoEActivationFormat.Standard, - mk.FusedMoEActivationFormat.Standard) + return (mk.FusedMoEActivationFormat.Standard, + mk.FusedMoEActivationFormat.Standard) def supports_chunking(self) -> bool: - return not self.use_batched_format + return True def supports_expert_map(self) -> bool: - return not self.use_batched_format - - def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: - # Let PrepareAndFinalize::finalize() decide the impl. - return TopKWeightAndReduceDelegate() + return True def workspace_shapes( self, @@ -274,54 +319,69 @@ def workspace_shapes( local_num_experts: int, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: - workspace1: tuple[int, ...] = () - workspace2: tuple[int, ...] = () - output: tuple[int, ...] 
= () - if self.use_batched_format: - padded_M = aq.size(1) - num_dp = self.num_dispatchers - assert num_dp is not None - workspace1 = (self.max_experts_per_worker, padded_M * num_dp, - max(N, K)) - workspace2 = (self.max_experts_per_worker, padded_M * num_dp, - (N // 2)) - output = (self.max_experts_per_worker, padded_M, K) - else: - workspace1 = (M * topk, max(N, K)) - workspace2 = (M * topk, N // 2) - output = (M * topk, K) + workspace1 = (M * topk, max(N, K)) + workspace2 = (M * topk, N // 2) + output = (M * topk, K) return (workspace1, workspace2, output, self.out_dtype if self.out_dtype is not None else a.dtype) - def apply(self, output: torch.Tensor, hidden_states: torch.Tensor, - w1: torch.Tensor, w2: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.Tensor, activation: str, global_num_experts: int, - expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, - workspace2: torch.Tensor, - expert_tokens_meta: Optional[mk.ExpertTokensMetadata], - apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]]): - assert w1_zp is None, "w1_zp is not supported in CUTLASS MoE" - assert w2_zp is None, "w2_zp is not supported in CUTLASS MoE" - expert_num_tokens = None - if expert_tokens_meta is not None: - expert_num_tokens = expert_tokens_meta.expert_num_tokens +class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base): - activation_callable = lambda o, i: self.activation(activation, o, i) + def __init__( + self, + max_experts_per_worker: int, + num_dispatchers: int, + out_dtype: Optional[torch.dtype], + per_act_token_quant: bool, + per_out_ch_quant: bool, + block_shape: Optional[list[int]] = None, + ): + super().__init__( + out_dtype, + per_act_token_quant, + per_out_ch_quant, + block_shape, + ) + assert max_experts_per_worker > 0 + self.max_experts_per_worker = max_experts_per_worker + self.num_dispatchers = num_dispatchers - in_dtype = hidden_states.dtype - run_cutlass_moe_fp8( - output, hidden_states, w1, w2, topk_ids, activation_callable, - global_num_experts, expert_map, w1_scale, w2_scale, a1q_scale, - a2_scale, workspace13, workspace2, expert_num_tokens, - self.out_dtype if self.out_dtype is not None else in_dtype, - self.per_act_token_quant, self.per_out_ch_quant, - self.use_batched_format) + @property + def activation_formats( + self + ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: + return (mk.FusedMoEActivationFormat.BatchedExperts, + mk.FusedMoEActivationFormat.BatchedExperts) + + def supports_chunking(self) -> bool: + return False + + def supports_expert_map(self) -> bool: + return False + + # TODO(bnell): maybe remove need for passing aq to workspace_shapes + def workspace_shapes( + self, + a: torch.Tensor, + aq: torch.Tensor, + M: int, + N: int, + K: int, + topk: int, + global_num_experts: int, + local_num_experts: int, + expert_tokens_meta: Optional[mk.ExpertTokensMetadata], + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: + padded_M = aq.size(1) + num_dp = self.num_dispatchers + assert num_dp is not None + workspace1 = (self.max_experts_per_worker, padded_M * num_dp, + max(N, K)) + workspace2 = (self.max_experts_per_worker, padded_M * num_dp, (N // 2)) + output = (self.max_experts_per_worker, padded_M, K) + return (workspace1, workspace2, output, + self.out_dtype if self.out_dtype is not 
None else a.dtype) def cutlass_moe_fp8( @@ -387,11 +447,9 @@ def cutlass_moe_fp8( fn = mk.FusedMoEModularKernel( MoEPrepareAndFinalizeNoEP(), CutlassExpertsFp8( - max_experts_per_worker=num_experts, out_dtype=a.dtype, per_act_token_quant=per_act_token, per_out_ch_quant=per_out_ch, - use_batched_format=False, ), ) @@ -476,8 +534,9 @@ def run_cutlass_moe_fp4( e_w1, nx2_w1, half_k_w1 = w1_fp4.shape e_w2, k_w2, half_n_w2 = w2_fp4.shape - assert (e_w1 == e_w2 and e_w1 == e), ("Number of experts must match", - " between weights.") + assert (e_w1 == e_w2 + and e_w1 == e), ("Number of experts must match", + f" between weights. {e_w1}, {e_w2}, {e}") assert (k_a == half_k_w1 * 2 and k == k_w2), ("Hidden size mismatch between a, w1 and w2") assert (nx2_w1 == n * 2 and half_n_w2 * 2 == n), ("mismatch in " @@ -554,6 +613,10 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute): def __init__( self, + g1_alphas: torch.Tensor, + g2_alphas: torch.Tensor, + a1_gscale: torch.Tensor, + a2_gscale: torch.Tensor, max_experts_per_worker: int, out_dtype: torch.dtype, per_act_token_quant: bool, @@ -562,8 +625,12 @@ def __init__( use_batched_format: bool = False, ): super().__init__( + # NVFP4 requires two levels of quantization, which involves + # computing some scaling factors dynamically. This makes it + # incompatible with the typical prepare -> MoE -> finalize + # pipeline. Move the quantization logic into the MoE body. FusedMoEQuantConfig( - quant_dtype=torch.uint8, + quant_dtype=None, # skip quantization in prepare/finalize per_act_token_quant=per_act_token_quant, per_out_ch_quant=per_out_ch_quant, block_shape=block_shape, @@ -572,6 +639,12 @@ def __init__( self.out_dtype = out_dtype self.use_batched_format = use_batched_format + # TODO(bnell): put this stuff into quant config? + self.g1_alphas = g1_alphas + self.g2_alphas = g2_alphas + self.a1_gscale = a1_gscale + self.a2_gscale = a2_gscale + @property def activation_formats( self @@ -590,8 +663,7 @@ def supports_chunking(self) -> bool: return True def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: - # Let PrepareAndFinalize::finalize() decide the impl. 
- return TopKWeightAndReduceDelegate() + return TopKWeightAndReduceNoOP() def workspace_shapes( self, @@ -620,34 +692,42 @@ def workspace_shapes( return (workspace1, workspace2, output, self.out_dtype if self.out_dtype is not None else a.dtype) - def apply(self, output: torch.Tensor, hidden_states: torch.Tensor, - w1: torch.Tensor, w2: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.Tensor, activation: str, global_num_experts: int, - expert_map: Optional[torch.Tensor], w1_scale: torch.Tensor, - w2_scale: torch.Tensor, w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: torch.Tensor, workspace13: Optional[torch.Tensor], - workspace2: Optional[torch.Tensor], - expert_tokens_meta: Optional[mk.ExpertTokensMetadata], - apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]]): - required_keys = [ - "g1_alphas", "g2_alphas", "a1_gscale", "a2_gscale", "m", "n", "k", - "e", "device" - ] - (g1_alphas, g2_alphas, a1_gscale, a2_gscale, m, n, k, e, - device) = extract_required_args(extra_expert_args, required_keys) + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: torch.Tensor, + workspace13: Optional[torch.Tensor], + workspace2: Optional[torch.Tensor], + expert_tokens_meta: Optional[mk.ExpertTokensMetadata], + apply_router_weight_on_input: bool, + ): + e, m, n, k, _ = mk._moe_problem_size(hidden_states, w1, w2, topk_ids) + n = w2.shape[2] * 2 + run_cutlass_moe_fp4( output=output, a=hidden_states, - a1_gscale=a1_gscale, + a1_gscale=self.a1_gscale, w1_fp4=w1, w1_blockscale=w1_scale, - w1_alphas=g1_alphas, - a2_gscale=a2_gscale, + w1_alphas=self.g1_alphas, + a2_gscale=self.a2_gscale, w2_fp4=w2, w2_blockscale=w2_scale, - w2_alphas=g2_alphas, + w2_alphas=self.g2_alphas, topk_weights=topk_weights, topk_ids=topk_ids, workspace13=workspace13, @@ -656,7 +736,7 @@ def apply(self, output: torch.Tensor, hidden_states: torch.Tensor, n=n, k=k, e=e, - device=device, + device=hidden_states.device, apply_router_weight_on_input=apply_router_weight_on_input, ) @@ -677,7 +757,6 @@ def cutlass_moe_fp4( n: int, k: int, e: int, - device: torch.device, expert_map: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False) -> torch.Tensor: assert expert_map is None, ("Expert Parallelism / expert_map " @@ -686,6 +765,10 @@ def cutlass_moe_fp4( fn = mk.FusedMoEModularKernel( MoEPrepareAndFinalizeNoEP(), CutlassExpertsFp4( + g1_alphas, + g2_alphas, + a1_gscale, + a2_gscale, max_experts_per_worker=e, out_dtype=a.dtype, per_act_token_quant=False, @@ -693,29 +776,7 @@ def cutlass_moe_fp4( use_batched_format=False, ), ) - extra_expert_args = { - 'g1_alphas': g1_alphas, - 'g2_alphas': g2_alphas, - 'a1_gscale': a1_gscale, - 'a2_gscale': a2_gscale, - 'm': m, - 'n': n, - 'k': k, - 'e': e, - 'device': device, - } - - # NVFP4 requires two levels of quantization, which involves computing some - # scaling factors dynamically. This makes it incompatible with the typical - # prepare -> MoE -> finalize pipeline. Move the quantization logic into the - # MoE body. - extra_prepare_args = { - 'skip_quant': True, - } - # Similar reason as above. 
- extra_finalize_args = { - 'skip_weight_reduce': True, - } + return fn( hidden_states=a, w1=w1_fp4, @@ -731,9 +792,6 @@ def cutlass_moe_fp4( a1_scale=None, a2_scale=None, apply_router_weight_on_input=apply_router_weight_on_input, - extra_expert_args=extra_expert_args, - extra_prepare_args=extra_prepare_args, - extra_finalize_args=extra_finalize_args, ) @@ -824,16 +882,6 @@ def run_cutlass_block_scaled_fused_experts( k = w1_q.size(1) n = w2_q.size(1) - expert_offsets = torch.empty((num_experts + 1, ), - dtype=torch.int32, - device="cuda") - problem_sizes1 = torch.empty((num_experts, 3), - dtype=torch.int32, - device="cuda") - problem_sizes2 = torch.empty((num_experts, 3), - dtype=torch.int32, - device="cuda") - topk = topk_ids.size(1) a_q, a1_scale = _fp8_quantize(a, @@ -842,6 +890,16 @@ def run_cutlass_block_scaled_fused_experts( block_shape=[128, 128]) device = a_q.device + expert_offsets = torch.empty((num_experts + 1, ), + dtype=torch.int32, + device=device) + problem_sizes1 = torch.empty((num_experts, 3), + dtype=torch.int32, + device=device) + problem_sizes2 = torch.empty((num_experts, 3), + dtype=torch.int32, + device=device) + a_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device) c_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device) diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index 9b8175f42a9d..7b8467a5a0cf 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools -from typing import Any, Optional +from typing import Optional import torch from tqdm import tqdm @@ -230,7 +230,6 @@ def apply( workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]], ): assert self.block_shape is not None assert a1q_scale is not None diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py index f6b62254e7b4..437e569d3130 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Optional import deep_ep import torch @@ -127,12 +127,16 @@ def _do_dispatch(self, tokens: torch.Tensor, expert_topk_weights) def prepare( - self, a1: torch.Tensor, a1_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, - topk_ids: torch.Tensor, num_experts: int, - expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - extra_prepare_args: Optional[dict[str, Any]] ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor], Optional[torch.Tensor]]: @@ -187,11 +191,15 @@ def prepare( return (expert_x, expert_x_scale, expert_tokens_meta, expert_topk_ids, 
expert_topk_weights) - def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, - apply_router_weight_on_input: bool, - weight_and_reduce_impl: mk.TopKWeightAndReduce, - extra_finalize_args: Optional[dict[str, Any]]) -> None: + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> None: assert self.handle is not None diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py index cfc2bdcf0240..93ac11fb4bfb 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional, Union +from typing import Optional, Union import deep_ep import torch @@ -77,7 +77,7 @@ def _do_quant( a1_scale: Optional[torch.Tensor], a2_scale: Optional[torch.Tensor], a1_dtype: torch.dtype, - quant_dtype: Optional[torch.dtype], + quant_dtype: Union[torch.dtype, str, None], per_act_token_quant: bool, block_shape: Optional[list[int]], ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: @@ -111,12 +111,16 @@ def _do_quant( return x, x_scales def prepare( - self, a1: torch.Tensor, a1_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, - topk_ids: torch.Tensor, num_experts: int, - expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - extra_prepare_args: Optional[dict[str, Any]] ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor], Optional[torch.Tensor]]: @@ -162,11 +166,15 @@ def prepare( return (expert_x, expert_x_scale, expert_tokens_meta, None, None) - def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, - apply_router_weight_on_input: bool, - weight_and_reduce_impl: mk.TopKWeightAndReduce, - extra_finalize_args: Optional[dict[str, Any]]) -> None: + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> None: assert isinstance( weight_and_reduce_impl, TopKWeightAndReduceDelegate ), ("Weight application and reduction happens in the combine kernel.") diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py index 4e3e15a35ada..3fbe2a0bc69b 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Optional, Union import torch @@ -8,8 +8,7 @@ from vllm.logger 
import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( - TopKWeightAndReduceDelegate) -from vllm.model_executor.layers.fused_moe.utils import extract_required_args + TopKWeightAndReduceNoOP) from vllm.utils.flashinfer import (flashinfer_cutlass_fused_moe, has_flashinfer_cutlass_fused_moe) @@ -20,7 +19,7 @@ def is_valid_flashinfer_cutlass_fused_moe(hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor) -> bool: """ - Check if the given problem size is supported by the FlashInfer CUTLASS MoE + Check if the given problem size is supported by the FlashInfer CUTLASS MoE kernel. """ if not has_flashinfer_cutlass_fused_moe(): @@ -43,31 +42,34 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute): def __init__( self, - use_nvfp4_w4a4: bool = False, - use_fp8_w8a8: bool = False, - use_dp: bool = False, + g1_alphas: torch.Tensor, + g2_alphas: torch.Tensor, + a1_gscale: torch.Tensor, + a2_gscale: torch.Tensor, + out_dtype: torch.dtype, + quant_dtype: Union[torch.dtype, str, None], ep_rank: int = 0, ep_size: int = 1, tp_rank: int = 0, tp_size: int = 1, - num_dispatchers: Optional[int] = None, - use_batched_format: bool = False, ): super().__init__( FusedMoEQuantConfig( - quant_dtype=torch.uint8, + quant_dtype=quant_dtype, per_act_token_quant=False, block_shape=None, )) - self.use_nvfp4_w4a4 = use_nvfp4_w4a4 - self.use_fp8_w8a8 = use_fp8_w8a8 + assert quant_dtype == "nvfp4", ("Only nvfp4 quantization is " + "currently supported.") self.ep_rank = ep_rank self.ep_size = ep_size self.tp_rank = tp_rank self.tp_size = tp_size - self.use_dp = use_dp - assert not use_batched_format or num_dispatchers is not None - self.num_dispatchers = num_dispatchers + self.g1_alphas = g1_alphas + self.g2_alphas = g2_alphas + self.a1_gscale = a1_gscale + self.a2_gscale = a2_gscale + self.out_dtype = out_dtype @property def activation_formats( @@ -84,8 +86,7 @@ def supports_chunking(self) -> bool: return True def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: - # Let PrepareAndFinalize::finalize() decide the impl. - return TopKWeightAndReduceDelegate() + return TopKWeightAndReduceNoOP() def workspace_shapes( self, @@ -117,8 +118,6 @@ def workspace_shapes( - Note: in order for activation chunking to work, the first dimension of each tuple must be the number of tokens. """ - assert self.use_nvfp4_w4a4 is True, ("Only nvfp4 quantization is " - "currently supported.") aq_m, aq_n = aq.shape workspace2 = () output_shape = (aq_m, aq_n * 2) @@ -149,21 +148,9 @@ def apply( workspace2: Optional[torch.Tensor], expert_tokens_meta: Optional[mk.ExpertTokensMetadata], apply_router_weight_on_input: Optional[bool], - extra_expert_args: Optional[dict[str, Any]], ): - assert extra_expert_args is not None, \ - "extra_expert_args must be provided" - required_keys = [ - 'g1_alphas', 'g2_alphas', 'a1_gscale', 'a2_gscale', 'out_dtype' - ] - - g1_alphas, g2_alphas, a1_gscale, a2_gscale, out_dtype = ( - extract_required_args(extra_expert_args, required_keys)) - # Flashinfer CUTLASS kernel takes scalar global scales, # min because inv_scale. 
- assert self.use_nvfp4_w4a4 is True, ("Only nvfp4 quantization is " - "currently supported.") # Ensure w1_scale and w2_scale are not None before calling view assert w1_scale is not None and w2_scale is not None, ( @@ -171,12 +158,12 @@ def apply( "be None for FlashInferExperts") quant_scales = [ - a1_gscale, + self.a1_gscale, w1_scale.view(torch.int32), - g1_alphas, - a2_gscale, + self.g1_alphas, + self.a2_gscale, w2_scale.view(torch.int32), - g2_alphas, + self.g2_alphas, ] _ = flashinfer_cutlass_fused_moe( input=hidden_states, @@ -185,7 +172,7 @@ def apply( # FlashInfer API requires weight to be long for nvfp4 fc1_expert_weights=w1.view(torch.long), fc2_expert_weights=w2.view(torch.long), - output_dtype=out_dtype, + output_dtype=self.out_dtype, quant_scales=quant_scales, input_sf=a1q_scale, tp_size=self.tp_size, diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py index 36aca8cf74b6..061b02172c44 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Optional import torch @@ -9,7 +9,7 @@ from vllm.forward_context import get_forward_context from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.utils import ( - extract_required_args, moe_kernel_quantize_input) + moe_kernel_quantize_input) from vllm.utils.flashinfer import nvfp4_block_scale_interleave @@ -21,16 +21,15 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): def __init__( self, - quant_dtype: Optional[torch.dtype] = None, - per_channel_quant: bool = False, - block_shape: Optional[list[int]] = None, + use_dp: bool, + a1_gscale: Optional[torch.Tensor], num_dispatchers: int = 1, ): super().__init__() - self.per_channel_quant = per_channel_quant - self.block_shape = block_shape - self.quant_dtype = quant_dtype self.num_dispatchers_ = num_dispatchers + self.use_dp = use_dp + self.a1_gscale = a1_gscale + self.local_tokens = None @property def activation_format(self) -> mk.FusedMoEActivationFormat: @@ -55,10 +54,11 @@ def prepare( num_experts: int, expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, + # TODO(bnell): use quant_config + scales instead of ctor args quant_config: FusedMoEQuantConfig, - extra_prepare_args: Optional[dict[str, Any]] - ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], - Optional[torch.Tensor], Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor], + Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor], + Optional[torch.Tensor]]: if apply_router_weight_on_input: topk = topk_ids.size(1) @@ -67,22 +67,22 @@ def prepare( "apply_router_weight_on_input is only implemented for topk=1" a1.mul_(topk_weights.to(a1.dtype)) - (a1_gscale, use_dp, local_tokens) = extract_required_args( - extra_prepare_args, ['a1_gscale', 'use_dp', 'local_tokens']) - a1q, a1q_scale = moe_kernel_quantize_input( a1, - a1_gscale, + self.a1_gscale, quant_config.quant_dtype, - self.per_channel_quant, - self.block_shape, - is_fp4_scale_swizzled=not use_dp, # Swizzling after communication + quant_config.per_act_token_quant, + quant_config.block_shape, + # Swizzling after communication + 
is_fp4_scale_swizzled=not self.use_dp, ) - if use_dp: + if self.use_dp: topk_weights, topk_ids, a1q, a1q_scale = \ - get_dp_group().all_gatherv([topk_weights, topk_ids, a1q, a1q_scale], # noqa: E501 - dim=0, - sizes=get_local_sizes()) + get_dp_group().all_gatherv( + [topk_weights, topk_ids, a1q, a1q_scale], + dim=0, + sizes=get_local_sizes(), + ) a1_m, a1_n = a1q.shape a1q_scale = nvfp4_block_scale_interleave(a1q_scale) @@ -91,13 +91,9 @@ def prepare( def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, apply_router_weight_on_input: bool, - weight_and_reduce_impl: mk.TopKWeightAndReduce, - extra_finalize_args: Optional[dict[str, Any]]) -> None: + weight_and_reduce_impl: mk.TopKWeightAndReduce) -> None: - (use_dp, - local_tokens) = extract_required_args(extra_finalize_args, - ['use_dp', 'local_tokens']) - if use_dp: + if self.use_dp: fused_expert_output = get_dp_group().reduce_scatterv( fused_expert_output, dim=0, sizes=get_local_sizes()) output.copy_(fused_expert_output) diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py index 9a5c85e120cc..b46f4be4b912 100644 --- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Fused batched MoE kernel.""" -from typing import Any, Optional +from typing import Optional import torch @@ -496,12 +496,16 @@ def num_dispatchers(self) -> int: return self.num_dispatchers_ def prepare( - self, a1: torch.Tensor, a1_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, - topk_ids: torch.Tensor, num_experts: int, - expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - extra_prepare_args: Optional[dict[str, Any]] ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor], Optional[torch.Tensor]]: @@ -590,11 +594,15 @@ def prepare( return b_a1, b_a1_scale, expert_tokens_meta, None, None - def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, - apply_router_weight_on_input: bool, - weight_and_reduce_impl: mk.TopKWeightAndReduce, - extra_finalize_args: Optional[dict[str, Any]]) -> None: + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> None: if isinstance(weight_and_reduce_impl, TopKWeightAndReduceDelegate): weight_and_reduce_impl = TopKWeightAndReduceNaiveBatched(self.rank) weight_and_reduce_impl.apply( @@ -688,18 +696,28 @@ def dequant(self, t: torch.Tensor, scale: torch.Tensor) -> torch.Tensor: else: return t.to(f32) * group_broadcast(scale, t.shape) - def apply(self, output: torch.Tensor, hidden_states: torch.Tensor, - w1: torch.Tensor, w2: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.Tensor, activation: str, global_num_experts: int, - expert_map: 
Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, - workspace2: torch.Tensor, - expert_tokens_meta: Optional[mk.ExpertTokensMetadata], - apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]]): + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: Optional[mk.ExpertTokensMetadata], + apply_router_weight_on_input: bool, + ): assert hidden_states.dim() == 3 assert expert_tokens_meta is not None expert_num_tokens = expert_tokens_meta.expert_num_tokens @@ -894,18 +912,28 @@ def workspace_shapes( output = (num_experts, max_num_tokens * num_dp, K) return (workspace13, workspace2, output, a.dtype) - def apply(self, output: torch.Tensor, hidden_states: torch.Tensor, - w1: torch.Tensor, w2: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.Tensor, activation: str, global_num_experts: int, - expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, - workspace2: torch.Tensor, - expert_tokens_meta: Optional[mk.ExpertTokensMetadata], - apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]]): + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: Optional[mk.ExpertTokensMetadata], + apply_router_weight_on_input: bool, + ): # Check constraints. if self.use_int4_w4a16: assert hidden_states.size(-1) // 2 == w1.size(2), ( diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 1c497fa5521b..e58a9e568d4a 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1394,9 +1394,9 @@ def fused_experts(hidden_states: torch.Tensor, # E8M0 scale, which means we requantize the weight and input to the specific # scale. Fallen back to cutlass or triton for some cases would cause # accuracy issue. 
- should_use_deep_gemm = is_blackwell_deep_gemm_e8m0_used( - ) or _valid_deep_gemm(hidden_states, w1, w2) - if (allow_deep_gemm and use_fp8_w8a8 and should_use_deep_gemm): + if (allow_deep_gemm and use_fp8_w8a8 + and (is_blackwell_deep_gemm_e8m0_used() + or _valid_deep_gemm(hidden_states, w1, w2))): assert apply_router_weight_on_input is False assert is_act_and_mul, ( "DeepGemm only supports is_act_and_mul=True for now.") @@ -1905,7 +1905,6 @@ def apply( workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]], ): # Check constraints. if self.use_int4_w4a16: diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index 6b5284dc6c96..312befe2c1d7 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Optional import torch @@ -8,7 +8,6 @@ from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate) -from vllm.model_executor.layers.fused_moe.utils import extract_required_args from vllm.utils import has_triton_kernels logger = init_logger(__name__) @@ -160,12 +159,16 @@ def __init__( num_dispatchers: int, w1_precision: "PrecisionConfig", w2_precision: "PrecisionConfig", + w1_bias: Optional[torch.Tensor], + w2_bias: Optional[torch.Tensor], ): super().__init__(quant_config) self.max_num_tokens = max_num_tokens self.num_dispatchers = num_dispatchers self.w1_precision = w1_precision self.w2_precision = w2_precision + self.w1_bias = w1_bias + self.w2_bias = w2_bias @property def activation_formats( @@ -219,11 +222,7 @@ def apply( workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]], ): - w1_bias, w2_bias = (extract_required_args(extra_expert_args, - ["w1_bias", "w2_bias"])) - return triton_kernel_fused_experts( output, hidden_states, @@ -240,8 +239,8 @@ def apply( expert_map=expert_map, w1_scale=w1_scale, w2_scale=w2_scale, - w1_bias=w1_bias, - w2_bias=w2_bias, + w1_bias=self.w1_bias, + w2_bias=self.w2_bias, w1_precision=self.w1_precision, w2_precision=self.w2_precision, a1_scale=a1q_scale, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 36e75825853e..c3c6e4782750 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -37,7 +37,6 @@ from vllm.platforms.interface import CpuArchEnum from vllm.utils import (direct_register_custom_op, has_deep_ep, has_pplx, round_up) -from vllm.utils.flashinfer import has_flashinfer if current_platform.is_cuda_alike(): from .fused_batched_moe import BatchedTritonExperts @@ -49,9 +48,6 @@ from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize from .deepep_ll_prepare_finalize import (DEEPEP_QUANT_BLOCK_SHAPE, DeepEPLLPrepareAndFinalize) - if has_flashinfer(): - from .flashinfer_cutlass_prepare_finalize import ( - FlashInferCutlassMoEPrepareAndFinalize) else: fused_experts = None # type: ignore FusedMoEPermuteExpertsUnpermute = None # type: ignore @@ -80,7 
+76,12 @@ class FusedMoeWeightScaleSupported(Enum): class FusedMoEMethodBase(QuantizeMethodBase): - moe: FusedMoEConfig + # TODO(bnell): also pass quant_config? + def __init__(self, moe: FusedMoEConfig): + super().__init__() + self.moe = moe + self.fused_experts: Optional[Callable] = None + self.topk_indices_dtype = None @abstractmethod def create_weights(self, layer: torch.nn.Module, num_experts: int, @@ -99,16 +100,16 @@ def uses_weight_scale_2_pattern(self) -> bool: return False @staticmethod - def maybe_make_prepare_finalize( - moe: FusedMoEConfig) -> Optional[FusedMoEPrepareAndFinalize]: + def _maybe_make_prepare_finalize( + moe: FusedMoEConfig, ) -> Optional[FusedMoEPrepareAndFinalize]: all2all_manager = get_ep_group().device_communicator.all2all_manager assert all2all_manager is not None prepare_finalize: Optional[FusedMoEPrepareAndFinalize] = None - if moe.use_flashinfer_cutlass_kernels: - prepare_finalize = FlashInferCutlassMoEPrepareAndFinalize( - quant_dtype=moe.quant_dtype, ) + assert not moe.use_flashinfer_cutlass_kernels, \ + "Must be created in modelopt.py" + if moe.use_pplx_kernels: hidden_dim_bytes, hidden_scale_bytes = pplx_hidden_dim_scale_bytes( moe.max_num_tokens, @@ -188,14 +189,25 @@ def maybe_make_prepare_finalize( return prepare_finalize - def init_prepare_finalize(self, moe: FusedMoEConfig): - self.moe = moe - prepare_finalize = FusedMoEMethodBase.maybe_make_prepare_finalize( - self.moe) + def maybe_make_prepare_finalize( + self, + moe: FusedMoEConfig, + ) -> Optional[FusedMoEPrepareAndFinalize]: + if moe.moe_parallel_config.use_all2all_kernels: + return FusedMoEMethodBase._maybe_make_prepare_finalize(moe) + else: + return None + + def init_prepare_finalize(self): + assert self.moe is not None + prepare_finalize = self.maybe_make_prepare_finalize(self.moe) - self.topk_indices_dtype = None if prepare_finalize is not None: - logger.debug("%s", prepare_finalize.__class__.__name__) + logger.debug("%s for %s(%s)", prepare_finalize.__class__.__name__, + self, id(self)) + assert self.topk_indices_dtype is None + assert self.fused_experts is None, \ + f"Attempt to override experts for {id(self)}!" self.topk_indices_dtype = prepare_finalize.topk_indices_dtype() experts = self.select_gemm_impl(prepare_finalize, self.moe) self.fused_experts = FusedMoEModularKernel( @@ -214,12 +226,6 @@ def select_gemm_impl( f"{self.__class__.__name__} must select appropriate gemm " "implementation based on the prepare_finalize") - def maybe_swap_experts_impl( - self, - moe_parallel_config: FusedMoEParallelConfig, - ): - pass - @abstractmethod def apply( self, @@ -251,10 +257,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): """MoE method without quantization.""" def __init__(self, moe: FusedMoEConfig): - super().__init__() - self.fused_experts = fused_experts # type: ignore - self.topk_indices_dtype = None - self.moe = moe + super().__init__(moe) self.has_bias = self.moe.has_bias self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled() if self.rocm_aiter_moe_enabled: @@ -266,6 +269,7 @@ def __init__(self, moe: FusedMoEConfig): def select_gemm_impl( self, prepare_finalize: FusedMoEPrepareAndFinalize, + # TODO(bnell): Remove. Every layer should have an moe config object. 
moe: FusedMoEConfig, ) -> FusedMoEPermuteExpertsUnpermute: if (prepare_finalize.activation_format == @@ -474,12 +478,30 @@ def forward_cuda( expert_map=expert_map, activation=activation, apply_router_weight_on_input=apply_router_weight_on_input) + elif self.fused_experts is not None: + if self.has_bias: + raise ValueError( + "FusedMoEModularKernel does not support bias.") + return self.fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=expert_map, + ) else: - # add w1_bias/w2_bias to kwargs if they exist - kwargs = dict( + assert fused_experts is not None + return fused_experts( hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, + w1_bias=layer.w13_bias if self.has_bias else None, + w2_bias=layer.w2_bias if self.has_bias else None, topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, @@ -488,17 +510,6 @@ def forward_cuda( global_num_experts=global_num_experts, expert_map=expert_map, ) - if isinstance(self.fused_experts, - FusedMoEModularKernel) and self.has_bias: - raise ValueError( - "FusedMoEModularKernel does not support bias.") - if self.has_bias: - kwargs.update({ - "w1_bias": getattr(layer, "w13_bias", None), - "w2_bias": getattr(layer, "w2_bias", None), - }) - - return self.fused_experts(**kwargs) def forward_cpu( self, @@ -868,8 +879,6 @@ def __init__( moe_quant_params["intermediate_size_full"] = intermediate_size self.quant_method.create_weights(layer=self, **moe_quant_params) - if isinstance(self.quant_method, FusedMoEMethodBase): - self.quant_method.maybe_swap_experts_impl(self.moe_parallel_config) # Chunked all2all staging tensor self.batched_hidden_states: Optional[torch.Tensor] = None diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 6262904e4dca..2ea6383d5ae9 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from enum import Enum from math import prod -from typing import Any, Optional, final +from typing import Optional, final import torch @@ -150,15 +150,23 @@ class FusedMoEPrepareAndFinalize(ABC): @abstractmethod def prepare( - self, a1: torch.Tensor, a1_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, - topk_ids: torch.Tensor, num_experts: int, - expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - extra_prepare_args: Optional[dict[str, Any]] - ) -> tuple[torch.Tensor, Optional[torch.Tensor], - Optional[ExpertTokensMetadata], Optional[torch.Tensor], - Optional[torch.Tensor]]: + ) -> tuple[ + torch.Tensor, + Optional[torch.Tensor], + Optional[ExpertTokensMetadata], + Optional[torch.Tensor], + Optional[torch.Tensor], + ]: """ Perform any quantization (and/or) dispatching needed for this kernel. 
@@ -186,11 +194,15 @@ def prepare( raise NotImplementedError @abstractmethod - def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, - apply_router_weight_on_input: bool, - weight_and_reduce_impl: TopKWeightAndReduce, - extra_finalize_args: Optional[dict[str, Any]]) -> None: + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: TopKWeightAndReduce, + ) -> None: """ Perform any combine plus apply weights and perform a reduction on the fused experts output. @@ -368,7 +380,6 @@ def apply( workspace2: torch.Tensor, expert_tokens_meta: Optional[ExpertTokensMetadata], apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]], ): """ This function computes the intermediate result of a Mixture of Experts @@ -454,18 +465,27 @@ def __init__( f"{fused_experts.activation_formats[0]}") def _do_fused_experts( - self, fused_out: Optional[torch.Tensor], a1: torch.Tensor, - a1q: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, global_num_experts: int, local_num_experts: int, - expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], w2_scale: Optional[torch.Tensor], - w1_zp: Optional[torch.Tensor], w2_zp: Optional[torch.Tensor], - a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], - expert_tokens_meta: Optional[ExpertTokensMetadata], - apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]]) -> torch.Tensor: + self, + fused_out: Optional[torch.Tensor], + a1: torch.Tensor, + a1q: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + local_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + expert_tokens_meta: Optional[ExpertTokensMetadata], + apply_router_weight_on_input: bool, + ) -> torch.Tensor: _, M, N, K, top_k = _moe_problem_size(a1q, w1, w2, topk_ids) @@ -509,7 +529,7 @@ def _do_fused_experts( workspace2=workspace2, expert_tokens_meta=expert_tokens_meta, apply_router_weight_on_input=apply_router_weight_on_input, - extra_expert_args=extra_expert_args) + ) return fused_out @@ -533,7 +553,6 @@ def _maybe_chunk_fused_experts( a2_scale: Optional[torch.Tensor], expert_tokens_meta: Optional[ExpertTokensMetadata], apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]], ) -> torch.Tensor: _, M, N, K, top_k = _moe_problem_size(a1q, w1, w2, topk_ids) @@ -541,6 +560,9 @@ def _maybe_chunk_fused_experts( CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE num_chunks = cdiv(M, CHUNK_SIZE) + # TODO(bnell): get rid of one level here, update slice functions + # to nops on num_chunks==1 + if not self.fused_experts.supports_chunking() or num_chunks == 1: return self._do_fused_experts( fused_out=None, @@ -562,7 +584,7 @@ def _maybe_chunk_fused_experts( a2_scale=a2_scale, expert_tokens_meta=expert_tokens_meta, apply_router_weight_on_input=apply_router_weight_on_input, - extra_expert_args=extra_expert_args) + ) # Chunking required case assert num_chunks > 1 @@ -618,15 +640,6 @@ def slice_expert_tokens_metadata( 
expert_num_tokens=c_expert_num_tokens, expert_num_tokens_cpu=c_expert_num_tokens_cpu) - m = None - if extra_expert_args is not None and 'm' in extra_expert_args: - m = extra_expert_args.get('m') - - if extra_expert_args is not None: - chunked_extra_expert_args = extra_expert_args - else: - chunked_extra_expert_args = {} - for chunk_idx in range(num_chunks): c_a1q, c_a1q_scale, c_a2_scale, c_topk_ids, c_topk_weights = ( slice_input_tensors(chunk_idx)) @@ -637,11 +650,6 @@ def slice_expert_tokens_metadata( expert_tokens_meta, c_topk_ids, local_num_experts, expert_map) - s = chunk_idx * CHUNK_SIZE - e = min(s + CHUNK_SIZE, M) - - if m is not None: - chunked_extra_expert_args['m'] = e - s self._do_fused_experts( fused_out=slice_output_tensor(chunk_idx), a1=a1, @@ -662,7 +670,7 @@ def slice_expert_tokens_metadata( a2_scale=c_a2_scale, expert_tokens_meta=c_expert_tokens_meta, apply_router_weight_on_input=apply_router_weight_on_input, - extra_expert_args=chunked_extra_expert_args) + ) return fused_out @@ -684,9 +692,6 @@ def forward( a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, - extra_expert_args: Optional[dict] = None, - extra_prepare_args: Optional[dict] = None, - extra_finalize_args: Optional[dict] = None, ) -> torch.Tensor: """ This function computes a Mixture of Experts (MoE) layer using two sets @@ -719,12 +724,6 @@ def forward( - apply_router_weight_on_input (bool): When true, the topk weights are applied directly on the inputs. This is only applicable when topk is 1. - - extra_expert_args (Optional[dict]): Extra keyword arguments to pass to - fused_experts.apply. - - extra_prepare_args (Optional[dict]): Extra keyword arguments to pass - to prepare. - - extra_finalize_args (Optional[dict]): Extra keyword arguments to pass - to finalize. Returns: - torch.Tensor: The output tensor after applying the MoE layer. @@ -748,7 +747,6 @@ def forward( expert_map, apply_router_weight_on_input, self.fused_experts.quant_config, - extra_prepare_args, ) # Maybe prepare gathered topk_ids and topk_weights from other EP ranks. 
@@ -786,12 +784,15 @@ def forward( a2_scale=a2_scale, expert_tokens_meta=expert_tokens_meta, apply_router_weight_on_input=apply_router_weight_on_input, - extra_expert_args=extra_expert_args) + ) self.prepare_finalize.finalize( - output, fused_out, topk_weights, topk_ids, + output, + fused_out, + topk_weights, + topk_ids, apply_router_weight_on_input, self.fused_experts.finalize_weight_and_reduce_impl(), - extra_finalize_args) + ) return output diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py index 46931f2dd7c7..401f37922b7b 100644 --- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Optional, Union import pplx_kernels as pplx import torch @@ -21,7 +21,7 @@ def pplx_hidden_dim_scale_bytes( max_num_tokens: int, hidden_dim: int, in_dtype: torch.dtype, - quant_dtype: Optional[torch.dtype], + quant_dtype: Union[torch.dtype, str, None], per_act_token_quant: bool, block_shape: Optional[list[int]], ): @@ -32,6 +32,7 @@ def pplx_hidden_dim_scale_bytes( # ceil_div(hidden_dim, block_size) * sizeof(float32) # For per-token: set to 4 * sizeof(float32) (x4 for alignment) if quant_dtype is not None: + assert isinstance(quant_dtype, torch.dtype) assert quant_dtype.itemsize == 1 hidden_dim_bytes = hidden_dim * quant_dtype.itemsize elem_size = torch.float32.itemsize @@ -89,12 +90,16 @@ def num_dispatchers(self) -> int: return self.num_dispatchers_ def prepare( - self, a1: torch.Tensor, a1_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, - topk_ids: torch.Tensor, num_experts: int, - expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - extra_prepare_args: Optional[dict[str, Any]] ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor], Optional[torch.Tensor]]: @@ -213,11 +218,15 @@ def prepare( return expert_x, expert_x_scale, expert_tokens_meta, None, None - def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, - apply_router_weight_on_input: bool, - weight_and_reduce_impl: mk.TopKWeightAndReduce, - extra_finalize_args: Optional[dict[str, Any]]) -> None: + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> None: assert isinstance( weight_and_reduce_impl, TopKWeightAndReduceDelegate ), ("Weight application and reduction happens in the combine kernel.") diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py index 696c7cdba9a7..567a0a88fec0 100644 --- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Optional import torch @@ -38,7 +38,6 @@ def prepare( expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - extra_prepare_args: Optional[dict[str, Any]], ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor], Optional[torch.Tensor]]: @@ -50,32 +49,26 @@ def prepare( "apply_router_weight_on_input is only implemented for topk=1" a1.mul_(topk_weights.to(a1.dtype)) - if (extra_prepare_args is not None - and extra_prepare_args.get("skip_quant", True)): - # Skip quantization if explicitly requested - return a1, None, None, None, None - a1q, a1q_scale = moe_kernel_quantize_input( a1, a1_scale, quant_config.quant_dtype, quant_config.per_act_token_quant, quant_config.block_shape) return a1q, a1q_scale, None, None, None - def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, - apply_router_weight_on_input: bool, - weight_and_reduce_impl: mk.TopKWeightAndReduce, - extra_finalize_args: Optional[dict[str, Any]]) -> None: - if (extra_finalize_args is not None - and extra_finalize_args.get("skip_weight_reduce", True)): - assert output.shape == fused_expert_output.shape - output.copy_(fused_expert_output) - else: - if isinstance(weight_and_reduce_impl, TopKWeightAndReduceDelegate): - weight_and_reduce_impl = TopKWeightAndReduceContiguous() - weight_and_reduce_impl.apply( - output=output, - fused_expert_output=fused_expert_output, - topk_weights=topk_weights, - topk_ids=topk_ids, - apply_router_weight_on_input=apply_router_weight_on_input) + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> None: + if isinstance(weight_and_reduce_impl, TopKWeightAndReduceDelegate): + weight_and_reduce_impl = TopKWeightAndReduceContiguous() + weight_and_reduce_impl.apply( + output=output, + fused_expert_output=fused_expert_output, + topk_weights=topk_weights, + topk_ids=topk_ids, + apply_router_weight_on_input=apply_router_weight_on_input) diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py index 9d0ff2e06190..486ca881df48 100644 --- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Optional import torch @@ -119,18 +119,28 @@ def workspace_shapes( local_num_experts, expert_tokens_meta) - def apply(self, output: torch.Tensor, hidden_states: torch.Tensor, - w1: torch.Tensor, w2: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.Tensor, activation: str, global_num_experts: int, - expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, - workspace2: torch.Tensor, - expert_tokens_meta: Optional[mk.ExpertTokensMetadata], - apply_router_weight_on_input: bool, - extra_expert_args: 
Optional[dict[str, Any]]): + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: Optional[mk.ExpertTokensMetadata], + apply_router_weight_on_input: bool, + ): use_deep_gemm = (self.allow_deep_gemm and (_valid_deep_gemm(hidden_states, w1, w2) or is_blackwell_deep_gemm_e8m0_used())) @@ -158,5 +168,4 @@ def apply(self, output: torch.Tensor, hidden_states: torch.Tensor, workspace2, expert_tokens_meta, apply_router_weight_on_input, - extra_expert_args, ) diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py index 966471b5c59b..4c3e700ad399 100644 --- a/vllm/model_executor/layers/fused_moe/utils.py +++ b/vllm/model_executor/layers/fused_moe/utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from math import prod -from typing import Any, Optional, Union +from typing import Optional, Union import torch @@ -189,7 +189,7 @@ def moe_kernel_quantize_input( return _fp8_quantize(A, A_scale, per_act_token_quant, block_shape) elif quant_dtype == torch.int8: return _int8_quantize(A, A_scale, per_act_token_quant, block_shape) - elif quant_dtype == torch.uint8: # nvfp4 + elif quant_dtype == "nvfp4": return _fp4_quantize(A, A_scale, is_sf_swizzled_layout=is_fp4_scale_swizzled) @@ -252,17 +252,3 @@ def _validate_scale_shape( assert block_shape is not None expected = (a.shape[0], cdiv(a.shape[1], block_shape[1])) assert a_scale.shape == expected, f"{a_scale.shape} == {expected}" - - -def extract_required_args( - extra_args: Optional[dict[str, Any]], - required_keys: list[str], -) -> tuple[Any, ...]: - if extra_args is None: - raise ValueError("`extra_args` must be provided.") - - missing_keys = [k for k in required_keys if k not in extra_args] - if missing_keys: - raise ValueError(f"Missing keys in `extra_args`: {missing_keys}") - - return tuple(extra_args[k] for k in required_keys) diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py index a9e967e608e9..fb285413ba9e 100644 --- a/vllm/model_executor/layers/quantization/auto_round.py +++ b/vllm/model_executor/layers/quantization/auto_round.py @@ -241,7 +241,7 @@ def apply_awq_quant_layer(self, layer, prefix: str, backend: str = "auto"): if isinstance(layer, FusedMoE): if use_marlin: - return AWQMoEMethod(quant_args_marlin) + return AWQMoEMethod(quant_args_marlin, layer.moe) from vllm.model_executor.layers.quantization.moe_wna16 import ( MoeWNA16Config) @@ -339,7 +339,7 @@ def apply_gptq_quant_layer(self, } return MoeWNA16Config.from_config(config).get_quant_method( layer, prefix) - return GPTQMarlinMoEMethod(quant_args_marlin) + return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe) if isinstance(layer, (LinearBase, ParallelLMHead)): if use_marlin: diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index fe42e26a1706..af602eb9aca3 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ 
b/vllm/model_executor/layers/quantization/awq.py @@ -113,7 +113,7 @@ def get_quant_method( } awq_marlin_config = AWQMarlinConfig.from_config( marlin_compatible_config_dict) - return AWQMoEMethod(awq_marlin_config) + return AWQMoEMethod(awq_marlin_config, layer.moe_config) return None diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index ed7ffb21e85a..287d66b06d6e 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -10,7 +10,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported, + FusedMoE, FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported, UnquantizedFusedMoEMethod) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod, @@ -151,7 +151,7 @@ def get_quant_method(self, layer: torch.nn.Module, "Falling back to Moe WNA16 kernels.") return MoeWNA16Config.from_config( self.full_config).get_quant_method(layer, prefix) - return AWQMoEMethod(self) + return AWQMoEMethod(self, layer.moe_config) return None @classmethod @@ -328,7 +328,12 @@ def apply( class AWQMoEMethod(FusedMoEMethodBase): - def __init__(self, quant_config: AWQMarlinConfig): + def __init__( + self, + quant_config: AWQMarlinConfig, + moe: FusedMoEConfig, + ): + super().__init__(moe) self.quant_config = quant_config if self.quant_config.weight_bits != 4: raise ValueError("AWQMoEMethod only supports 4bit now.") @@ -500,6 +505,8 @@ def apply( logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError( "EPLB not supported for `AWQMoEMethod` yet.") @@ -516,7 +523,8 @@ def apply( num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) return torch.ops.vllm.fused_marlin_moe( x, @@ -535,4 +543,4 @@ def apply( expert_map=expert_map, w1_zeros=layer.w13_qzeros, w2_zeros=layer.w2_qzeros, - workspace=layer.workspace) \ No newline at end of file + workspace=layer.workspace) diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 0204ff46852f..b7897a43793c 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -7,6 +7,7 @@ from packaging import version from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, + FusedMoEConfig, FusedMoEMethodBase) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod, @@ -132,7 +133,7 @@ def get_quant_method( return UnquantizedLinearMethod() return BitsAndBytesLinearMethod(self) elif isinstance(layer, FusedMoE): - return BitsAndBytesMoEMethod(self) + return BitsAndBytesMoEMethod(self, layer.moe_config) return None @@ -411,7 +412,12 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): quant_config: The BitsAndBytes quantization config. 
""" - def __init__(self, quant_config: BitsAndBytesConfig): + def __init__( + self, + quant_config: BitsAndBytesConfig, + moe: FusedMoEConfig, + ): + super().__init__(moe) try: import bitsandbytes if version.parse( @@ -422,7 +428,6 @@ def __init__(self, quant_config: BitsAndBytesConfig): raise ImportError("Please install bitsandbytes>=0.46.1 via " "`pip install bitsandbytes>=0.46.1` to use " "bitsandbytes quantizer.") from err - self.topk_indices_dtype = None self.quant_config = quant_config def create_weights( @@ -470,6 +475,7 @@ def apply( logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe import fused_experts + assert self.fused_experts is None if enable_eplb: raise NotImplementedError( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 839942beaf40..42c43cbc03e5 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -11,20 +11,21 @@ QuantizationStrategy) import vllm.envs as envs +import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import ( FusedMoE, FusedMoEActivationFormat, FusedMoEConfig, FusedMoEMethodBase, FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize, FusedMoeWeightScaleSupported) -from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa - FlashInferCutlassMoEPrepareAndFinalize) +from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( + is_valid_flashinfer_cutlass_fused_moe) from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import ( # noqa WNA16_SUPPORTED_BITS, WNA16_SUPPORTED_TYPES_MAP) from vllm.model_executor.layers.quantization.utils import replace_parameter from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( - build_flashinfer_fp4_cutlass_moe_kernel, - flashinfer_fp4_cutlass_moe_forward, reorder_w1w3_to_w3w1) + build_flashinfer_fp4_cutlass_moe_prepare_finalize, reorder_w1w3_to_w3w1, + select_nvfp4_gemm_impl) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( check_moe_marlin_supports_layer, marlin_make_workspace_new, marlin_moe_permute_scales) @@ -58,6 +59,9 @@ class GPTQMarlinState(Enum): class CompressedTensorsMoEMethod(FusedMoEMethodBase): + def __init_(self, moe: FusedMoEConfig): + super().__init__(moe) + @staticmethod def get_moe_method( quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 @@ -81,18 +85,22 @@ def get_moe_method( "WNA16MoE is not supported with actorder=group/dynamic." 
) logger.info_once("Using CompressedTensorsWNA16MoEMethod") - return CompressedTensorsWNA16MoEMethod(quant_config) + return CompressedTensorsWNA16MoEMethod(quant_config, + layer.moe_config) else: logger.info_once("Using CompressedTensorsWNA16MarlinMoEMethod") - return CompressedTensorsWNA16MarlinMoEMethod(quant_config) + return CompressedTensorsWNA16MarlinMoEMethod( + quant_config, layer.moe_config) elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant): - return CompressedTensorsW4A4MoeMethod() + return CompressedTensorsW4A4MoeMethod(layer.moe_config, layer) elif (quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant) or quant_config._is_fp8_w8a8_sm100(weight_quant, input_quant) or quant_config._is_fp8_w8a8(weight_quant, input_quant)): - return CompressedTensorsW8A8Fp8MoEMethod(quant_config) + return CompressedTensorsW8A8Fp8MoEMethod(quant_config, + layer.moe_config) elif quant_config._is_dynamic_token_w8a8(weight_quant, input_quant): - return CompressedTensorsW8A8Int8MoEMethod(quant_config) + return CompressedTensorsW8A8Int8MoEMethod(quant_config, + layer.moe_config) else: raise RuntimeError( f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}") @@ -100,15 +108,16 @@ def get_moe_method( class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): - def __init__(self): + def __init__(self, moe: FusedMoEConfig, layer: torch.nn.Module): from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501 detect_nvfp4_moe_support) + super().__init__(moe) _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__) self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported self.allow_flashinfer = _nvfp4.allow_flashinfer self.use_marlin = _nvfp4.use_marlin self.group_size = 16 - self.fused_experts = None # type: ignore[assignment] + self.layer = layer def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -265,19 +274,36 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w2_input_scale_quant = torch.nn.Parameter( (layer.w2_input_global_scale), requires_grad=False) - def maybe_swap_experts_impl(self, moe_parallel_config): + def maybe_make_prepare_finalize( + self, + moe: FusedMoEConfig, + ) -> Optional[mk.FusedMoEPrepareAndFinalize]: if not self.allow_flashinfer: - return - self.fused_experts = build_flashinfer_fp4_cutlass_moe_kernel( - moe_parallel_config) + return super().maybe_make_prepare_finalize(moe) - def select_gemm_impl(self, prepare_finalize, moe): - """Return the appropriate GEMM experts implementation.""" - assert moe is not None and prepare_finalize is not None - from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( # noqa: E501 - select_nvfp4_gemm_impl) + prepare_finalize = build_flashinfer_fp4_cutlass_moe_prepare_finalize( + moe, + a1_gscale=self.layer.w13_input_scale_quant, + ) + logger.debug_once("%s", prepare_finalize.__class__.__name__) + return prepare_finalize - return select_nvfp4_gemm_impl(self.allow_flashinfer, moe, logger) + def select_gemm_impl( + self, + prepare_finalize: mk.FusedMoEPrepareAndFinalize, + moe: FusedMoEConfig, + ) -> mk.FusedMoEPermuteExpertsUnpermute: + """Return the appropriate GEMM experts implementation.""" + experts = select_nvfp4_gemm_impl( + moe, + g1_alphas=self.layer.g1_alphas, + g2_alphas=self.layer.g2_alphas, + a1_gscale=self.layer.w13_input_scale_quant, + a2_gscale=self.layer.w2_input_scale_quant, + allow_flashinfer=self.allow_flashinfer, + ) + logger.debug_once("Using %s", 
experts.__class__.__name__) + return experts def apply( self, @@ -301,6 +327,8 @@ def apply( logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError("EPLB not supported for " "`CompressedTensorsW4A4MoeMethod` yet.") @@ -317,6 +345,7 @@ def apply( custom_routing_function=custom_routing_function, scoring_func=scoring_func, e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, ) if self.use_marlin: @@ -340,15 +369,22 @@ def apply( # FlashInfer fused experts path if self.fused_experts is not None: - return flashinfer_fp4_cutlass_moe_forward( - self.fused_experts, - layer, - x, - topk_weights, - topk_ids, + assert is_valid_flashinfer_cutlass_fused_moe( + x, layer.w13_weight, layer.w2_weight), ( + "Flashinfer CUTLASS Fused MoE not applicable!") + + return self.fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=False, # TODO(shuw): fix later, now output is high prec activation=activation, global_num_experts=global_num_experts, expert_map=expert_map, + w1_scale=layer.w13_blockscale_swizzled, + w2_scale=layer.w2_blockscale_swizzled, apply_router_weight_on_input=apply_router_weight_on_input, ) @@ -376,7 +412,6 @@ def apply( n=layer.w2_weight.shape[2] * 2, k=x.shape[1], e=layer.w13_weight.shape[0], - device=x.device, apply_router_weight_on_input=apply_router_weight_on_input).to( x.dtype) @@ -384,15 +419,16 @@ def apply( class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): def __init__( - self, - quant_config: "CompressedTensorsConfig" # type: ignore # noqa E501 + self, + quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 + moe: FusedMoEConfig, ): + super().__init__(moe) self.quant_config = quant_config self.weight_quant = self.quant_config.target_scheme_map["Linear"].get( "weights") self.input_quant = self.quant_config.target_scheme_map["Linear"].get( "input_activations") - self.topk_indices_dtype = None per_tensor = (self.weight_quant.strategy == QuantizationStrategy.TENSOR and self.input_quant.strategy @@ -429,7 +465,6 @@ def __init__( self.weight_quant, self.input_quant) self.use_cutlass = (quant_config._is_fp8_w8a8_sm90( self.weight_quant, self.input_quant) or self.is_fp8_w8a8_sm100) - self.fused_experts = None # type: ignore[assignment] self.disable_expert_map = False def create_weights(self, layer: torch.nn.Module, num_experts: int, @@ -614,25 +649,31 @@ def select_gemm_impl( ) -> FusedMoEPermuteExpertsUnpermute: # cutlass path if self.use_cutlass: - from vllm.model_executor.layers.fused_moe import CutlassExpertsFp8 + from vllm.model_executor.layers.fused_moe import ( + CutlassBatchedExpertsFp8, CutlassExpertsFp8) - use_batched_format = (prepare_finalize.activation_format == - FusedMoEActivationFormat.BatchedExperts) + experts: FusedMoEPermuteExpertsUnpermute num_dispatchers = prepare_finalize.num_dispatchers() - num_experts = (moe.num_local_experts - if use_batched_format else moe.num_experts) - - logger.debug("CutlassExpertsFp8(%s)", self.__class__.__name__) - experts = CutlassExpertsFp8( - num_experts, - moe.in_dtype, - self.input_quant.strategy == QuantizationStrategy.TOKEN, - self.weight_quant.strategy == QuantizationStrategy.CHANNEL, - num_dispatchers=num_dispatchers, - use_batched_format=use_batched_format, - ) + if (prepare_finalize.activation_format == + 
FusedMoEActivationFormat.BatchedExperts): + logger.debug("CutlassBatchedExpertsFp8(%s)", + self.__class__.__name__) + experts = CutlassBatchedExpertsFp8( + moe.num_local_experts, + num_dispatchers, + moe.in_dtype, + self.input_quant.strategy == QuantizationStrategy.TOKEN, + self.weight_quant.strategy == QuantizationStrategy.CHANNEL, + ) + else: + logger.debug("CutlassExpertsFp8(%s)", self.__class__.__name__) + experts = CutlassExpertsFp8( + moe.in_dtype, + self.input_quant.strategy == QuantizationStrategy.TOKEN, + self.weight_quant.strategy == QuantizationStrategy.CHANNEL, + ) self.disable_expert_map = (num_dispatchers > 1 or not experts.supports_expert_map()) @@ -834,9 +875,11 @@ def apply( class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): def __init__( - self, - quant_config: "CompressedTensorsConfig" # type: ignore # noqa E501 + self, + quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 + moe: FusedMoEConfig, ): + super().__init__(moe) self.quant_config = quant_config self.weight_quant = self.quant_config.target_scheme_map["Linear"].get( "weights") @@ -934,6 +977,8 @@ def apply( logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError( "EPLB not supported for " @@ -951,7 +996,8 @@ def apply( num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) return fused_experts( hidden_states=x, @@ -975,9 +1021,11 @@ def apply( class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): def __init__( - self, - quant_config: "CompressedTensorsConfig" # type: ignore # noqa E501 + self, + quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 + moe: FusedMoEConfig, ): + super().__init__(moe) self.quant_config = quant_config # TODO: @dsikka: refactor this to use schemes as other kernels # are supported + check if the layer is being ignored. @@ -1233,6 +1281,8 @@ def apply( logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError( "EPLB not supported for " @@ -1251,7 +1301,8 @@ def apply( num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) return torch.ops.vllm.fused_marlin_moe( x, @@ -1279,9 +1330,11 @@ def apply( class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): def __init__( - self, - quant_config: "CompressedTensorsConfig" # type: ignore # noqa E501 + self, + quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 + moe: FusedMoEConfig, ): + super().__init__(moe) self.quant_config = quant_config # TODO: @dsikka: refactor this to use schemes as other kernels # are supported + check if the layer is being ignored. 
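# The hunks above and below all follow one refactor: every FusedMoEMethodBase
# subclass now takes the layer's FusedMoEConfig, forwards it to the base class,
# and threads `self.topk_indices_dtype` into expert selection.  A minimal
# sketch of that pattern, not part of this patch; `SketchMoEMethod`, `MyConfig`
# and the trimmed `apply()` signature are illustrative placeholders.

import torch

from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig,
                                                  FusedMoEMethodBase)


class SketchMoEMethod(FusedMoEMethodBase):

    def __init__(self, quant_config: "MyConfig", moe: FusedMoEConfig):
        # The base class now owns shared state such as `self.fused_experts`
        # and `self.topk_indices_dtype`, so it must be initialized with `moe`.
        super().__init__(moe)
        self.quant_config = quant_config

    def apply(self, layer: torch.nn.Module, x: torch.Tensor,
              router_logits: torch.Tensor, top_k: int,
              renormalize: bool, **kwargs) -> torch.Tensor:
        # Methods without a modular-kernel path assert one was never built.
        assert self.fused_experts is None
        topk_weights, topk_ids = FusedMoE.select_experts(
            hidden_states=x,
            router_logits=router_logits,
            top_k=top_k,
            use_grouped_topk=False,
            renormalize=renormalize,
            # New in this series: the router returns topk_ids in the dtype
            # the downstream kernel expects.
            indices_type=self.topk_indices_dtype,
        )
        # Dispatch to the quantized fused-experts kernel would go here.
        raise NotImplementedError("sketch only")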
@@ -1459,6 +1512,8 @@ def apply( logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError("EPLB not supported for " "`CompressedTensorsWNA16MoEMethod` yet.") @@ -1475,7 +1530,8 @@ def apply( num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) return fused_experts( x, diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 47eca80609e0..3e43caa4cbf7 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -6,7 +6,8 @@ import torch from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group -from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase +from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, + FusedMoEMethodBase) from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) from vllm.model_executor.layers.quantization import QuantizationMethods @@ -46,13 +47,18 @@ def get_quant_method(self, layer: torch.nn.Module, if isinstance(layer, LinearBase): return UnquantizedLinearMethod() elif isinstance(layer, FusedMoE): - return ExpertsInt8MoEMethod(self) + return ExpertsInt8MoEMethod(self, layer.moe_config) return None class ExpertsInt8MoEMethod(FusedMoEMethodBase): - def __init__(self, quant_config: ExpertsInt8Config): + def __init__( + self, + quant_config: ExpertsInt8Config, + moe: FusedMoEConfig, + ): + super().__init__(moe) self.quant_config = quant_config def create_weights(self, layer: torch.nn.Module, num_experts: int, @@ -122,6 +128,8 @@ def apply( logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError( "EPLB not supported for `ExpertsInt8MoEMethod` yet.") @@ -138,7 +146,8 @@ def apply( num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) return fused_experts( x, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index dbd523428695..a49744913251 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import functools from typing import TYPE_CHECKING, Any, Callable, Optional import torch @@ -142,7 +141,7 @@ def get_quant_method(self, layer: torch.nn.Module, return UnquantizedLinearMethod() return Fp8LinearMethod(self) elif isinstance(layer, FusedMoE): - return Fp8MoEMethod(self) + return Fp8MoEMethod(self, layer.moe_config) elif isinstance(layer, Attention): return Fp8KVCacheMethod(self) return None @@ -479,9 +478,8 @@ class Fp8MoEMethod(FusedMoEMethodBase): quant_config: The quantization config. 
""" - def __init__(self, quant_config: Fp8Config): - - from vllm.model_executor.layers.fused_moe import fused_experts + def __init__(self, quant_config: Fp8Config, moe: FusedMoEConfig): + super().__init__(moe) self.quant_config = quant_config self.block_quant = self.quant_config.weight_block_size is not None @@ -529,15 +527,6 @@ def __init__(self, quant_config: Fp8Config): "CutlassBlockScaledGroupedGemm not supported on the current " "platform.") - self.topk_indices_dtype = None - self.fused_experts = functools.partial( # type: ignore - fused_experts, - use_fp8_w8a8=True, - block_shape=self.quant_config.weight_block_size, - allow_deep_gemm=self.allow_deep_gemm, - allow_cutlass_block_scaled_grouped_gemm=( - self.allow_cutlass_block_scaled_grouped_gemm)) - def create_weights(self, layer: Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): @@ -1033,7 +1022,7 @@ def apply( num_expert_group=num_expert_group, topk_group=topk_group, apply_router_weight_on_input=apply_router_weight_on_input) - else: + elif self.fused_experts is not None: return self.fused_experts( hidden_states=x, w1=layer.w13_weight, @@ -1052,6 +1041,30 @@ def apply( a1_scale=layer.w13_input_scale, a2_scale=layer.w2_input_scale, ) + else: + from vllm.model_executor.layers.fused_moe import fused_experts + return fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + global_num_experts=global_num_experts, + apply_router_weight_on_input=apply_router_weight_on_input, + expert_map=expert_map, + w1_scale=(layer.w13_weight_scale_inv + if self.block_quant else layer.w13_weight_scale), + w2_scale=(layer.w2_weight_scale_inv + if self.block_quant else layer.w2_weight_scale), + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + use_fp8_w8a8=True, + block_shape=self.quant_config.weight_block_size, + allow_deep_gemm=self.allow_deep_gemm, + allow_cutlass_block_scaled_grouped_gemm=( + self.allow_cutlass_block_scaled_grouped_gemm)) class Fp8KVCacheMethod(BaseKVCacheMethod): diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index 86da04c39989..49d28927d6e7 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -11,6 +11,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, + FusedMoEConfig, FusedMoEMethodBase) from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase from vllm.model_executor.layers.quantization import QuantizationMethods @@ -58,7 +59,7 @@ def get_quant_method(self, layer: torch.nn.Module, elif isinstance(layer, VocabParallelEmbedding): return GGUFEmbeddingMethod(self) elif isinstance(layer, FusedMoE): - return GGUFMoEMethod(self) + return GGUFMoEMethod(self, layer.moe_config) return None @@ -445,7 +446,12 @@ class GGUFMoEMethod(FusedMoEMethodBase): quant_config: The GGUF quantization config. 
""" - def __init__(self, quant_config: GGUFConfig): + def __init__( + self, + quant_config: GGUFConfig, + moe: FusedMoEConfig, + ): + super().__init__(moe) self.quant_config = quant_config def create_weights(self, layer: torch.nn.Module, num_experts: int, @@ -525,6 +531,8 @@ def apply( logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ): + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError( "EPLB not supported for `GGUFMoEMethod` yet.") @@ -545,7 +553,8 @@ def apply( num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) return fused_moe_gguf(x, layer.w13_qweight, layer.w2_qweight, topk_weights, topk_ids, layer.w13_qweight_type.weight_type, diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 3299221e3af3..bd14ab9ef6c6 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -10,7 +10,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported, + FusedMoE, FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported, UnquantizedFusedMoEMethod) from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) @@ -375,7 +375,12 @@ def apply( class GPTQMarlinMoEMethod(FusedMoEMethodBase): """MoE Marlin method with quantization.""" - def __init__(self, quant_config: GPTQMarlinConfig) -> None: + def __init__( + self, + quant_config: GPTQMarlinConfig, + moe: FusedMoEConfig, + ) -> None: + super().__init__(moe) self.quant_config = quant_config if self.quant_config.quant_type.size_bits == 4: self.quant_type = scalar_types.uint4b8 @@ -646,6 +651,8 @@ def apply( logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError( "EPLB not supported for `GPTQMarlinMoEMethod` yet.") @@ -662,7 +669,8 @@ def apply( num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) return torch.ops.vllm.fused_marlin_moe( x, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 22fbbab00e91..e0f462b36976 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -12,7 +12,9 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig +from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig +from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( + is_valid_flashinfer_cutlass_fused_moe) from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.model_executor.layers.linear import 
(LinearBase, LinearMethodBase, @@ -22,8 +24,8 @@ QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( - build_flashinfer_fp4_cutlass_moe_kernel, - flashinfer_fp4_cutlass_moe_forward, reorder_w1w3_to_w3w1) + build_flashinfer_fp4_cutlass_moe_prepare_finalize, reorder_w1w3_to_w3w1, + select_nvfp4_gemm_impl) from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( apply_flashinfer_per_tensor_scale_fp8, register_moe_scaling_factors, rotate_flashinfer_fp8_moe_weights, swap_w13_to_w31) @@ -177,7 +179,7 @@ def get_quant_method(self, layer: torch.nn.Module, elif isinstance(layer, Attention): return ModelOptFp8KVCacheMethod(self) elif isinstance(layer, FusedMoE): - return ModelOptFp8MoEMethod(self) + return ModelOptFp8MoEMethod(self, layer.moe_config) return None @@ -273,7 +275,12 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): quant_config: The ModelOpt quantization config. """ - def __init__(self, quant_config: ModelOptFp8Config) -> None: + def __init__( + self, + quant_config: ModelOptFp8Config, + moe: FusedMoEConfig, + ) -> None: + super().__init__(moe) self.quant_config = quant_config from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( cutlass_fp8_supported) @@ -454,6 +461,8 @@ def apply( logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError( "EPLB not supported for `ModelOptFp8MoEMethod` yet.") @@ -484,6 +493,7 @@ def apply( custom_routing_function=custom_routing_function, scoring_func=scoring_func, e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, ) from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_experts) @@ -699,7 +709,7 @@ def get_quant_method(self, layer: torch.nn.Module, elif isinstance(layer, Attention): return ModelOptFp8KVCacheMethod(self) elif isinstance(layer, FusedMoE): - return ModelOptNvFp4FusedMoE(self) + return ModelOptNvFp4FusedMoE(self, layer.moe_config, layer) return None @@ -923,10 +933,17 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): quant_config: NVFP4 Quant Config """ - def __init__(self, quant_config: ModelOptNvFp4Config) -> None: - self.quant_config = quant_config + def __init__( + self, + quant_config: ModelOptNvFp4Config, + moe: FusedMoEConfig, + layer: torch.nn.Module, + ) -> None: from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501 detect_nvfp4_moe_support) + super().__init__(moe) + self.quant_config = quant_config + self.layer = layer _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__) self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported self.allow_flashinfer = _nvfp4.allow_flashinfer @@ -952,27 +969,35 @@ def __init__(self, quant_config: ModelOptNvFp4Config) -> None: self.fused_experts: Optional[ mk.FusedMoEModularKernel] = None # type: ignore[assignment] - def maybe_swap_experts_impl( + def maybe_make_prepare_finalize( self, - moe_parallel_config: FusedMoEParallelConfig, - ): + moe: FusedMoEConfig, + ) -> Optional[mk.FusedMoEPrepareAndFinalize]: if not self.allow_flashinfer: - return - self.fused_experts = build_flashinfer_fp4_cutlass_moe_kernel( - moe_parallel_config) + return super().maybe_make_prepare_finalize(moe) - # This method update self.fused_experts - # only prepare_finalize is not None call 
select_gemm_impl - # so when native cutlass fp4, fused_expert is in fuse_moe.py fused_expert - # when it's not called(TP case), we still have 2 kernels to use. - def select_gemm_impl(self, prepare_finalize, - moe) -> mk.FusedMoEPermuteExpertsUnpermute: - - assert moe is not None and prepare_finalize is not None - from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( # noqa: E501 - select_nvfp4_gemm_impl) + prepare_finalize = build_flashinfer_fp4_cutlass_moe_prepare_finalize( + moe, + a1_gscale=self.layer.w13_input_scale_quant, + ) + logger.debug_once("%s", prepare_finalize.__class__.__name__) + return prepare_finalize - return select_nvfp4_gemm_impl(self.allow_flashinfer, moe, logger) + def select_gemm_impl( + self, + prepare_finalize: mk.FusedMoEPrepareAndFinalize, + moe: FusedMoEConfig, + ) -> mk.FusedMoEPermuteExpertsUnpermute: + experts = select_nvfp4_gemm_impl( + moe, + g1_alphas=self.layer.g1_alphas, + g2_alphas=self.layer.g2_alphas, + a1_gscale=self.layer.w13_input_scale_quant, + a2_gscale=self.layer.w2_input_scale_quant, + allow_flashinfer=self.allow_flashinfer, + ) + logger.debug_once("Using %s", experts.__class__.__name__) + return experts def uses_weight_scale_2_pattern(self) -> bool: """ @@ -1362,7 +1387,8 @@ def apply( num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) if self.use_marlin: return torch.ops.vllm.fused_marlin_moe( @@ -1404,21 +1430,28 @@ def apply( n=layer.w2_weight.shape[2] * 2, k=x.shape[1], e=layer.w13_weight.shape[0], - device=x.device, expert_map=expert_map, apply_router_weight_on_input=apply_router_weight_on_input) else: assert self.allow_flashinfer and \ self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS - out = flashinfer_fp4_cutlass_moe_forward( - self.fused_experts, - layer, - x, - topk_weights, - topk_ids, + + assert is_valid_flashinfer_cutlass_fused_moe( + x, layer.w13_weight, layer.w2_weight), ( + "Flashinfer CUTLASS Fused MoE not applicable!") + + out = self.fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=False, # TODO(shuw): fix later, now output is high prec activation=activation, global_num_experts=global_num_experts, expert_map=expert_map, + w1_scale=layer.w13_blockscale_swizzled, + w2_scale=layer.w2_blockscale_swizzled, apply_router_weight_on_input=apply_router_weight_on_input, ) diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index c5055a02fa3d..364d1ac314d2 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -7,7 +7,7 @@ from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) + FusedMoE, FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) from vllm.model_executor.layers.quantization import QuantizationMethods @@ -160,7 +160,7 @@ def get_quant_method(self, layer: torch.nn.Module, else: raise ValueError("moe_wna16 only support gptq and awq.") elif isinstance(layer, FusedMoE): - return MoeWNA16Method(self) + return MoeWNA16Method(self, 
layer.moe_config) return None @@ -175,7 +175,12 @@ class MoeWNA16Method(FusedMoEMethodBase): quant_config: The MOE WNA16 (W8A16/W4A16) quantization config. """ - def __init__(self, quant_config: MoeWNA16Config): + def __init__( + self, + quant_config: MoeWNA16Config, + moe: FusedMoEConfig, + ): + super().__init__(moe) self.quant_config = quant_config def create_weights(self, layer: torch.nn.Module, num_experts: int, @@ -302,6 +307,8 @@ def apply( logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError( "EPLB not supported for `MoeWNA16Method` yet.") @@ -318,7 +325,8 @@ def apply( num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) weight_bits = self.quant_config.weight_bits has_zp = self.quant_config.has_zp diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 594f5136ecc3..f75d73446c50 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -82,7 +82,7 @@ def get_quant_method(self, layer: torch.nn.Module, class Mxfp4MoEMethod(FusedMoEMethodBase): def __init__(self, moe: FusedMoEConfig): - super().__init__() + super().__init__(moe) self.topk_indices_dtype = None self.moe = moe self.use_marlin = self._should_use_marlin() diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 6f69210d0861..58f56c6381b3 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -7,7 +7,8 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase, +from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, + FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( OCP_MX_BLOCK_SIZE) @@ -25,6 +26,9 @@ class QuarkMoEMethod(FusedMoEMethodBase): + def __init__(self, moe: FusedMoEConfig): + super().__init__(moe) + @staticmethod def get_moe_method( quant_config: "QuarkConfig", # type: ignore # noqa E501 # noqa F821 @@ -42,17 +46,24 @@ def get_moe_method( input_config = layer_quant_config.get("input_tensors") if quant_config._is_fp8_w8a8(weight_config, input_config): - return QuarkW8A8Fp8MoEMethod(weight_config, input_config) + return QuarkW8A8Fp8MoEMethod(weight_config, input_config, + module.moe_config) elif quant_config._is_mx_fp4(weight_config, input_config): - return QuarkW4A4MXFp4MoEMethod(weight_config, input_config) + return QuarkW4A4MXFp4MoEMethod(weight_config, input_config, + module.moe_config) else: raise RuntimeError("Unsupported FusedMoe scheme") class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): - def __init__(self, weight_config: dict[str, Any], input_config: dict[str, - Any]): + def __init__( + self, + weight_config: dict[str, Any], + input_config: dict[str, Any], + moe: FusedMoEConfig, + ): + super().__init__(moe) self.weight_quant = weight_config self.input_quant = input_config @@ -215,6 +226,8 @@ def apply( logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: 
Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError( "EPLB not supported for `QuarkW8A8Fp8MoEMethod` yet.") @@ -231,7 +244,8 @@ def apply( num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) return fused_experts( x, @@ -253,8 +267,13 @@ def apply( class QuarkW4A4MXFp4MoEMethod(QuarkMoEMethod): - def __init__(self, weight_config: dict[str, Any], input_config: dict[str, - Any]): + def __init__( + self, + weight_config: dict[str, Any], + input_config: dict[str, Any], + moe: FusedMoEConfig, + ): + super().__init__(moe) self.weight_quant = weight_config self.input_quant = input_config @@ -369,6 +388,7 @@ def apply( logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None if enable_eplb: raise NotImplementedError( @@ -386,7 +406,8 @@ def apply( num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) out = fused_experts( x, diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py index cceaf9857c40..8bdb50e07b13 100644 --- a/vllm/model_executor/layers/quantization/rtn.py +++ b/vllm/model_executor/layers/quantization/rtn.py @@ -10,7 +10,8 @@ from torch.nn.parameter import Parameter from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase +from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, + FusedMoEMethodBase) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization import QuantizationMethods @@ -76,7 +77,7 @@ def get_quant_method(self, layer: torch.nn.Module, if isinstance(layer, LinearBase): return RTNLinearMethod(self) elif isinstance(layer, FusedMoE): - return RTNMoEMethod(self) + return RTNMoEMethod(self, layer.moe_config) return None @@ -210,7 +211,8 @@ def apply(self, class RTNMoEMethod(FusedMoEMethodBase): - def __init__(self, quant_config: RTNConfig): + def __init__(self, quant_config: RTNConfig, moe: FusedMoEConfig): + super().__init__(moe) self.quant_config = quant_config def create_weights(self, layer: torch.nn.Module, num_experts: int, @@ -289,6 +291,8 @@ def apply( logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError( "EPLB not supported for `RTNMoEMethod` yet.") @@ -305,7 +309,8 @@ def apply( num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) weight_bits = self.quant_config.weight_bits group_size = self.quant_config.group_size diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index 8ef91eeed406..f5d7c57fe2a8 100644 --- 
a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -3,33 +3,30 @@ """Utility helpers for NVFP4 + FlashInfer fused-MoE path""" from __future__ import annotations -from typing import Optional - import torch import vllm.envs as envs import vllm.model_executor.layers.fused_moe.modular_kernel as mk -from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig +from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( - FlashInferExperts, is_valid_flashinfer_cutlass_fused_moe) + FlashInferExperts) from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 FlashInferCutlassMoEPrepareAndFinalize) from vllm.platforms import current_platform - -logger = init_logger(__name__) +from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe __all__ = [ "is_flashinfer_fp4_cutlass_moe_available", "reorder_w1w3_to_w3w1", - "build_flashinfer_fp4_cutlass_moe_kernel", - "flashinfer_fp4_cutlass_moe_forward", + "build_flashinfer_fp4_cutlass_moe_prepare_finalize", ] def is_flashinfer_fp4_cutlass_moe_available() -> bool: """Return ``True`` when FlashInfer CUTLASS NV-FP4 kernels can be used.""" - return (envs.VLLM_USE_FLASHINFER_MOE_FP4 and current_platform.is_cuda() + return (envs.VLLM_USE_FLASHINFER_MOE_FP4 + and has_flashinfer_cutlass_fused_moe() + and current_platform.is_cuda() and current_platform.is_device_capability(100)) @@ -49,105 +46,33 @@ def reorder_w1w3_to_w3w1(weight: torch.Tensor, dim=dim).contiguous()) -def build_flashinfer_fp4_cutlass_moe_kernel( - moe_parallel_config: FusedMoEParallelConfig, ) -> mk.FusedMoEModularKernel: - """Create *and return* a FlashInfer CUTLASS fused-MoE modular kernel""" - experts = FlashInferExperts( - use_nvfp4_w4a4=True, - use_dp=moe_parallel_config.dp_size > 1, - ep_rank=moe_parallel_config.ep_rank, - ep_size=moe_parallel_config.ep_size, - tp_rank=moe_parallel_config.tp_rank, - tp_size=moe_parallel_config.tp_size, - ) - logger.debug_once("FlashInferExperts (util)") - return mk.FusedMoEModularKernel( - FlashInferCutlassMoEPrepareAndFinalize(quant_dtype=torch.uint8), - experts, - ) - - -def flashinfer_fp4_cutlass_moe_forward( - fused_experts: mk.FusedMoEModularKernel, - layer: torch.nn.Module, - x: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - activation: str, - global_num_experts: int, - expert_map: Optional[torch.Tensor], - apply_router_weight_on_input: bool, -) -> torch.Tensor: - """Common forward wrapper for FlashInfer NV-FP4 fused-MoE""" - - assert is_valid_flashinfer_cutlass_fused_moe( - x, layer.w13_weight, - layer.w2_weight), ("FlashInfer CUTLASS fused-MoE not applicable!") - - a1_gscale = layer.w13_input_scale_quant - a2_gscale = layer.w2_input_scale_quant - - extra_expert_args = { - "g1_alphas": layer.g1_alphas, - "g2_alphas": layer.g2_alphas, - # Avoid confusion with a1_scale and a2_scale - # where are batch size related. 
- "a1_gscale": a1_gscale, - "a2_gscale": a2_gscale, - "out_dtype": x.dtype, - } - extra_prepare_args = { - "use_dp": layer.dp_size > 1, - "local_tokens": x.shape[0], - "a1_gscale": a1_gscale, - } - extra_finalize_args = { - "use_dp": layer.dp_size > 1, - "local_tokens": x.shape[0], - } - - return fused_experts( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=False, # TODO(shuw): fix later, now output is high prec - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - w1_scale=layer.w13_blockscale_swizzled, - w2_scale=layer.w2_blockscale_swizzled, - apply_router_weight_on_input=apply_router_weight_on_input, - extra_expert_args=extra_expert_args, - extra_prepare_args=extra_prepare_args, - extra_finalize_args=extra_finalize_args, - ) +def build_flashinfer_fp4_cutlass_moe_prepare_finalize( + moe: FusedMoEConfig, + a1_gscale: torch.Tensor, +) -> mk.FusedMoEPrepareAndFinalize: + """Create a FlashInfer CUTLASS fused-MoE prepare finalize kernel""" + use_dp = moe.moe_parallel_config.dp_size > 1 + return FlashInferCutlassMoEPrepareAndFinalize(use_dp, a1_gscale=a1_gscale) def select_nvfp4_gemm_impl( - allow_flashinfer: bool, - moe, # FusedMoEConfig - logger): + moe: FusedMoEConfig, + g1_alphas: torch.Tensor, + g2_alphas: torch.Tensor, + a1_gscale: torch.Tensor, + a2_gscale: torch.Tensor, + allow_flashinfer: bool, +) -> mk.FusedMoEPermuteExpertsUnpermute: """Return a GEMM *experts* implementation for NV-FP4 fused-MoE layers""" - # lazy import - from vllm.distributed import get_ep_group - - all2all_manager = get_ep_group().device_communicator.all2all_manager - assert all2all_manager is not None - if allow_flashinfer: - flashinfer_backend = envs.VLLM_FLASHINFER_MOE_BACKEND - if flashinfer_backend != "throughput": - raise ValueError( - f"Only throughput backend is supported for FlashInferExperts, " - f"but got {flashinfer_backend}.") - logger.debug_once( - "Initializing FlashInferExperts with throughput backend.") return FlashInferExperts( - use_nvfp4_w4a4=True, - use_dp=moe.moe_parallel_config.dp_size > 1, + g1_alphas=g1_alphas, + g2_alphas=g2_alphas, + a1_gscale=a1_gscale, + a2_gscale=a2_gscale, + out_dtype=moe.in_dtype, + quant_dtype="nvfp4", ep_rank=moe.moe_parallel_config.ep_rank, ep_size=moe.moe_parallel_config.ep_size, tp_rank=moe.moe_parallel_config.tp_rank, From ab544cdf44cd998201dcda4932f018165d8b8f1b Mon Sep 17 00:00:00 2001 From: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com> Date: Fri, 15 Aug 2025 14:47:56 -0400 Subject: [PATCH 054/231] [Model] Granite-4 support loading quantized checkpoint (#22925) Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com> Signed-off-by: Duncan Moss --- vllm/model_executor/models/granitemoehybrid.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 5704496b9a5d..f451e65338b7 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -471,7 +471,10 @@ def _load_expert(n, p, name, shard_id, expert_id): # Mapping different experts' layout: # from HF (input_linear, output_linear, router) # to vLLM (experts_w13({e}.w1, {e}.w2), experts_w3({e}.w3), gate) - if n.endswith('.block_sparse_moe.input_linear.weight'): + # The renaming and parameter loading logic is the same for weight + # and weight_scale tensors so we can reuse them without issues. 
+ if (n.endswith('.block_sparse_moe.input_linear.weight') or + n.endswith('.block_sparse_moe.input_linear.weight_scale')): for e in range(p.size(0)): w1_name = n.replace( '.block_sparse_moe.input_linear.weight', @@ -490,7 +493,8 @@ def _load_expert(n, p, name, shard_id, expert_id): w3_name, shard_id='w3', expert_id=e) - elif n.endswith('.block_sparse_moe.output_linear.weight'): + elif (n.endswith('.block_sparse_moe.output_linear.weight') or + n.endswith('.block_sparse_moe.output_linear.weight_scale')): for e in range(p.size(0)): w2_name = n.replace( '.block_sparse_moe.output_linear.weight', From 47d4185bdfcc3565e7d0f88eaaf96f28adac04dc Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 15 Aug 2025 14:51:50 -0400 Subject: [PATCH 055/231] [Log] Debug Once for Randomizing dummy data for DP Rank (#22860) Signed-off-by: yewentao256 Signed-off-by: Duncan Moss --- vllm/v1/worker/gpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3ea39dc519d8..bef67486d518 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2192,7 +2192,7 @@ def rand_input_ids() -> torch.Tensor: high=self.model_config.get_vocab_size(), dtype=input_ids.dtype) - logger.debug("Randomizing dummy data for DP Rank") + logger.debug_once("Randomizing dummy data for DP Rank") input_ids.copy_(rand_input_ids()[:input_ids.size(0)], non_blocking=True) yield From 612eab5976ba670c8a6245781465199f87c027de Mon Sep 17 00:00:00 2001 From: Zebing Lin Date: Fri, 15 Aug 2025 15:12:12 -0400 Subject: [PATCH 056/231] [Core] direct indexing on self.block_table_np in compute_slot_mapping (#22940) Signed-off-by: linzebing Signed-off-by: Duncan Moss --- vllm/v1/worker/block_table.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index bf38e88f0c2a..5662fc350e19 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -91,8 +91,7 @@ def compute_slot_mapping(self, req_indices: np.ndarray, # block_size. block_table_indices = (req_indices * self.max_num_blocks_per_req + positions // self.block_size) - block_table_cpu = self.get_cpu_tensor() - block_numbers = block_table_cpu.flatten()[block_table_indices].numpy() + block_numbers = self.block_table_np.ravel()[block_table_indices] block_offsets = positions % self.block_size np.add(block_numbers * self.block_size, block_offsets, From 627c147f9491ca51a39e85829006e32a4fa465b3 Mon Sep 17 00:00:00 2001 From: nvjullin Date: Sat, 16 Aug 2025 04:08:37 +0800 Subject: [PATCH 057/231] [Bugfix] Added more env vars to hash (#22449) Signed-off-by: Julien Lin Signed-off-by: Duncan Moss --- vllm/envs.py | 46 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 82084d1fc5ae..861e4c6a1bbe 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1199,14 +1199,6 @@ def compute_hash() -> str: affect the choice of different kernels or attention backends should also be included in the factors list. """ - factors: list[Any] = [] - - # summarize environment variables - def factorize(name: str): - if __getattr__(name): - factors.append(__getattr__(name)) - else: - factors.append("None") # The values of envs may affects the computation graph. # TODO(DefTruth): hash all environment variables? 
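# The next hunk replaces the per-key `factorize` helper with an explicit
# `environment_variables_to_hash` list and a single list comprehension.  A
# standalone sketch of the resulting scheme follows; the two entries below are
# illustrative stand-ins, not the real list added by this patch.

import hashlib
import os

environment_variables = {
    "VLLM_ATTENTION_BACKEND": lambda: os.getenv("VLLM_ATTENTION_BACKEND"),
    "VLLM_USE_DEEP_GEMM": lambda: os.getenv("VLLM_USE_DEEP_GEMM", "0"),
}
environment_variables_to_hash = list(environment_variables)


def compute_hash() -> str:
    # Every hashed key must exist in `environment_variables`; the patch
    # enforces this with an assert so the two lists cannot drift apart.
    for key in environment_variables_to_hash:
        assert key in environment_variables

    # The getter is called for each key, so unset variables contribute their
    # defaults and any change to a listed value changes the digest.
    factors = [
        environment_variables[key]() for key in environment_variables_to_hash
    ]
    return hashlib.md5(str(factors).encode(),
                       usedforsecurity=False).hexdigest()


print(compute_hash())  # digest changes after e.g. `export VLLM_USE_DEEP_GEMM=1`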
@@ -1221,11 +1213,45 @@ def factorize(name: str): "VLLM_DP_SIZE", "VLLM_USE_STANDALONE_COMPILE", "VLLM_FUSED_MOE_CHUNK_SIZE", + "VLLM_FLASHINFER_MOE_BACKEND", + "VLLM_V1_USE_PREFILL_DECODE_ATTENTION", + "VLLM_USE_AITER_UNIFIED_ATTENTION", + "VLLM_ATTENTION_BACKEND", + "VLLM_USE_FLASHINFER_SAMPLER", + "VLLM_FLASHINFER_FORCE_TENSOR_CORES", + "VLLM_DISABLED_KERNELS", + "VLLM_USE_DEEP_GEMM", "VLLM_USE_TRTLLM_FP4_GEMM", + "VLLM_USE_FLASHINFER_MOE_FP8", + "VLLM_USE_FLASHINFER_MOE_FP4", + "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", + "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", + "VLLM_USE_CUDNN_PREFILL", + "VLLM_USE_TRTLLM_ATTENTION", + "VLLM_ROCM_USE_AITER", + "VLLM_ROCM_USE_AITER_PAGED_ATTN", + "VLLM_ROCM_USE_AITER_LINEAR", + "VLLM_ROCM_USE_AITER_MOE", + "VLLM_ROCM_USE_AITER_RMSNORM", + "VLLM_ROCM_USE_AITER_MLA", + "VLLM_ROCM_USE_AITER_MHA", + "VLLM_ROCM_USE_SKINNY_GEMM", + "VLLM_ROCM_FP8_PADDING", + "VLLM_ROCM_MOE_PADDING", + "VLLM_ROCM_CUSTOM_PAGED_ATTN", + "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", + "VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16", + "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", ] for key in environment_variables_to_hash: - if key in environment_variables: - factorize(key) + # if this goes out of sync with environment_variables, + # it's not a user error, it's a bug + assert key in environment_variables, \ + "Please update environment_variables_to_hash in envs.py" + + factors = [ + environment_variables[key]() for key in environment_variables_to_hash + ] hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() From a22c39fcaf3903c488cb129de2354c2c2cb359fb Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 15 Aug 2025 16:54:20 -0400 Subject: [PATCH 058/231] Use regex in convert-results-json-to-markdown.py (#22989) Signed-off-by: Michael Goin Signed-off-by: Duncan Moss --- .../scripts/convert-results-json-to-markdown.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 496ee6083abd..77047636bb95 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -4,7 +4,6 @@ import argparse import json import os -import re import shlex from importlib import util from pathlib import Path @@ -12,6 +11,7 @@ import pandas as pd import psutil +import regex as re from tabulate import tabulate # latency results and the keys that will be printed into markdown From a57f6d21800e8a95fd87a3e0dc244a74cfae97e7 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 15 Aug 2025 16:56:31 -0400 Subject: [PATCH 059/231] [CI] Speed up Whisper tests by reusing server (#22859) Signed-off-by: mgoin Signed-off-by: Duncan Moss --- .../openai/test_transcription_validation.py | 320 ++++++++---------- .../openai/test_translation_validation.py | 234 +++++++------ 2 files changed, 263 insertions(+), 291 deletions(-) diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index e103bd206b54..93239f41a4ae 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -4,19 +4,20 @@ # imports for guided decoding tests import io import json -from unittest.mock import patch import librosa import numpy as np import openai import pytest +import pytest_asyncio import soundfile as sf -from 
openai._base_client import AsyncAPIClient from vllm.assets.audio import AudioAsset from ...utils import RemoteOpenAIServer +MODEL_NAME = "openai/whisper-large-v3-turbo" +SERVER_ARGS = ["--enforce-eager"] MISTRAL_FORMAT_ARGS = [ "--tokenizer_mode", "mistral", "--config_format", "mistral", "--load_format", "mistral" @@ -37,6 +38,18 @@ def winning_call(): yield f +@pytest.fixture(scope="module") +def server(): + with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", @@ -61,25 +74,33 @@ async def test_basic_audio(mary_had_lamb, model_name): @pytest.mark.asyncio -async def test_bad_requests(mary_had_lamb): - model_name = "openai/whisper-small" - server_args = ["--enforce-eager"] - with RemoteOpenAIServer(model_name, server_args) as remote_server: +async def test_non_asr_model(winning_call): + # text to text model + model_name = "JackFram/llama-68m" + with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server: client = remote_server.get_async_client() - - # invalid language - with pytest.raises(openai.BadRequestError): - await client.audio.transcriptions.create(model=model_name, - file=mary_had_lamb, - language="hh", - temperature=0.0) + res = await client.audio.transcriptions.create(model=model_name, + file=winning_call, + language="en", + temperature=0.0) + err = res.error + assert err["code"] == 400 and not res.text + assert err[ + "message"] == "The model does not support Transcriptions API" @pytest.mark.asyncio -@pytest.mark.parametrize("model_name", ["openai/whisper-large-v3-turbo"]) -async def test_long_audio_request(mary_had_lamb, model_name): - server_args = ["--enforce-eager"] +async def test_bad_requests(mary_had_lamb, client): + # invalid language + with pytest.raises(openai.BadRequestError): + await client.audio.transcriptions.create(model=MODEL_NAME, + file=mary_had_lamb, + language="hh", + temperature=0.0) + +@pytest.mark.asyncio +async def test_long_audio_request(mary_had_lamb, client): mary_had_lamb.seek(0) audio, sr = librosa.load(mary_had_lamb) # Add small silence after each audio for repeatability in the split process @@ -89,188 +110,129 @@ async def test_long_audio_request(mary_had_lamb, model_name): buffer = io.BytesIO() sf.write(buffer, repeated_audio, sr, format='WAV') buffer.seek(0) - with RemoteOpenAIServer(model_name, server_args) as remote_server: - client = remote_server.get_async_client() - transcription = await client.audio.transcriptions.create( - model=model_name, - file=buffer, - language="en", - response_format="text", - temperature=0.0) - out = json.loads(transcription)['text'] - counts = out.count("Mary had a little lamb") - assert counts == 10, counts - - -@pytest.mark.asyncio -async def test_non_asr_model(winning_call): - # text to text model - model_name = "JackFram/llama-68m" - server_args = ["--enforce-eager"] - with RemoteOpenAIServer(model_name, server_args) as remote_server: - client = remote_server.get_async_client() - res = await client.audio.transcriptions.create(model=model_name, - file=winning_call, - language="en", - temperature=0.0) - err = res.error - assert err["code"] == 400 and not res.text - assert err[ - "message"] == "The model does not support Transcriptions API" + transcription = await client.audio.transcriptions.create( + model=MODEL_NAME, + file=buffer, + language="en", + response_format="text", + 
temperature=0.0) + out = json.loads(transcription)['text'] + counts = out.count("Mary had a little lamb") + assert counts == 10, counts @pytest.mark.asyncio -async def test_completion_endpoints(): +async def test_completion_endpoints(client): # text to text model - model_name = "openai/whisper-small" - server_args = ["--enforce-eager"] - with RemoteOpenAIServer(model_name, server_args) as remote_server: - client = remote_server.get_async_client() - res = await client.chat.completions.create( - model=model_name, - messages=[{ - "role": "system", - "content": "You are a helpful assistant." - }]) - err = res.error - assert err["code"] == 400 - assert err[ - "message"] == "The model does not support Chat Completions API" - - res = await client.completions.create(model=model_name, prompt="Hello") - err = res.error - assert err["code"] == 400 - assert err["message"] == "The model does not support Completions API" + res = await client.chat.completions.create( + model=MODEL_NAME, + messages=[{ + "role": "system", + "content": "You are a helpful assistant." + }]) + err = res.error + assert err["code"] == 400 + assert err["message"] == "The model does not support Chat Completions API" + + res = await client.completions.create(model=MODEL_NAME, prompt="Hello") + err = res.error + assert err["code"] == 400 + assert err["message"] == "The model does not support Completions API" @pytest.mark.asyncio -async def test_streaming_response(winning_call): - model_name = "openai/whisper-small" - server_args = ["--enforce-eager"] +async def test_streaming_response(winning_call, client): transcription = "" - with RemoteOpenAIServer(model_name, server_args) as remote_server: - client = remote_server.get_async_client() - res_no_stream = await client.audio.transcriptions.create( - model=model_name, - file=winning_call, - response_format="json", - language="en", - temperature=0.0) - # Unfortunately this only works when the openai client is patched - # to use streaming mode, not exposed in the transcription api. 
- original_post = AsyncAPIClient.post - - async def post_with_stream(*args, **kwargs): - kwargs['stream'] = True - return await original_post(*args, **kwargs) - - with patch.object(AsyncAPIClient, "post", new=post_with_stream): - client = remote_server.get_async_client() - res = await client.audio.transcriptions.create( - model=model_name, - file=winning_call, - language="en", - temperature=0.0, - extra_body=dict(stream=True), - timeout=30) - # Reconstruct from chunks and validate - async for chunk in res: - # just a chunk - text = chunk.choices[0]['delta']['content'] - transcription += text - - assert transcription == res_no_stream.text + res_no_stream = await client.audio.transcriptions.create( + model=MODEL_NAME, + file=winning_call, + response_format="json", + language="en", + temperature=0.0) + res = await client.audio.transcriptions.create(model=MODEL_NAME, + file=winning_call, + language="en", + temperature=0.0, + stream=True, + timeout=30) + # Reconstruct from chunks and validate + async for chunk in res: + text = chunk.choices[0]['delta']['content'] + transcription += text + + assert transcription == res_no_stream.text @pytest.mark.asyncio -async def test_stream_options(winning_call): - model_name = "openai/whisper-small" - server_args = ["--enforce-eager"] - with RemoteOpenAIServer(model_name, server_args) as remote_server: - original_post = AsyncAPIClient.post - - async def post_with_stream(*args, **kwargs): - kwargs['stream'] = True - return await original_post(*args, **kwargs) - - with patch.object(AsyncAPIClient, "post", new=post_with_stream): - client = remote_server.get_async_client() - res = await client.audio.transcriptions.create( - model=model_name, - file=winning_call, - language="en", - temperature=0.0, - extra_body=dict(stream=True, - stream_include_usage=True, - stream_continuous_usage_stats=True), - timeout=30) - final = False - continuous = True - async for chunk in res: - if not len(chunk.choices): - # final usage sent - final = True - else: - continuous = continuous and hasattr(chunk, 'usage') - assert final and continuous +async def test_stream_options(winning_call, client): + res = await client.audio.transcriptions.create( + model=MODEL_NAME, + file=winning_call, + language="en", + temperature=0.0, + stream=True, + extra_body=dict(stream_include_usage=True, + stream_continuous_usage_stats=True), + timeout=30) + final = False + continuous = True + async for chunk in res: + if not len(chunk.choices): + # final usage sent + final = True + else: + continuous = continuous and hasattr(chunk, 'usage') + assert final and continuous @pytest.mark.asyncio -async def test_sampling_params(mary_had_lamb): +async def test_sampling_params(mary_had_lamb, client): """ Compare sampling with params and greedy sampling to assert results are different when extreme sampling parameters values are picked. 
""" - model_name = "openai/whisper-small" - server_args = ["--enforce-eager"] - with RemoteOpenAIServer(model_name, server_args) as remote_server: - client = remote_server.get_async_client() - transcription = await client.audio.transcriptions.create( - model=model_name, - file=mary_had_lamb, - language="en", - temperature=0.8, - extra_body=dict(seed=42, - repetition_penalty=1.9, - top_k=12, - top_p=0.4, - min_p=0.5, - frequency_penalty=1.8, - presence_penalty=2.0)) - - greedy_transcription = await client.audio.transcriptions.create( - model=model_name, - file=mary_had_lamb, - language="en", - temperature=0.0, - extra_body=dict(seed=42)) - - assert greedy_transcription.text != transcription.text + transcription = await client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + temperature=0.8, + extra_body=dict(seed=42, + repetition_penalty=1.9, + top_k=12, + top_p=0.4, + min_p=0.5, + frequency_penalty=1.8, + presence_penalty=2.0)) + + greedy_transcription = await client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + temperature=0.0, + extra_body=dict(seed=42)) + + assert greedy_transcription.text != transcription.text @pytest.mark.asyncio -async def test_audio_prompt(mary_had_lamb): - model_name = "openai/whisper-large-v3-turbo" - server_args = ["--enforce-eager"] +async def test_audio_prompt(mary_had_lamb, client): prompt = "This is a speech, recorded in a phonograph." - with RemoteOpenAIServer(model_name, server_args) as remote_server: - #Prompts should not omit the part of original prompt while transcribing. - prefix = "The first words I spoke in the original phonograph" - client = remote_server.get_async_client() - transcription = await client.audio.transcriptions.create( - model=model_name, - file=mary_had_lamb, - language="en", - response_format="text", - temperature=0.0) - out = json.loads(transcription)['text'] - assert prefix in out - transcription_wprompt = await client.audio.transcriptions.create( - model=model_name, - file=mary_had_lamb, - language="en", - response_format="text", - prompt=prompt, - temperature=0.0) - out_prompt = json.loads(transcription_wprompt)['text'] - assert prefix in out_prompt + #Prompts should not omit the part of original prompt while transcribing. 
+ prefix = "The first words I spoke in the original phonograph" + transcription = await client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + response_format="text", + temperature=0.0) + out = json.loads(transcription)['text'] + assert prefix in out + transcription_wprompt = await client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + response_format="text", + prompt=prompt, + temperature=0.0) + out_prompt = json.loads(transcription_wprompt)['text'] + assert prefix in out_prompt diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py index bfa9bdef1c00..f4f5c66f2dee 100644 --- a/tests/entrypoints/openai/test_translation_validation.py +++ b/tests/entrypoints/openai/test_translation_validation.py @@ -4,18 +4,21 @@ import io # imports for guided decoding tests import json -from unittest.mock import patch +import httpx import librosa import numpy as np import pytest +import pytest_asyncio import soundfile as sf -from openai._base_client import AsyncAPIClient from vllm.assets.audio import AudioAsset from ...utils import RemoteOpenAIServer +MODEL_NAME = "openai/whisper-small" +SERVER_ARGS = ["--enforce-eager"] + @pytest.fixture def foscolo(): @@ -25,50 +28,23 @@ def foscolo(): yield f -# NOTE: (NickLucche) the large-v3-turbo model was not trained on translation! -@pytest.mark.asyncio -async def test_basic_audio(foscolo): - model_name = "openai/whisper-small" - server_args = ["--enforce-eager"] - with RemoteOpenAIServer(model_name, server_args) as remote_server: - client = remote_server.get_async_client() - translation = await client.audio.translations.create( - model=model_name, - file=foscolo, - response_format="text", - # TODO remove once language detection is implemented - extra_body=dict(language="it"), - temperature=0.0) - out = json.loads(translation)['text'].strip().lower() - assert "greek sea" in out +@pytest.fixture(scope="module") +def server(): + with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server: + yield remote_server -@pytest.mark.asyncio -async def test_audio_prompt(foscolo): - model_name = "openai/whisper-small" - server_args = ["--enforce-eager"] - # Condition whisper on starting text - prompt = "Nor have I ever" - with RemoteOpenAIServer(model_name, server_args) as remote_server: - client = remote_server.get_async_client() - transcription = await client.audio.translations.create( - model=model_name, - file=foscolo, - prompt=prompt, - extra_body=dict(language="it"), - response_format="text", - temperature=0.0) - out = json.loads(transcription)['text'] - assert "Nor will I ever touch the sacred" not in out - assert prompt not in out +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client @pytest.mark.asyncio async def test_non_asr_model(foscolo): # text to text model model_name = "JackFram/llama-68m" - server_args = ["--enforce-eager"] - with RemoteOpenAIServer(model_name, server_args) as remote_server: + with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server: client = remote_server.get_async_client() res = await client.audio.translations.create(model=model_name, file=foscolo, @@ -78,81 +54,117 @@ async def test_non_asr_model(foscolo): assert err["message"] == "The model does not support Translations API" +# NOTE: (NickLucche) the large-v3-turbo model was not trained on translation! 
+@pytest.mark.asyncio +async def test_basic_audio(foscolo, client): + translation = await client.audio.translations.create( + model=MODEL_NAME, + file=foscolo, + response_format="text", + # TODO remove once language detection is implemented + extra_body=dict(language="it"), + temperature=0.0) + out = json.loads(translation)['text'].strip().lower() + assert "greek sea" in out + + +@pytest.mark.asyncio +async def test_audio_prompt(foscolo, client): + # Condition whisper on starting text + prompt = "Nor have I ever" + transcription = await client.audio.translations.create( + model=MODEL_NAME, + file=foscolo, + prompt=prompt, + extra_body=dict(language="it"), + response_format="text", + temperature=0.0) + out = json.loads(transcription)['text'] + assert "Nor will I ever touch the sacred" not in out + assert prompt not in out + + @pytest.mark.asyncio -async def test_streaming_response(foscolo): - model_name = "openai/whisper-small" - server_args = ["--enforce-eager"] +async def test_streaming_response(foscolo, client, server): translation = "" - with RemoteOpenAIServer(model_name, server_args) as remote_server: - client = remote_server.get_async_client() - res_no_stream = await client.audio.translations.create( - model=model_name, - file=foscolo, - response_format="json", - extra_body=dict(language="it"), - temperature=0.0) - # Unfortunately this only works when the openai client is patched - # to use streaming mode, not exposed in the translation api. - original_post = AsyncAPIClient.post - - async def post_with_stream(*args, **kwargs): - kwargs['stream'] = True - return await original_post(*args, **kwargs) - - with patch.object(AsyncAPIClient, "post", new=post_with_stream): - client = remote_server.get_async_client() - res = await client.audio.translations.create(model=model_name, - file=foscolo, - temperature=0.0, - extra_body=dict( - stream=True, - language="it")) - # Reconstruct from chunks and validate - async for chunk in res: - # just a chunk - text = chunk.choices[0]['delta']['content'] - translation += text - - assert translation == res_no_stream.text + res_no_stream = await client.audio.translations.create( + model=MODEL_NAME, + file=foscolo, + response_format="json", + extra_body=dict(language="it"), + temperature=0.0) + # Stream via HTTPX since OpenAI translation client doesn't expose streaming + url = server.url_for("v1/audio/translations") + headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"} + data = { + "model": MODEL_NAME, + "language": "it", + "stream": True, + "temperature": 0.0, + } + foscolo.seek(0) + async with httpx.AsyncClient() as http_client: + files = {"file": foscolo} + async with http_client.stream("POST", + url, + headers=headers, + data=data, + files=files) as response: + async for line in response.aiter_lines(): + if not line: + continue + if line.startswith("data: "): + line = line[len("data: "):] + if line.strip() == "[DONE]": + break + chunk = json.loads(line) + text = chunk["choices"][0].get("delta", {}).get("content") + translation += text or "" + + assert translation == res_no_stream.text @pytest.mark.asyncio -async def test_stream_options(foscolo): - model_name = "openai/whisper-small" - server_args = ["--enforce-eager"] - with RemoteOpenAIServer(model_name, server_args) as remote_server: - original_post = AsyncAPIClient.post - - async def post_with_stream(*args, **kwargs): - kwargs['stream'] = True - return await original_post(*args, **kwargs) - - with patch.object(AsyncAPIClient, "post", new=post_with_stream): - client = 
remote_server.get_async_client() - res = await client.audio.translations.create( - model=model_name, - file=foscolo, - temperature=0.0, - extra_body=dict(language="it", - stream=True, - stream_include_usage=True, - stream_continuous_usage_stats=True)) - final = False - continuous = True - async for chunk in res: - if not len(chunk.choices): +async def test_stream_options(foscolo, client, server): + url = server.url_for("v1/audio/translations") + headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"} + data = { + "model": MODEL_NAME, + "language": "it", + "stream": True, + "stream_include_usage": True, + "stream_continuous_usage_stats": True, + "temperature": 0.0, + } + foscolo.seek(0) + final = False + continuous = True + async with httpx.AsyncClient() as http_client: + files = {"file": foscolo} + async with http_client.stream("POST", + url, + headers=headers, + data=data, + files=files) as response: + async for line in response.aiter_lines(): + if not line: + continue + if line.startswith("data: "): + line = line[len("data: "):] + if line.strip() == "[DONE]": + break + chunk = json.loads(line) + choices = chunk.get("choices", []) + if not choices: # final usage sent final = True else: - continuous = continuous and hasattr(chunk, 'usage') - assert final and continuous + continuous = continuous and ("usage" in chunk) + assert final and continuous @pytest.mark.asyncio -async def test_long_audio_request(foscolo): - model_name = "openai/whisper-small" - server_args = ["--enforce-eager"] - +async def test_long_audio_request(foscolo, client): foscolo.seek(0) audio, sr = librosa.load(foscolo) repeated_audio = np.tile(audio, 2) @@ -160,13 +172,11 @@ async def test_long_audio_request(foscolo): buffer = io.BytesIO() sf.write(buffer, repeated_audio, sr, format='WAV') buffer.seek(0) - with RemoteOpenAIServer(model_name, server_args) as remote_server: - client = remote_server.get_async_client() - translation = await client.audio.translations.create( - model=model_name, - file=buffer, - extra_body=dict(language="it"), - response_format="text", - temperature=0.0) - out = json.loads(translation)['text'].strip().lower() - assert out.count("greek sea") == 2 + translation = await client.audio.translations.create( + model=MODEL_NAME, + file=buffer, + extra_body=dict(language="it"), + response_format="text", + temperature=0.0) + out = json.loads(translation)['text'].strip().lower() + assert out.count("greek sea") == 2 From eec4da9f8c927ecb12a35c6ece635af1d8a43396 Mon Sep 17 00:00:00 2001 From: shixianc <49539556+shixianc@users.noreply.github.com> Date: Fri, 15 Aug 2025 14:02:12 -0700 Subject: [PATCH 060/231] [Fix] enable swap_ab for pplx problem size computation (#22991) Signed-off-by: Shixian Cui Co-authored-by: Shixian Cui Signed-off-by: Duncan Moss --- .../quantization/cutlass_w8a8/moe/moe_data.cu | 45 +++++++++++++------ 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu index 857cca1e82df..100f48508444 100644 --- a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu +++ b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu @@ -161,6 +161,7 @@ void get_cutlass_moe_mm_data_caller( topk_ids.size(1)); } +template __global__ void compute_pplx_data(int32_t* expert_offsets, int32_t* problem_sizes1, int32_t* problem_sizes2, @@ -168,14 +169,23 @@ __global__ void compute_pplx_data(int32_t* expert_offsets, const int padded_m, const int n, const int k) { int expert_idx = threadIdx.x; - expert_offsets[expert_idx] = 
expert_idx * padded_m; - problem_sizes1[expert_idx * 3] = expert_num_tokens[expert_idx]; - problem_sizes1[expert_idx * 3 + 1] = 2 * n; - problem_sizes1[expert_idx * 3 + 2] = k; - problem_sizes2[expert_idx * 3] = expert_num_tokens[expert_idx]; - problem_sizes2[expert_idx * 3 + 1] = k; - problem_sizes2[expert_idx * 3 + 2] = n; + + if constexpr (!SWAP_AB) { + problem_sizes1[expert_idx * 3] = expert_num_tokens[expert_idx]; + problem_sizes1[expert_idx * 3 + 1] = 2 * n; + problem_sizes1[expert_idx * 3 + 2] = k; + problem_sizes2[expert_idx * 3] = expert_num_tokens[expert_idx]; + problem_sizes2[expert_idx * 3 + 1] = k; + problem_sizes2[expert_idx * 3 + 2] = n; + } else { + problem_sizes1[expert_idx * 3] = 2 * n; + problem_sizes1[expert_idx * 3 + 1] = expert_num_tokens[expert_idx]; + problem_sizes1[expert_idx * 3 + 2] = k; + problem_sizes2[expert_idx * 3] = k; + problem_sizes2[expert_idx * 3 + 1] = expert_num_tokens[expert_idx]; + problem_sizes2[expert_idx * 3 + 2] = n; + } } void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets, @@ -187,10 +197,19 @@ void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets, const int64_t n, const int64_t k) { auto stream = at::cuda::getCurrentCUDAStream(expert_offsets.device().index()); - compute_pplx_data<<<1, num_local_experts, 0, stream>>>( - static_cast(expert_offsets.data_ptr()), - static_cast(problem_sizes1.data_ptr()), - static_cast(problem_sizes2.data_ptr()), - static_cast(expert_num_tokens.data_ptr()), padded_m, n, - k); + if (num_local_experts * padded_m > SWAP_AB_THRESHOLD) { + compute_pplx_data<<<1, num_local_experts, 0, stream>>>( + static_cast(expert_offsets.data_ptr()), + static_cast(problem_sizes1.data_ptr()), + static_cast(problem_sizes2.data_ptr()), + static_cast(expert_num_tokens.data_ptr()), padded_m, n, + k); + } else { + compute_pplx_data<<<1, num_local_experts, 0, stream>>>( + static_cast(expert_offsets.data_ptr()), + static_cast(problem_sizes1.data_ptr()), + static_cast(problem_sizes2.data_ptr()), + static_cast(expert_num_tokens.data_ptr()), padded_m, n, + k); + } } \ No newline at end of file From af8ffba5f7de1099259fc60ff4ffdd2a89316d73 Mon Sep 17 00:00:00 2001 From: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com> Date: Fri, 15 Aug 2025 14:09:23 -0700 Subject: [PATCH 061/231] Add PrefixRepetitionRandomDataset to `vllm bench serve` datasets (#20638) Signed-off-by: Seiji Eicher Signed-off-by: Duncan Moss --- vllm/benchmarks/datasets.py | 133 +++++++++++++++++++++++++++++++++++- 1 file changed, 131 insertions(+), 2 deletions(-) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 5299dcf54b39..72d7ce49b8e1 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -26,6 +26,7 @@ import numpy as np from PIL import Image from transformers import PreTrainedTokenizerBase +from typing_extensions import deprecated from vllm.lora.request import LoRARequest from vllm.lora.utils import get_adapter_absolute_path @@ -486,7 +487,10 @@ def add_dataset_parser(parser: FlexibleArgumentParser): "--dataset-name", type=str, default="random", - choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"], + choices=[ + "sharegpt", "burstgpt", "sonnet", "random", "hf", "custom", + "prefix_repetition" + ], help="Name of the dataset to benchmark on.", ) parser.add_argument( @@ -603,6 +607,37 @@ def add_dataset_parser(parser: FlexibleArgumentParser): "from the sampled HF dataset.", ) + prefix_repetition_group = parser.add_argument_group( + "prefix repetition dataset 
options") + prefix_repetition_group.add_argument( + "--prefix-repetition-prefix-len", + type=int, + default=256, + help="Number of prefix tokens per request, used only for prefix " + "repetition dataset.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-suffix-len", + type=int, + default=256, + help="Number of suffix tokens per request, used only for prefix " + "repetition dataset. Total input length is prefix_len + suffix_len.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-num-prefixes", + type=int, + default=10, + help="Number of prefixes to generate, used only for prefix repetition " + "dataset. Prompts per prefix is num_requests // num_prefixes.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-output-len", + type=int, + default=128, + help="Number of output tokens per request, used only for prefix " + "repetition dataset.", + ) + def get_samples(args, tokenizer) -> list[SampleRequest]: if args.dataset_name == "custom": @@ -721,6 +756,17 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: output_len=args.random_output_len, range_ratio=args.random_range_ratio, ), + "prefix_repetition": + lambda: PrefixRepetitionRandomDataset( + random_seed=args.seed, dataset_path=args.dataset_path + ).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + prefix_len=args.prefix_repetition_prefix_len, + suffix_len=args.prefix_repetition_suffix_len, + num_prefixes=args.prefix_repetition_num_prefixes, + output_len=args.prefix_repetition_output_len, + ), } try: @@ -828,7 +874,9 @@ def sample( # Sonnet Dataset Implementation # ----------------------------------------------------------------------------- - +@deprecated( + "SonnetDataset is deprecated and will be removed in a future version.", +) class SonnetDataset(BenchmarkDataset): """ Simplified implementation of the Sonnet dataset. Loads poem lines from a @@ -1537,3 +1585,84 @@ def sample( self.maybe_oversample_requests(sampled_requests, num_requests) return sampled_requests + + +# ----------------------------------------------------------------------------- +# Prefix Repetition Dataset Implementation +# ----------------------------------------------------------------------------- + + +class PrefixRepetitionRandomDataset(BenchmarkDataset): + # Default values copied from benchmark_serving.py for the repeated prefix + # dataset. 
+ DEFAULT_PREFIX_LEN = 256 + DEFAULT_SUFFIX_LEN = 256 + DEFAULT_NUM_PREFIXES = 10 + DEFAULT_OUTPUT_LEN = 128 + + def __init__( + self, + **kwargs, + ) -> None: + super().__init__(**kwargs) + random.seed(self.random_seed) + np.random.seed(self.random_seed) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + prefix_len: int = DEFAULT_PREFIX_LEN, + suffix_len: int = DEFAULT_SUFFIX_LEN, + num_prefixes: int = DEFAULT_NUM_PREFIXES, + output_len: int = DEFAULT_OUTPUT_LEN, + **kwargs, + ) -> list[SampleRequest]: + vocab_size = tokenizer.vocab_size + prompts_per_prefix = num_requests // num_prefixes + if prompts_per_prefix == 0: + raise ValueError( + f"num_requests ({num_requests}) must be greater than or equal " + f"to num_prefixes ({num_prefixes})" + ) + + def _generate_exact_length_tokens(target_length: int) -> list[int]: + """Generate tokens that decode and re-encode to exactly + target_length.""" + # Generate random tokens + tokens = np.random.randint( + 0, vocab_size, size=target_length).tolist() + text = tokenizer.decode(tokens) + re_encoded = tokenizer.encode(text, add_special_tokens=False) + + if len(re_encoded) == target_length: + return re_encoded + elif len(re_encoded) < target_length: + # Recursively generate additional consistent tokens + needed = target_length - len(re_encoded) + extra_tokens = _generate_exact_length_tokens(needed) + return re_encoded + extra_tokens + else: + # Truncate to target length + return re_encoded[:target_length] + + requests = [] + for _ in range(num_prefixes): + prefix_tokens = _generate_exact_length_tokens(prefix_len) + + for _ in range(prompts_per_prefix): + suffix_tokens = _generate_exact_length_tokens(suffix_len) + + combined_tokens = prefix_tokens + suffix_tokens + prompt = tokenizer.decode(combined_tokens) + prompt_len = len(combined_tokens) + requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + ) + ) + + random.shuffle(requests) + return requests From 4ab6bd4cc584259a4d9a7ea60e094979fcdb1cc1 Mon Sep 17 00:00:00 2001 From: eigen <52445717+yyihuang@users.noreply.github.com> Date: Fri, 15 Aug 2025 17:38:10 -0400 Subject: [PATCH 062/231] minor: zero workspace buffer init for flashinfer trtllm-gen attn (#22603) Signed-off-by: Duncan Moss --- tests/kernels/attention/test_flashinfer_trtllm_attention.py | 4 ++-- vllm/attention/backends/flashinfer.py | 2 +- vllm/v1/attention/backends/flashinfer.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py index 53e225ea3ea6..4b84e6a00ece 100644 --- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py +++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py @@ -113,7 +113,7 @@ def test_flashinfer_trtllm_decode_with_baseline( kv_indices = torch.tensor(kv_indices, dtype=torch.int32) kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) - workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) + workspace_buffer = torch.zeros(128 * 1024 * 1024, dtype=torch.int8) wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper( workspace_buffer, kv_layout, @@ -247,7 +247,7 @@ def test_flashinfer_trtllm_prefill_with_baseline( kv_indices = torch.tensor(kv_indices, dtype=torch.int32) kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) - workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) + workspace_buffer = torch.zeros(128 
* 1024 * 1024, dtype=torch.int8) wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( workspace_buffer, kv_layout) wrapper.plan(q_indptr, diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 208cacec38eb..a85ec2463283 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -203,7 +203,7 @@ def __init__(self, runner): def _get_workspace_buffer(self): if self._workspace_buffer is None: - self._workspace_buffer = torch.empty( + self._workspace_buffer = torch.zeros( FLASHINFER_WORKSPACE_BUFFER_SIZE, dtype=torch.uint8, device=self.runner.device) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 02decb171fc0..eac3f33e1509 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -252,7 +252,7 @@ def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], def _get_workspace_buffer(self): if self._workspace_buffer is None: - self._workspace_buffer = torch.empty( + self._workspace_buffer = torch.zeros( FLASHINFER_WORKSPACE_BUFFER_SIZE, dtype=torch.uint8, device=self.device) From 9b6683ffb5d6c7eb05dab2c8eb68b6ddf7fbaee1 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 15 Aug 2025 17:41:07 -0400 Subject: [PATCH 063/231] [Attention] FA3 Attention Sinks Perf Boost (#22478) Signed-off-by: Lucas Wilkinson Signed-off-by: Duncan Moss --- cmake/external_projects/vllm_flash_attn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index d24d8e8e5e79..4e2a0e4533e6 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 93cf5a08f421a3efd0c4a7e005ef8f742b578ce0 + GIT_TAG 2d3b7508f67ad976f781e2042ace676419dd78dd GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From 8d808ce83e9575f9f77938a7ce223985197cb3c4 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Sat, 16 Aug 2025 00:55:26 +0200 Subject: [PATCH 064/231] [BugFix] Fix regression caused by mamba state dtype PR (#22998) Signed-off-by: Thomas Parnell Signed-off-by: Duncan Moss --- vllm/model_executor/models/phi4flash.py | 8 ++++++-- vllm/model_executor/models/plamo2.py | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/phi4flash.py b/vllm/model_executor/models/phi4flash.py index 493a4192d35a..fcdfcb7bc160 100644 --- a/vllm/model_executor/models/phi4flash.py +++ b/vllm/model_executor/models/phi4flash.py @@ -650,8 +650,12 @@ def forward( num_mamba_layers = self.config.num_hidden_layers \ // 2 // self.config.mb_per_layer + 1 self.mamba_cache = MambaCacheManager( - self.vllm_config, self.lm_head.weight.dtype, num_mamba_layers, - *self._get_mamba_cache_shape()) + self.vllm_config, + num_mamba_layers, + *self._get_mamba_cache_shape(), + self.lm_head.weight.dtype, + self.lm_head.weight.dtype, + ) mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) attn_metadata = get_forward_context().attn_metadata diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 8b1df66f0280..e5034b536266 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ 
-767,8 +767,12 @@ def forward(self, self.vllm_config.parallel_config, LayerBlockType.mamba) self.mamba_cache = MambaCacheManager( - self.vllm_config, self.lm_head.weight.dtype, num_mamba_layers, - *self._get_mamba_cache_shape()) + self.vllm_config, + num_mamba_layers, + *self._get_mamba_cache_shape(), + self.lm_head.weight.dtype, + self.lm_head.weight.dtype, + ) mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) From 456c8cf9ccbac06a7c995e9f658eacfdd758f302 Mon Sep 17 00:00:00 2001 From: Eli Uriegas <1700823+seemethere@users.noreply.github.com> Date: Fri, 15 Aug 2025 16:16:23 -0700 Subject: [PATCH 065/231] ci: Add CUDA + arm64 release builds (#21201) Signed-off-by: Eli Uriegas Signed-off-by: Duncan Moss --- .buildkite/release-pipeline.yaml | 16 ++++++++++++++++ docker/Dockerfile | 17 ++--------------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 6314afd65234..85d3e5638742 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,4 +1,20 @@ steps: + # aarch64 + CUDA builds + - label: "Build arm64 wheel - CUDA 12.8" + id: build-wheel-arm64-cuda-12-8 + agents: + queue: arm64_cpu_queue_postmerge + commands: + # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: + # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + - "bash .buildkite/scripts/upload-wheels.sh" + env: + DOCKER_BUILDKIT: "1" + + # x86 + CUDA builds - label: "Build wheel - CUDA 12.8" id: build-wheel-cuda-12-8 agents: diff --git a/docker/Dockerfile b/docker/Dockerfile index 66a6e6fd6f67..74938917781a 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -139,21 +139,6 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ WORKDIR /workspace # install build and runtime dependencies - -# arm64 (GH200) build follows the practice of "use existing pytorch" build, -# we need to install torch and torchvision from the nightly builds first, -# pytorch will not appear as a vLLM dependency in all of the following steps -# after this step -RUN --mount=type=cache,target=/root/.cache/uv \ - if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - uv pip install --system \ - --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \ - "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \ - uv pip install --system \ - --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. 
-f1,2 | tr -d '.') \ - --pre pytorch_triton==3.3.0+gitab727c40; \ - fi - COPY requirements/common.txt requirements/common.txt COPY requirements/cuda.txt requirements/cuda.txt RUN --mount=type=cache,target=/root/.cache/uv \ @@ -234,6 +219,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && sccache --show-stats; \ fi +ARG vllm_target_device="cuda" +ENV VLLM_TARGET_DEVICE=${vllm_target_device} ENV CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/uv \ From 02200dc08a7a5cd2b9af2f6c10dc561aa85ca872 Mon Sep 17 00:00:00 2001 From: rishitdholakia13 <123388671+rishitdholakia13@users.noreply.github.com> Date: Fri, 15 Aug 2025 17:25:05 -0600 Subject: [PATCH 066/231] [Structured Outputs] [Bug] Fix misalignment in apply_grammar_bitmask causing unintended masking and NaN logits (#22963) Signed-off-by: rishitdholakia13 Signed-off-by: Duncan Moss --- vllm/v1/worker/gpu_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index bef67486d518..4c919b392fbd 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1355,10 +1355,10 @@ def apply_grammar_bitmask( cumulative_index += 1 + num_spec_tokens grammar_bitmask = sorted_bitmask - # If the grammar bitmask and the logits have the same shape + # If the length of out indices and the logits have the same shape # we don't need to pass indices to the kernel, # since the bitmask is already aligned with the logits. - skip_out_indices = grammar_bitmask.shape[0] == logits.shape[0] + skip_out_indices = len(out_indices) == logits.shape[0] # Serialization of np.ndarray is much more efficient than a tensor, # so we receive it in that format. From 6c4a7f21f14eb418fb2cb1db71f84c8fc3419e13 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 15 Aug 2025 16:38:42 -0700 Subject: [PATCH 067/231] [BugFix] Handle case where async utility call is cancelled (#22996) Signed-off-by: Nick Hill Co-authored-by: Yinghai Lu Signed-off-by: Duncan Moss --- tests/v1/engine/test_engine_core_client.py | 24 +++++++++++++++++++++- vllm/v1/engine/core_client.py | 21 +++++++++++++------ 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index c82285639aee..37eb869fe69a 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -121,8 +121,13 @@ async def loop_until_fully_done_async(client: EngineCoreClient, outputs: dict): # Dummy utility function to monkey-patch into engine core. -def echo(self, msg: str, err_msg: Optional[str] = None) -> str: +def echo(self, + msg: str, + err_msg: Optional[str] = None, + sleep: Optional[float] = None) -> str: print(f"echo util function called: {msg}, {err_msg}") + if sleep is not None: + time.sleep(sleep) if err_msg is not None: raise ValueError(err_msg) return msg @@ -289,6 +294,23 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch): await core_client.call_utility_async("echo", None, "help!") assert str(e_info.value) == "Call to echo method failed: help!" + + # Test that cancelling the utility call doesn't destabilize the + # engine. + util_task = asyncio.create_task( + core_client.call_utility_async("echo", "testarg2", None, + 0.5)) # sleep for 0.5 sec + await asyncio.sleep(0.05) + cancelled = util_task.cancel() + assert cancelled + + # Ensure client is still functional. 
The engine runs utility + # methods in a single thread so this request won't be processed + # until the cancelled sleeping one is complete. + result = await asyncio.wait_for(core_client.call_utility_async( + "echo", "testarg3"), + timeout=1.0) + assert result == "testarg3" finally: client.shutdown() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 29ee0a9dfb1e..079dd9a7d38d 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -574,13 +574,22 @@ def monitor_engine_cores(): def _process_utility_output(output: UtilityOutput, utility_results: dict[int, AnyFuture]): - """Set the result from a utility method in the waiting future""" + """Set the result from a utility method in the waiting future.""" future = utility_results.pop(output.call_id) - if output.failure_message is not None: - future.set_exception(Exception(output.failure_message)) - else: - assert output.result is not None - future.set_result(output.result.result) + failure_message = output.failure_message + try: + if failure_message is not None: + future.set_exception(Exception(failure_message)) + else: + assert output.result is not None + future.set_result(output.result.result) + except asyncio.InvalidStateError: + # This can happen if the future is cancelled due to the + # original calling task being cancelled. + if failure_message is not None: + logger.error( + "Cancelled call to utility method failed " + "with error: %s", failure_message) class SyncMPClient(MPClient): From 21ead32ba2a19d480b939b8d80ae659d88bb1960 Mon Sep 17 00:00:00 2001 From: Or Ozeri Date: Sat, 16 Aug 2025 02:52:52 +0300 Subject: [PATCH 068/231] [v1] Move block_hashes from KVCacheManager to Request.block_hashes (#19728) Signed-off-by: Or Ozeri Signed-off-by: Duncan Moss --- tests/v1/core/test_async_scheduler.py | 22 +- tests/v1/core/test_kv_cache_utils.py | 50 ++-- tests/v1/core/test_prefix_caching.py | 225 ++++++++++-------- tests/v1/core/test_scheduler.py | 29 ++- .../core/test_single_type_kv_cache_manager.py | 2 - tests/v1/core/utils.py | 17 +- .../kv_connector/unit/test_nixl_connector.py | 2 + .../unit/test_remote_decode_lifecycle.py | 10 +- .../unit/test_remote_prefill_lifecycle.py | 17 +- tests/v1/kv_connector/unit/utils.py | 31 ++- vllm/utils/__init__.py | 18 ++ vllm/v1/core/block_pool.py | 75 ++---- vllm/v1/core/kv_cache_coordinator.py | 33 +-- vllm/v1/core/kv_cache_manager.py | 51 +--- vllm/v1/core/kv_cache_utils.py | 80 ++++--- vllm/v1/core/sched/scheduler.py | 2 - vllm/v1/core/single_type_kv_cache_manager.py | 10 +- vllm/v1/engine/core.py | 22 +- vllm/v1/request.py | 22 +- 19 files changed, 382 insertions(+), 336 deletions(-) diff --git a/tests/v1/core/test_async_scheduler.py b/tests/v1/core/test_async_scheduler.py index 3ccefbd81cab..3a9492269f9c 100644 --- a/tests/v1/core/test_async_scheduler.py +++ b/tests/v1/core/test_async_scheduler.py @@ -7,6 +7,7 @@ from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import RequestStatus +from vllm.v1.utils import ConstantList from .utils import create_requests, create_scheduler @@ -140,7 +141,8 @@ def test_prefix_caching_for_prefill_dedup(): requests = create_requests(num_requests=5, num_tokens=num_prompt_tokens, max_tokens=3, - same_prompt=True) + same_prompt=True, + block_size=BLOCK_SIZE) requests_copy = requests.copy() # Two requests with the same prompt. 
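# Editorial aside (not part of the patch): after this change each Request owns
# its prefix-cache block hashes instead of the KVCacheManager. A minimal,
# hedged sketch of the new construction path, mirroring the make_request()
# helpers updated in this series and assuming a vLLM checkout with this patch
# applied:

from vllm.sampling_params import SamplingParams
from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
                                         init_none_hash)
from vllm.v1.request import Request

block_size = 16
init_none_hash(hash)  # the tests call this before hashing with builtin hash()
req = Request(request_id="r0",
              prompt_token_ids=list(range(3 * block_size + 7)),
              multi_modal_kwargs=None,
              multi_modal_hashes=None,
              multi_modal_placeholders=None,
              sampling_params=SamplingParams(max_tokens=17),
              pooling_params=None,
              eos_token_id=100,
              lora_request=None,
              cache_salt=None,
              block_hasher=get_request_block_hasher(block_size, hash))
# Only full blocks are hashed: 3 * 16 + 7 prompt tokens -> 3 block hashes.
assert len(req.block_hashes) == 3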
@@ -188,7 +190,8 @@ def test_prefix_caching_for_multi_turn(): block_size=BLOCK_SIZE) requests = create_requests(num_requests=5, num_tokens=num_prompt_tokens, - max_tokens=num_output_tokens) + max_tokens=num_output_tokens, + block_size=BLOCK_SIZE) for req in requests: scheduler.add_request(req) @@ -208,14 +211,19 @@ def test_prefix_caching_for_multi_turn(): # Create next-turn requests whose prompts are the full output of the # previous turn. - next_turn_requests = create_requests( - num_requests=5, - num_tokens=num_prompt_tokens + num_output_tokens, - max_tokens=num_output_tokens, - ) + next_turn_requests = create_requests(num_requests=5, + num_tokens=num_prompt_tokens + + num_output_tokens, + max_tokens=num_output_tokens, + block_size=BLOCK_SIZE) for i, req in enumerate(next_turn_requests): req.prompt_token_ids = (requests[i].prompt_token_ids + list(requests[i].output_token_ids)) + req._all_token_ids = req.prompt_token_ids.copy() + req.all_token_ids = ConstantList(req._all_token_ids) + req.block_hashes = [] + req.block_hashes = req.get_hash_new_full_blocks() + # Schedule the next-turn requests. for req in next_turn_requests: scheduler.add_request(req) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 182ea2b2345c..e0b91e6dd7ee 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib -from typing import Optional +from typing import Callable, Optional import pytest import torch @@ -19,7 +19,7 @@ FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics, estimate_max_model_len, generate_block_hash_extra_keys, get_kv_cache_config, get_max_concurrency_for_kv_cache_config, - hash_block_tokens, hash_request_tokens, init_none_hash, + get_request_block_hasher, hash_block_tokens, init_none_hash, is_kv_cache_type_uniform, unify_kv_cache_configs) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheTensor, @@ -33,6 +33,8 @@ def make_request( request_id: str, prompt_token_ids: list[int], + block_size: int = 3, + hash_fn: Callable = hash, mm_positions: Optional[list[PlaceholderRange]] = None, mm_hashes: Optional[list[str]] = None, cache_salt: Optional[str] = None, @@ -49,18 +51,17 @@ def make_request( mm_item = MultiModalKwargsItem.from_elems([mm_elem]) mm_kwargs = [mm_item] * len(mm_positions) - return Request( - request_id=request_id, - prompt_token_ids=prompt_token_ids, - multi_modal_kwargs=mm_kwargs, - multi_modal_hashes=mm_hashes, - multi_modal_placeholders=mm_positions, - sampling_params=SamplingParams(max_tokens=17), - pooling_params=None, - eos_token_id=100, - lora_request=None, - cache_salt=cache_salt, - ) + return Request(request_id=request_id, + prompt_token_ids=prompt_token_ids, + multi_modal_kwargs=mm_kwargs, + multi_modal_hashes=mm_hashes, + multi_modal_placeholders=mm_positions, + sampling_params=SamplingParams(max_tokens=17), + pooling_params=None, + eos_token_id=100, + lora_request=None, + cache_salt=cache_salt, + block_hasher=get_request_block_hasher(block_size, hash_fn)) def new_kv_cache_spec(block_size=16, @@ -428,12 +429,14 @@ def test_hash_block_tokens(hash_fn): @pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor_64bit, hash]) -def test_hash_request_tokens(hash_fn): +def test_request_block_hasher(hash_fn): import vllm.v1.core.kv_cache_utils init_none_hash(hash_fn) request = make_request( request_id="0", 
prompt_token_ids=[_ for _ in range(6)], + block_size=3, + hash_fn=hash_fn, mm_positions=[ PlaceholderRange(offset=0, length=3), PlaceholderRange(offset=3, length=3), @@ -441,9 +444,7 @@ def test_hash_request_tokens(hash_fn): mm_hashes=["hash1", "hash2"], ) - block_size = 3 - block_hashes = hash_request_tokens(hash_fn, block_size, request) - + block_hashes = request.block_hashes assert len(block_hashes) == 2 assert isinstance(block_hashes[0], vllm.v1.core.kv_cache_utils.BlockHash) assert isinstance(block_hashes[1], vllm.v1.core.kv_cache_utils.BlockHash) @@ -464,6 +465,8 @@ def test_hash_tokens_different_mm_input(hash_fn): request1 = make_request( request_id="0", prompt_token_ids=[_ for _ in range(6)], + block_size=3, + hash_fn=hash_fn, mm_positions=[ PlaceholderRange(offset=0, length=3), PlaceholderRange(offset=3, length=3), @@ -479,9 +482,8 @@ def test_hash_tokens_different_mm_input(hash_fn): ], mm_hashes=["hash3", "hash2"], ) - block_size = 3 - block_hashes1 = hash_request_tokens(hash_fn, block_size, request1) - block_hashes2 = hash_request_tokens(hash_fn, block_size, request2) + block_hashes1 = request1.block_hashes + block_hashes2 = request2.block_hashes assert block_hashes1[0] != block_hashes2[0] assert block_hashes1[1] != block_hashes2[1] @@ -493,12 +495,13 @@ def test_hash_request_tokens_no_mm_inputs(hash_fn): request = make_request( request_id="0", prompt_token_ids=[_ for _ in range(6)], + block_size=3, + hash_fn=hash_fn, mm_positions=None, mm_hashes=None, ) - block_size = 3 - block_hashes = hash_request_tokens(hash_fn, block_size, request) + block_hashes = request.block_hashes assert len(block_hashes) == 2 assert block_hashes[0].token_ids == (0, 1, 2) @@ -858,6 +861,7 @@ def test_allocate_with_lookahead(): request = make_request( request_id="0", prompt_token_ids=[], + block_size=block_size, mm_positions=None, mm_hashes=None, ) diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 87acdef22013..28cfca6767b1 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -3,7 +3,7 @@ """Compare the with and without prefix caching.""" import copy -from typing import Optional +from typing import Callable, Optional import pytest import torch @@ -17,8 +17,9 @@ from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.kv_cache_manager import KVCacheManager, Request from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId, - KVCacheBlock, hash_block_tokens, - init_none_hash) + KVCacheBlock, + get_request_block_hasher, + hash_block_tokens, init_none_hash) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, SlidingWindowSpec) @@ -26,6 +27,8 @@ def make_request( request_id: str, prompt_token_ids: list[int], + block_size: int, + hash_fn: Callable, mm_positions: Optional[list[PlaceholderRange]] = None, mm_hashes: Optional[list[str]] = None, prompt_logprobs: Optional[int] = None, @@ -43,19 +46,18 @@ def make_request( mm_item = MultiModalKwargsItem.from_elems([mm_elem]) mm_kwargs = [mm_item] * len(mm_positions) - return Request( - request_id=request_id, - prompt_token_ids=prompt_token_ids, - multi_modal_kwargs=mm_kwargs, - multi_modal_hashes=mm_hashes, - multi_modal_placeholders=mm_positions, - sampling_params=SamplingParams(max_tokens=17, - prompt_logprobs=prompt_logprobs), - pooling_params=None, - eos_token_id=100, - lora_request=None, - cache_salt=cache_salt, - ) + return Request(request_id=request_id, + prompt_token_ids=prompt_token_ids, + 
multi_modal_kwargs=mm_kwargs, + multi_modal_hashes=mm_hashes, + multi_modal_placeholders=mm_positions, + sampling_params=SamplingParams( + max_tokens=17, prompt_logprobs=prompt_logprobs), + pooling_params=None, + eos_token_id=100, + lora_request=None, + cache_salt=cache_salt, + block_hasher=get_request_block_hasher(block_size, hash_fn)) def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig: @@ -105,11 +107,11 @@ def make_kv_cache_config_hybrid_model(block_size: int, @pytest.mark.parametrize("hash_algo", ["sha256", "sha256_cbor_64bit", "hash"]) def test_prefill(hash_algo): + block_size = 16 manager = KVCacheManager( - make_kv_cache_config(16, 11), + make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, - caching_hash_algo=hash_algo, ) # choose the hash function according to the parameter @@ -123,9 +125,9 @@ def test_prefill(hash_algo): # Incomplete 1 block (7 tokens) unique_token_ids = [3] * 7 all_token_ids = common_token_ids + unique_token_ids - req0 = make_request("0", all_token_ids) + req0 = make_request("0", all_token_ids, block_size, hash_fn) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) - assert len(manager.req_to_block_hashes[req0.request_id]) == 3 + assert len(req0.block_hashes) == 3 assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 blocks = manager.allocate_slots(req0, 55, @@ -152,9 +154,10 @@ def test_prefill(hash_algo): # Cache hit in the common prefix when the original block is still in use. # Incomplete 1 block (5 tokens) unique_token_ids = [3] * 5 - req1 = make_request("1", common_token_ids + unique_token_ids) + req1 = make_request("1", common_token_ids + unique_token_ids, block_size, + hash_fn) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) - assert len(manager.req_to_block_hashes[req1.request_id]) == 3 + assert len(req1.block_hashes) == 3 assert computed_blocks.get_block_ids() == ([1, 2, 3], ) assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 @@ -187,9 +190,10 @@ def test_prefill(hash_algo): # Cache hit in the common prefix when the original block is already free. # Incomplete 1 block (6 tokens) unique_token_ids = [3] * 6 - req2 = make_request("2", common_token_ids + unique_token_ids) + req2 = make_request("2", common_token_ids + unique_token_ids, block_size, + hash_fn) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) - assert len(manager.req_to_block_hashes[req2.request_id]) == 3 + assert len(req2.block_hashes) == 3 assert computed_blocks.get_block_ids() == ([1, 2, 3], ) assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 @@ -208,7 +212,7 @@ def test_prefill(hash_algo): manager.free(req2) # Cache miss and eviction. 
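+    # req1 shares the three full common blocks (48 tokens), so the lookup
+    # below returns block ids [1, 2, 3] and only the remaining tokens need
+    # newly allocated slots.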
- req3 = make_request("3", [99] * (16 * 10)) + req3 = make_request("3", [99] * (16 * 10), block_size, hash_fn) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -242,9 +246,9 @@ def test_prefill_hybrid_model(): # Incomplete 1 block (7 tokens) unique_token_ids = [3] * 7 all_token_ids = common_token_ids + unique_token_ids - req0 = make_request("0", all_token_ids) + req0 = make_request("0", all_token_ids, block_size, hash_fn) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) - assert len(manager.req_to_block_hashes[req0.request_id]) == 3 + assert len(req0.block_hashes) == 3 assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 blocks = manager.allocate_slots(req0, 55, @@ -274,9 +278,10 @@ def test_prefill_hybrid_model(): # Cache hit in the common prefix # Incomplete 1 block (5 tokens) unique_token_ids = [3] * 5 - req1 = make_request("1", common_token_ids + unique_token_ids) + req1 = make_request("1", common_token_ids + unique_token_ids, block_size, + hash_fn) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) - assert len(manager.req_to_block_hashes[req1.request_id]) == 3 + assert len(req1.block_hashes) == 3 assert computed_blocks.get_block_ids() == ([1, 2, 3], [0, 6, 7], [0, 10, 11]) assert num_computed_tokens == 3 * 16 @@ -290,7 +295,7 @@ def test_prefill_hybrid_model(): if block != manager.block_pool.null_block: assert block.ref_cnt == 2 - block_hashes = manager.req_to_block_hashes[req1.request_id] + block_hashes = req1.block_hashes manager.free(req0) manager.free(req1) @@ -300,12 +305,13 @@ def test_prefill_hybrid_model(): def test_partial_request_hit(request_id: str, hash_to_evict: list[BlockHashWithGroupId], expect_hit_length: int): - req = make_request(request_id, common_token_ids + unique_token_ids) + req = make_request(request_id, common_token_ids + unique_token_ids, + block_size, hash) for hash_with_group_id in hash_to_evict: manager.block_pool.cached_block_hash_to_block.pop( hash_with_group_id) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req) - assert len(manager.req_to_block_hashes[req.request_id]) == 3 + assert len(req.block_hashes) == 3 assert num_computed_tokens == expect_hit_length * block_size for block_per_group in computed_blocks.blocks: assert len(block_per_group) == num_computed_tokens // block_size @@ -364,8 +370,9 @@ def test_prefill_plp(): 2. Schedule non-plp request and validate blocks 3. Schedule plp request; no hit should occur; validate blocks ''' + block_size = 16 manager = KVCacheManager( - make_kv_cache_config(16, 11), + make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, ) @@ -380,9 +387,13 @@ def test_prefill_plp(): # Incomplete 1 block (7 tokens) unique_token_ids = [3] * 7 all_token_ids = common_token_ids + unique_token_ids - req0 = make_request("0", all_token_ids, prompt_logprobs=5) + req0 = make_request("0", + all_token_ids, + block_size, + hash_fn, + prompt_logprobs=5) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) - assert len(manager.req_to_block_hashes[req0.request_id]) == 0 + assert len(req0.block_hashes) == 3 assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 blocks = manager.allocate_slots(req0, 55, @@ -411,9 +422,10 @@ def test_prefill_plp(): # Cache hit in the common prefix when the original block is still in use. 
# Incomplete 1 block (5 tokens) unique_token_ids = [3] * 5 - req1 = make_request("1", common_token_ids + unique_token_ids) + req1 = make_request("1", common_token_ids + unique_token_ids, block_size, + hash_fn) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) - assert len(manager.req_to_block_hashes[req1.request_id]) == 3 + assert len(req1.block_hashes) == 3 assert computed_blocks.get_block_ids() == ([1, 2, 3], ) assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 @@ -447,9 +459,11 @@ def test_prefill_plp(): unique_token_ids = [3] * 6 req2 = make_request("2", common_token_ids + unique_token_ids, + block_size, + hash_fn, prompt_logprobs=5) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) - assert len(manager.req_to_block_hashes[req2.request_id]) == 0 + assert len(req2.block_hashes) == 3 assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 blocks = manager.allocate_slots(req2, 55, @@ -469,8 +483,9 @@ def test_prefill_plp(): def test_decode(): + block_size = 16 manager = KVCacheManager( - make_kv_cache_config(16, 11), + make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, ) @@ -481,7 +496,8 @@ def test_decode(): # Fully cache miss # Incomplete 1 block (7 tokens) unique_token_ids = [3] * 7 - req0 = make_request("0", common_token_ids + unique_token_ids) + req0 = make_request("0", common_token_ids + unique_token_ids, block_size, + hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -518,14 +534,15 @@ def test_decode(): def test_evict(): + block_size = 16 manager = KVCacheManager( - make_kv_cache_config(16, 11), + make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, ) last_token_id = 5 * 16 + 7 - req0 = make_request("0", list(range(last_token_id))) + req0 = make_request("0", list(range(last_token_id)), block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -536,7 +553,8 @@ def test_evict(): # 3 blocks. req1 = make_request("1", list(range(last_token_id, - last_token_id + 3 * 16))) + last_token_id + 3 * 16)), block_size, + hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -558,7 +576,7 @@ def test_evict(): ] == [10, 6, 5, 4, 3, 2, 1, 9, 8, 7] # Touch the first 2 blocks. - req2 = make_request("2", list(range(2 * 16 + 3))) + req2 = make_request("2", list(range(2 * 16 + 3)), block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert computed_blocks.get_block_ids() == ([1, 2], ) assert num_computed_tokens == 2 * 16 @@ -583,7 +601,7 @@ def test_hash_block_correct_reuse(): # Allocate 1 block and cache it. num_tokens = block_size * 1 - req = make_request("0", list(range(num_tokens))) + req = make_request("0", list(range(num_tokens)), block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -597,7 +615,7 @@ def test_hash_block_correct_reuse(): # Allocate a new block that's not full, make sure hash info on the # block is cleared. 
- req = make_request("1", list(range(num_tokens - 1))) + req = make_request("1", list(range(num_tokens - 1)), block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -624,7 +642,7 @@ def test_computed_blocks_not_evicted(): # Allocate a block and cache it. num_tokens = block_size * 1 - req0 = make_request("0", list(range(num_tokens))) + req0 = make_request("0", list(range(num_tokens)), block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -635,7 +653,8 @@ def test_computed_blocks_not_evicted(): assert blocks.blocks[0][0].block_id == 1 # Allocate another block. - req1 = make_request("1", list(range(num_tokens, num_tokens * 2))) + req1 = make_request("1", list(range(num_tokens, num_tokens * 2)), + block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -651,7 +670,7 @@ def test_computed_blocks_not_evicted(): # Now if we have a cache hit on the first block, we should evict the second # cached block rather than the first one. - req2 = make_request("2", list(range(num_tokens * 2))) + req2 = make_request("2", list(range(num_tokens * 2)), block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert len(computed_blocks.blocks[0]) == 1 assert computed_blocks.blocks[0][0].block_id == 1 @@ -675,7 +694,8 @@ def test_basic_prefix_caching_disabled(): enable_caching=False, ) - req1 = make_request("1", list(range(10))) # 2 blocks and some more + req1 = make_request("1", list(range(10)), block_size, + hash) # 2 blocks and some more computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert not computed_blocks.blocks[0] @@ -689,7 +709,8 @@ def test_basic_prefix_caching_disabled(): manager.free(req1) # No caching. - req2 = make_request("2", list(range(16))) # shared prefix + req2 = make_request("2", list(range(16)), block_size, + hash) # shared prefix computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -699,7 +720,7 @@ def test_basic_prefix_caching_disabled(): assert len(blocks.blocks[0]) == 4 # New requests should not have any blocks. - req3 = make_request("3", list(range(4))) + req3 = make_request("3", list(range(4)), block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -727,20 +748,17 @@ def test_cache_blocks(hash_fn): # Block 1: [4, 5, 6, 7] # Block 2: [8, 9, 10, 11] # Block 3: [12, 13] - req = make_request("0", list(range(14))) + req = make_request("0", list(range(14)), block_size, hash_fn) # Test that blocks are cached correctly for 2 full blocks from the start. 
blocks = [KVCacheBlock(block_id=i) for i in range(2)] - block_hashes: list[BlockHash] = [] block_pool.cache_full_blocks( request=req, blocks=blocks, - block_hashes=block_hashes, num_cached_blocks=0, num_full_blocks=2, block_size=block_size, - hash_fn=hash_fn, kv_cache_group_id=0, ) @@ -752,11 +770,9 @@ def test_cache_blocks(hash_fn): block_pool.cache_full_blocks( request=req, blocks=blocks, - block_hashes=block_hashes, num_cached_blocks=2, num_full_blocks=3, block_size=block_size, - hash_fn=hash_fn, kv_cache_group_id=0, ) assert len(block_pool.cached_block_hash_to_block) == 3 @@ -775,23 +791,20 @@ def test_cache_blocks_multi_group(): # Block 1/5: [4, 5, 6, 7] # Block 2/6: [8, 9, 10, 11] # Block 3/7: [12, 13] - req = make_request("0", list(range(14))) + req = make_request("0", list(range(14)), block_size, hash) # Cache the blocks for group 0. blocks = [KVCacheBlock(block_id=i) for i in range(2)] - block_hashes: list[BlockHash] = [] block_pool.cache_full_blocks( request=req, blocks=blocks, - block_hashes=block_hashes, num_cached_blocks=0, num_full_blocks=2, block_size=block_size, - hash_fn=hash, kv_cache_group_id=0, ) assert len(block_pool.cached_block_hash_to_block) == 2 - assert len(block_hashes) == 2 + assert len(req.block_hashes) == 3 assert all([block.block_hash is not None for block in blocks]) # Cache the blocks for group 1. @@ -799,38 +812,36 @@ def test_cache_blocks_multi_group(): block_pool.cache_full_blocks( request=req, blocks=blocks, - block_hashes=block_hashes, num_cached_blocks=0, num_full_blocks=3, block_size=block_size, - hash_fn=hash, kv_cache_group_id=1, ) assert len(block_pool.cached_block_hash_to_block) == 5 - assert len(block_hashes) == 3 + assert len(req.block_hashes) == 3 assert all([block.block_hash is not None for block in blocks]) # Block hash 0: hit for group 0 and 1 # Block hash 1: hit for group 0 and 1 # Block hash 2: hit for group 1 - assert block_pool.get_cached_block(block_hashes[0], + assert block_pool.get_cached_block(req.block_hashes[0], kv_cache_group_ids=[0]) is not None - assert block_pool.get_cached_block(block_hashes[1], + assert block_pool.get_cached_block(req.block_hashes[1], kv_cache_group_ids=[0]) is not None - assert block_pool.get_cached_block(block_hashes[2], + assert block_pool.get_cached_block(req.block_hashes[2], kv_cache_group_ids=[0]) is None - assert block_pool.get_cached_block(block_hashes[0], + assert block_pool.get_cached_block(req.block_hashes[0], kv_cache_group_ids=[1]) is not None - assert block_pool.get_cached_block(block_hashes[1], + assert block_pool.get_cached_block(req.block_hashes[1], kv_cache_group_ids=[1]) is not None - assert block_pool.get_cached_block(block_hashes[2], + assert block_pool.get_cached_block(req.block_hashes[2], kv_cache_group_ids=[1]) is not None - assert block_pool.get_cached_block(block_hashes[0], + assert block_pool.get_cached_block(req.block_hashes[0], kv_cache_group_ids=[0, 1]) is not None - assert block_pool.get_cached_block(block_hashes[1], + assert block_pool.get_cached_block(req.block_hashes[1], kv_cache_group_ids=[0, 1]) is not None - assert block_pool.get_cached_block(block_hashes[2], + assert block_pool.get_cached_block(req.block_hashes[2], kv_cache_group_ids=[0, 1]) is None @@ -838,8 +849,9 @@ def test_mm_prefix_caching(): """ This tests that the multi-modal prefix caching is correct. 
""" + block_size = 16 manager = KVCacheManager( - make_kv_cache_config(16, 11), + make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, ) @@ -865,6 +877,8 @@ def test_mm_prefix_caching(): mm_hashes = common_mm_hashes + ["ccc"] req0 = make_request("0", all_token_ids, + block_size, + hash, mm_positions=mm_positions, mm_hashes=mm_hashes) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) @@ -872,7 +886,7 @@ def test_mm_prefix_caching(): # Completed block should have hashes with extra keys. assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 - block_hashes = manager.req_to_block_hashes[req0.request_id] + block_hashes = req0.block_hashes assert len(block_hashes) == 3 assert block_hashes[0].extra_keys == ("aaa", ) assert block_hashes[1].extra_keys == ("aaa", "bbb") @@ -905,6 +919,8 @@ def test_mm_prefix_caching(): mm_hashes = common_mm_hashes + ["ccc"] req1 = make_request("1", all_token_ids, + block_size, + hash, mm_positions=mm_positions, mm_hashes=mm_hashes) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) @@ -927,13 +943,13 @@ def test_cache_key_salting(): # 3 complete blocks and an incomplete block with 11 tokens. common_token_ids = [i for i in range(3) for _ in range(block_size)] token_ids = common_token_ids + [3] * 11 - req0 = make_request("0", token_ids, cache_salt="salt1") + req0 = make_request("0", token_ids, block_size, hash, cache_salt="salt1") computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) # Completed block should have hashes with extra keys. assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 - block_hashes = manager.req_to_block_hashes[req0.request_id] + block_hashes = req0.block_hashes assert len(block_hashes) == 3 assert block_hashes[0].extra_keys == ("salt1", ) assert block_hashes[1].extra_keys is None @@ -959,7 +975,7 @@ def test_cache_key_salting(): # Test cache hit with a new request that has the same salt. token_ids = common_token_ids + [4] * 11 - req1 = make_request("1", token_ids, cache_salt="salt1") + req1 = make_request("1", token_ids, block_size, hash, cache_salt="salt1") computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) # Should match only a prefix of 3 blocks. assert len(computed_blocks.blocks[0]) == 3 @@ -967,11 +983,11 @@ def test_cache_key_salting(): # Test cache miss with same content but different salt. token_ids = common_token_ids + [4] * 11 - req2 = make_request("2", token_ids, cache_salt="salt2") + req2 = make_request("2", token_ids, block_size, hash, cache_salt="salt2") computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert len(computed_blocks.blocks[0]) == 0 assert num_computed_tokens == 0 - block_hashes = manager.req_to_block_hashes[req2.request_id] + block_hashes = req2.block_hashes assert len(block_hashes) == 3 assert block_hashes[0].extra_keys == ("salt2", ) @@ -992,7 +1008,7 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks(): # Complete 3 blocks (48 tokens) # | Common-0 | Common-1 | Common-2 | ... 
| common_token_ids = [i for i in range(3) for _ in range(16)] - req0 = make_request("0", common_token_ids) + req0 = make_request("0", common_token_ids, block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -1003,7 +1019,7 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks(): req0.request_id] # | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... | - req1 = make_request("1", common_token_ids * 2) + req1 = make_request("1", common_token_ids * 2, block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert computed_blocks.blocks[0] == block_part0 assert num_computed_tokens == 3 * 16 @@ -1020,19 +1036,19 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks(): # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) | # | Req1-5(F)| Req2-0 | Req2-1 | ... | - req2 = make_request("2", [7] * block_size * 2) + req2 = make_request("2", [7] * block_size * 2, block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 manager.allocate_slots(req2, block_size * 2, - len(computed_blocks.blocks[0]) * 16, + len(computed_blocks.blocks[0]) * block_size, computed_blocks) # Req3 is Req2 + 3 new blocks, so the first 6 blocks are computed, # but it cannot be allocated due to insufficient free blocks (2). # In this case, the ref_cnt of the computed blocks should not be changed. assert manager.block_pool.free_block_queue.num_free_blocks == 5 - req3 = make_request("3", common_token_ids * 3) + req3 = make_request("3", common_token_ids * 3, block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3) assert computed_blocks.blocks[0] == block_part1 assert num_computed_tokens == 6 * 16 @@ -1047,8 +1063,9 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks(): def test_reset_prefix_cache(): + block_size = 16 manager = KVCacheManager( - make_kv_cache_config(16, 11), + make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, ) @@ -1056,15 +1073,15 @@ def test_reset_prefix_cache(): full_block_token_ids = [i for i in range(3) for _ in range(16)] unique_token_ids = [3] * 7 all_token_ids = full_block_token_ids + unique_token_ids - req0 = make_request("0", all_token_ids) + req0 = make_request("0", all_token_ids, block_size, hash) blocks = manager.allocate_slots(req0, 55) assert blocks.get_block_ids() == ([1, 2, 3, 4], ) unique_token_ids = [4] * 7 all_token_ids = full_block_token_ids + unique_token_ids - req1 = make_request("1", all_token_ids) + req1 = make_request("1", all_token_ids, block_size, hash) computed_blocks, _ = manager.get_computed_blocks(req1) - assert len(manager.req_to_block_hashes[req1.request_id]) == 3 + assert len(req1.block_hashes) == 3 assert len(computed_blocks.blocks[0]) == 3 blocks = manager.allocate_slots(req1, 7, len(computed_blocks.blocks[0]) * 16, @@ -1086,8 +1103,9 @@ def test_reset_prefix_cache(): def test_prefix_cache_stats_disabled(): """Test that prefix_cache_stats is None when log_stats is False.""" + block_size = 16 manager = KVCacheManager( - make_kv_cache_config(16, 11), + make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, log_stats=False, # Disable logging stats @@ -1095,7 +1113,7 @@ def test_prefix_cache_stats_disabled(): assert manager.prefix_cache_stats is None # Call all functions that check whether log_stats is 
disabled. - req = make_request("0", list(range(16))) + req = make_request("0", list(range(16)), block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -1192,7 +1210,7 @@ def test_kv_cache_events(blocks_to_cache: int): ) num_tokens = block_size * blocks_to_cache - req0 = make_request("0", list(range(num_tokens))) + req0 = make_request("0", list(range(num_tokens)), block_size, hash) _ = manager.allocate_slots(req0, num_tokens) events = manager.take_events() @@ -1208,7 +1226,7 @@ def test_kv_cache_events(blocks_to_cache: int): # Should see block_to_cache number of removed block events and a new block # stored event manager.free(req0) - req1 = make_request("1", list(range(num_tokens))) + req1 = make_request("1", list(range(num_tokens)), block_size, hash) _ = manager.allocate_slots(req1, num_tokens) events = manager.take_events() @@ -1242,7 +1260,7 @@ def test_eagle_enabled_removes_last_block(): # Request with 3 full blocks (48 tokens) token_ids = [0] * (3 * block_size) - req = make_request("divisible_request", token_ids) + req = make_request("divisible_request", token_ids, block_size, hash) # Prime the cache computed_blocks, _ = manager.get_computed_blocks(req) @@ -1252,7 +1270,7 @@ def test_eagle_enabled_removes_last_block(): manager.free(req) # New request with same tokens + Eagle enabled - req_eagle = make_request("eagle_divisible", token_ids) + req_eagle = make_request("eagle_divisible", token_ids, block_size, hash) computed_blocks, num_tokens = manager.get_computed_blocks(req_eagle) # Should retain 1 block: @@ -1273,7 +1291,7 @@ def test_eagle_with_partial_blocks(): ) # 2 full blocks + 5 tokens (non-divisible length) token_ids = [0] * (2 * block_size + 5) - req = make_request("partial_block_test", token_ids) + req = make_request("partial_block_test", token_ids, block_size, hash) # Prime the cache computed_blocks, _ = manager.get_computed_blocks(req) @@ -1283,7 +1301,7 @@ def test_eagle_with_partial_blocks(): manager.free(req) # New request with Eagle enabled - req_eagle = make_request("partial_eagle", token_ids) + req_eagle = make_request("partial_eagle", token_ids, block_size, hash) computed_blocks, num_tokens = manager.get_computed_blocks(req_eagle) # Original match: 2 full blocks → Eagle removes 1 → 1 remaining assert len(computed_blocks.blocks[0]) == 1 @@ -1314,7 +1332,7 @@ def test_eagle_with_sliding_window(): # 2 full blocks + 5 tokens (non-divisible length) token_ids = [0] * (2 * block_size + 5) - req = make_request("partial_block_test", token_ids) + req = make_request("partial_block_test", token_ids, block_size, hash) # Prime the cache computed_blocks, _ = manager.get_computed_blocks(req) @@ -1322,12 +1340,12 @@ def test_eagle_with_sliding_window(): len(computed_blocks.blocks[0]) * 16, computed_blocks) # record the block hash of the first block in the request for later use - block_hash_first_block = manager.req_to_block_hashes[req.request_id][0] + block_hash_first_block = req.block_hashes[0] assert block_hash_first_block is not None manager.free(req) # New request with Eagle enabled - req_eagle = make_request("partial_eagle", token_ids) + req_eagle = make_request("partial_eagle", token_ids, block_size, hash) computed_blocks, num_tokens = manager.get_computed_blocks(req_eagle) # Original match: 2 full blocks → Eagle removes 1 → 1 remaining assert len(computed_blocks.blocks[0]) == 1 @@ -1340,7 +1358,8 @@ def test_eagle_with_sliding_window(): 
BlockHashWithGroupId(block_hash_first_block, 0)) # New request - req_after_evict = make_request("partial_eagle_after_evict", token_ids) + req_after_evict = make_request("partial_eagle_after_evict", token_ids, + block_size, hash) computed_blocks, num_tokens = manager.get_computed_blocks(req_after_evict) # Cache miss. The only hit prefix is [NULL_BLOCK, BLOCK_2] if eagle is # not considered. But after dropping the last matched block due to eagle, diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 1c7dd0ca90b7..ac70c90d92ad 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -589,7 +589,7 @@ def test_preempt_during_execution(): block_size=16, num_blocks=11, enable_prefix_caching=False) - requests = create_requests(num_requests=2, num_tokens=80) + requests = create_requests(num_requests=2, num_tokens=80, block_size=16) # Schedule the first request. scheduler.add_request(requests[0]) @@ -762,7 +762,7 @@ def _assert_right_scheduler_output( def _assert_right_kv_cache_manager( scheduler: Scheduler, - req_ids: list[str], + requests: list[Request], num_tokens: int, block_size: int, num_requests: int, @@ -772,12 +772,12 @@ def _assert_right_kv_cache_manager( # Make sure the request stats are right. EXPECTED_TOTAL_BLOCKS = num_tokens // block_size - for req_id in req_ids: + for req in requests: blocks = (scheduler.kv_cache_manager.coordinator. - single_type_managers[0].req_to_blocks[req_id]) - hashes = scheduler.kv_cache_manager.req_to_block_hashes[req_id] + single_type_managers[0].req_to_blocks[req.request_id]) + hashes = req.block_hashes assert (scheduler.kv_cache_manager.coordinator.single_type_managers[0]. - num_cached_block[req_id] == EXPECTED_TOTAL_BLOCKS) + num_cached_block[req.request_id] == EXPECTED_TOTAL_BLOCKS) assert len(blocks) == EXPECTED_TOTAL_BLOCKS assert len(hashes) == EXPECTED_TOTAL_BLOCKS @@ -840,7 +840,8 @@ def test_kv_connector_basic(): MAX_TOKENS = 3 requests = create_requests(num_requests=NUM_REQUESTS, num_tokens=NUM_TOKENS, - max_tokens=MAX_TOKENS) + max_tokens=MAX_TOKENS, + block_size=BLOCK_SIZE) req_ids = [] req_to_index = {} for i, request in enumerate(requests): @@ -868,7 +869,7 @@ def test_kv_connector_basic(): ) # Ensure KVCacheManager is correct. - _assert_right_kv_cache_manager(scheduler, req_ids, NUM_TOKENS, BLOCK_SIZE, + _assert_right_kv_cache_manager(scheduler, requests, NUM_TOKENS, BLOCK_SIZE, NUM_REQUESTS, NUM_TOTAL_BLOCKS) # Continue Generation until done. @@ -886,7 +887,8 @@ def test_kv_connector_basic(): NUM_TOKENS = NUM_TOKENS_PREFIX * 2 requests = create_requests(num_requests=NUM_REQUESTS, num_tokens=NUM_TOKENS, - max_tokens=MAX_TOKENS) + max_tokens=MAX_TOKENS, + block_size=BLOCK_SIZE) req_ids = [] req_to_index = {} for i, request in enumerate(requests): @@ -915,7 +917,7 @@ def test_kv_connector_basic(): NUM_MATCHED_NEW_TOKENS)) # Ensure KVCacheManager is correct. - _assert_right_kv_cache_manager(scheduler, req_ids, NUM_TOKENS, BLOCK_SIZE, + _assert_right_kv_cache_manager(scheduler, requests, NUM_TOKENS, BLOCK_SIZE, NUM_REQUESTS, NUM_TOTAL_BLOCKS) # Continue Generation until done. 
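The churn in these scheduler and connector tests follows from one change: block hashes now live on the Request itself instead of in the KVCacheManager, so every helper that builds a request must supply a block size and hash function. A minimal sketch of the new flow, mirroring the create_requests() helper updated later in this patch (argument values here are illustrative, not taken from any test):

from vllm.sampling_params import SamplingParams
from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
                                         init_none_hash)
from vllm.v1.request import Request

init_none_hash(hash)  # one-time init of the sentinel hash for parentless blocks
block_hasher = get_request_block_hasher(block_size=16, caching_hash_fn=hash)

request = Request(
    request_id="demo",
    prompt_token_ids=[0] * 32,  # two full 16-token blocks
    sampling_params=SamplingParams(max_tokens=4),
    pooling_params=None,
    multi_modal_kwargs=None,
    multi_modal_placeholders=None,
    multi_modal_hashes=None,
    eos_token_id=None,
    block_hasher=block_hasher,
)
assert len(request.block_hashes) == 2  # hashes exist at construction time
request.append_output_token_ids([1] * 16)
assert len(request.block_hashes) == 3  # and grow as tokens are appended

With the hashes owned by the request, the block pool and coordinator hunks below can drop their block_hashes and hash_fn parameters entirely.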
@@ -953,7 +955,8 @@ def test_kv_connector_unable_to_allocate(): MAX_TOKENS = 2 requests = create_requests(num_requests=NUM_REQUESTS, num_tokens=NUM_TOKENS, - max_tokens=MAX_TOKENS) + max_tokens=MAX_TOKENS, + block_size=BLOCK_SIZE) req_ids = [] req_to_index = {} for i, request in enumerate(requests): @@ -1034,7 +1037,8 @@ def test_kv_connector_handles_preemption(): MAX_TOKENS = BLOCK_SIZE * 2 requests = create_requests(num_requests=NUM_REQUESTS, num_tokens=NUM_TOKENS, - max_tokens=MAX_TOKENS) + max_tokens=MAX_TOKENS, + block_size=BLOCK_SIZE) req_ids = [] req_to_index = {} for i, request in enumerate(requests): @@ -1162,7 +1166,6 @@ def assert_scheduler_empty(scheduler: Scheduler): # KVCache Manager. assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0]. req_to_blocks) == 0 - assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0 assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0]. num_cached_block) == 0 num_free_blocks = ( diff --git a/tests/v1/core/test_single_type_kv_cache_manager.py b/tests/v1/core/test_single_type_kv_cache_manager.py index b67c05bd7ac1..7dcebba491fa 100644 --- a/tests/v1/core/test_single_type_kv_cache_manager.py +++ b/tests/v1/core/test_single_type_kv_cache_manager.py @@ -17,7 +17,6 @@ def get_sliding_window_manager(sliding_window_spec, block_pool): return SlidingWindowManager(sliding_window_spec, block_pool, - caching_hash_fn=lambda x: x, kv_cache_group_id=0) @@ -25,7 +24,6 @@ def get_chunked_local_attention_manager(chunked_local_attention_spec, block_pool): return ChunkedLocalAttentionManager(chunked_local_attention_spec, block_pool, - caching_hash_fn=lambda x: x, kv_cache_group_id=0) diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index 484afe61fc3f..52093d3d381a 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -10,6 +10,8 @@ MultiModalFieldElem, MultiModalKwargsItem, PlaceholderRange) from vllm.sampling_params import SamplingParams +from vllm.v1.core.kv_cache_utils import (get_request_block_hasher, + init_none_hash) from vllm.v1.core.sched.async_scheduler import AsyncScheduler from vllm.v1.core.sched.scheduler import Scheduler from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, @@ -114,6 +116,9 @@ def create_scheduler( ) +_none_hash_initialized = False + + def create_requests( num_requests: int, num_tokens: int = 10, @@ -122,7 +127,14 @@ def create_requests( stop_token_ids: Optional[list[int]] = None, prompt_logprobs: Optional[int] = None, same_prompt: bool = False, + block_size: int = 16, ) -> list[Request]: + global _none_hash_initialized + if not _none_hash_initialized: + init_none_hash(hash) + _none_hash_initialized = True + + block_hasher = get_request_block_hasher(block_size, hash) sampling_params = SamplingParams(ignore_eos=False, max_tokens=max_tokens, stop_token_ids=stop_token_ids, @@ -139,9 +151,11 @@ def create_requests( ) mm_item = MultiModalKwargsItem.from_elems([mm_elem]) mm_kwargs = [mm_item] * len(mm_position) + mm_hashes = ["hash"] * len(mm_position) else: mm_position = None mm_kwargs = None + mm_hashes = None prompt_token_ids = ([0] * num_tokens if same_prompt else [i] * num_tokens) request = Request( @@ -151,8 +165,9 @@ def create_requests( pooling_params=None, multi_modal_kwargs=mm_kwargs, multi_modal_placeholders=mm_position, - multi_modal_hashes=None, + multi_modal_hashes=mm_hashes, eos_token_id=EOS_TOKEN_ID, + block_hasher=block_hasher, ) requests.append(request) return requests diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py 
b/tests/v1/kv_connector/unit/test_nixl_connector.py index b185936ab025..e6859ea73827 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -147,6 +147,7 @@ def test_basic_interface(): NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5)) request = create_request(request_id=1, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS, do_remote_prefill=True) request_id = request.request_id @@ -186,6 +187,7 @@ def test_prompt_less_than_block_size(): # Request will have 1 partial remote block. request = create_request(request_id=1, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS, do_remote_prefill=True, num_remote_blocks=1) diff --git a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py index 2f8228864e7b..d8c56ac42f71 100644 --- a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py +++ b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py @@ -21,6 +21,7 @@ def test_basic_lifecycle(): NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5)) request = create_request(request_id=1, + block_size=BLOCK_SIZE, max_tokens=1, num_tokens=NUM_TOKENS, do_remote_decode=True) @@ -103,8 +104,10 @@ def test_short_prompt_lifecycle(): scheduler = create_scheduler(vllm_config) # Not enough tokens for full block. - NUM_TOKENS = vllm_config.cache_config.block_size // 2 + BLOCK_SIZE = vllm_config.cache_config.block_size + NUM_TOKENS = BLOCK_SIZE // 2 request = create_request(request_id=1, + block_size=BLOCK_SIZE, max_tokens=1, num_tokens=NUM_TOKENS, do_remote_decode=True) @@ -148,7 +151,9 @@ def test_prefix_cache_lifecycle(): NUM_EXTERNAL_FULL_BLOCKS = 3 NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5)) - request_normal = create_request(request_id=1, num_tokens=NUM_TOKENS) + request_normal = create_request(request_id=1, + block_size=BLOCK_SIZE, + num_tokens=NUM_TOKENS) scheduler.add_request(request_normal) scheduler_output = scheduler.schedule() @@ -166,6 +171,7 @@ def test_prefix_cache_lifecycle(): NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5)) request_remote = create_request(request_id=1, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS, do_remote_decode=True) diff --git a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py index 87f7490698a3..21fec5344255 100644 --- a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py +++ b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py @@ -23,6 +23,7 @@ def test_basic_lifecycle(): scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks) request = create_request(request_id=1, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS, do_remote_prefill=True) @@ -133,14 +134,17 @@ def test_interleaved_lifecycle(): NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5)) request_remote = create_request(request_id=1, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS, do_remote_prefill=True) request_local_a = create_request( request_id=2, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS, ) request_local_b = create_request( request_id=3, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS, ) @@ -236,6 +240,7 @@ def test_no_spurious_prefix_caching(): # Both of these requests have prompts like [1,1,1,1,1, ...] 
request_remote = create_request( request_id=1, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS, do_remote_prefill=True, use_all_1s_for_prompt_tokens=True, @@ -243,6 +248,7 @@ def test_no_spurious_prefix_caching(): request_local = create_request( request_id=2, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS, do_remote_prefill=False, use_all_1s_for_prompt_tokens=True, @@ -292,6 +298,7 @@ def test_full_block_prompt(): NUM_TOKENS = int(BLOCK_SIZE * NUM_EXTERNAL_FULL_BLOCKS) request = create_request(request_id=1, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS, do_remote_prefill=True) @@ -364,8 +371,11 @@ def test_cannot_schedule_after_recv(): NUM_TOKENS_LOCAL = int(BLOCK_SIZE * NUM_PROMPT_BLOCKS) NUM_TOKENS_REMOTE = int(BLOCK_SIZE * NUM_PROMPT_BLOCKS) - request_normal = create_request(request_id=1, num_tokens=NUM_TOKENS_LOCAL) + request_normal = create_request(request_id=1, + block_size=BLOCK_SIZE, + num_tokens=NUM_TOKENS_LOCAL) request_remote = create_request(request_id=2, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS_REMOTE, do_remote_prefill=True) @@ -456,8 +466,11 @@ def test_cannot_recv(): NUM_TOKENS_LOCAL = int(BLOCK_SIZE * NUM_PROMPT_BLOCKS) NUM_TOKENS_REMOTE = int(BLOCK_SIZE * (NUM_PROMPT_BLOCKS + 0.5)) - request_normal = create_request(request_id=1, num_tokens=NUM_TOKENS_LOCAL) + request_normal = create_request(request_id=1, + block_size=BLOCK_SIZE, + num_tokens=NUM_TOKENS_LOCAL) request_remote = create_request(request_id=2, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS_REMOTE, do_remote_prefill=True) diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 60847c48585c..8c5d132c00ae 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import tempfile from collections import defaultdict -from typing import Any, Optional +from typing import Any, Callable, Optional import torch @@ -14,6 +14,8 @@ from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( # noqa SharedStorageConnector) from vllm.v1.core.kv_cache_manager import KVCacheBlocks +from vllm.v1.core.kv_cache_utils import (get_request_block_hasher, + init_none_hash) from vllm.v1.core.sched.scheduler import Scheduler from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec) @@ -40,7 +42,6 @@ def assert_scheduler_empty(scheduler: Scheduler): # KVCache Manager. assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0]. req_to_blocks) == 0 - assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0 assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0]. 
num_cached_block) == 0 num_free_blocks = ( @@ -115,16 +116,23 @@ def create_scheduler( ) -def create_request( - request_id: int, - num_tokens: int = 10, - max_tokens: int = 16, - do_remote_decode: bool = False, - do_remote_prefill: bool = False, - use_all_1s_for_prompt_tokens: bool = False, - num_remote_blocks: int = 3, -) -> Request: +_none_hash_initialized = False + + +def create_request(request_id: int, + num_tokens: int = 10, + max_tokens: int = 16, + do_remote_decode: bool = False, + do_remote_prefill: bool = False, + use_all_1s_for_prompt_tokens: bool = False, + num_remote_blocks: int = 3, + block_size: int = 16, + hash_fn: Callable = hash) -> Request: """Make dummy request for testing.""" + global _none_hash_initialized + if not _none_hash_initialized: + init_none_hash(hash) + _none_hash_initialized = True kv_transfer_params: Optional[dict[str, Any]] = None @@ -158,6 +166,7 @@ def create_request( multi_modal_placeholders=None, multi_modal_hashes=None, eos_token_id=EOS_TOKEN_ID, + block_hasher=get_request_block_hasher(block_size, hash_fn), ) req.kv_transfer_params = kv_transfer_params return req diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index a1f8ad164762..72857ee2abc7 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -3243,6 +3243,24 @@ def sha256_cbor_64bit(input) -> int: return full_hash & ((1 << 64) - 1) +def get_hash_fn_by_name(hash_fn_name: str) -> Callable: + """Get a hash function by name, or raise an error if + the function is not found. + Args: + hash_fn_name: Name of the hash function. + Returns: + A hash function. + """ + if hash_fn_name == "sha256": + return sha256 + if hash_fn_name == "sha256_cbor_64bit": + return sha256_cbor_64bit + if hash_fn_name == "builtin": + return hash + + raise ValueError(f"Unsupported hash function: {hash_fn_name}") + + def is_torch_equal_or_newer(target: str) -> bool: """Check if the installed torch version is >= the target version. diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index ad9854dd29c3..839297135fe0 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -2,15 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import defaultdict from collections.abc import Iterable -from typing import Callable, Optional +from typing import Optional from vllm.distributed.kv_events import (AllBlocksCleared, BlockRemoved, BlockStored, KVCacheEvent) from vllm.logger import init_logger from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId, - FreeKVCacheBlockQueue, KVCacheBlock, - generate_block_hash_extra_keys, - hash_block_tokens) + FreeKVCacheBlockQueue, KVCacheBlock) from vllm.v1.request import Request logger = init_logger(__name__) @@ -97,84 +95,39 @@ def cache_full_blocks( self, request: Request, blocks: list[KVCacheBlock], - block_hashes: list[BlockHash], num_cached_blocks: int, num_full_blocks: int, block_size: int, kv_cache_group_id: int, - hash_fn: Callable, ) -> None: """Cache a list of full blocks for prefix caching. This function takes a list of blocks that will have their block hash - metadata to be updated and cached. Given a request, it computes the - block hashes for the blocks starting from `num_cached_blocks` to - `num_full_blocks`, updating the metadata for each block - and caching them in the `cached_block_hash_to_block`. + metadata to be updated and cached. Given a request, it updates the + metadata for each block and caching it in the + `cached_block_hash_to_block`. 
+ The block hashes values are computed by the Request object immediately + when it is created and when new tokens are appended. Args: request: The request to cache the blocks. blocks: All blocks in the request. - block_hashes: Block hashes of the blocks in the request. Note that - this list may be shorter than the blocks list. In this case the - missed block hash will be computed in this function. num_cached_blocks: The number of blocks that are already cached. num_full_blocks: The number of blocks that are full and should be cached after this function. block_size: Number of tokens in each block. kv_cache_group_id: The id of the KV cache group. - hash_fn: The hash function to use for block hashes. """ if num_cached_blocks == num_full_blocks: return new_full_blocks = blocks[num_cached_blocks:num_full_blocks] - assert len(block_hashes) >= num_cached_blocks - new_block_hashes = block_hashes[num_cached_blocks:] + assert len(request.block_hashes) >= num_full_blocks + new_block_hashes = request.block_hashes[num_cached_blocks:] - # Update the new blocks with the block hashes through the chain. - if num_cached_blocks == 0: - prev_block_hash_value = None - else: - prev_block = blocks[num_cached_blocks - 1] - assert prev_block.block_hash is not None - prev_block_hash_value = prev_block.block_hash.get_hash_value() - - parent_block_hash = prev_block_hash_value new_hashes: Optional[list[int]] = ([] if self.enable_kv_cache_events else None) for i, blk in enumerate(new_full_blocks): assert blk.block_hash is None - - if i < len(new_block_hashes): - # The block hash may already be computed in - # "get_computed_blocks" if the tokens are not generated by - # this request (either the prompt tokens or the previously - # generated tokens with preemption), or by other - # single_type_managers with the same block_size. - # In this case we simply reuse the block hash. - block_hash = new_block_hashes[i] - else: - # Otherwise compute the block hash and cache it in the request - # in case it will be preempted in the future. - blk_idx = num_cached_blocks + i - start_token_idx = blk_idx * block_size - end_token_idx = (blk_idx + 1) * block_size - block_tokens = request.all_token_ids[ - start_token_idx:end_token_idx] - assert len(block_tokens) == block_size, ( - f"Expected {block_size} tokens, got " - f"{len(block_tokens)} at {blk_idx}th block for request " - f"{request.request_id}({request})") - - # Generate extra keys for multi-modal inputs. Note that since - # we reach to this branch only when the block is completed with - # generated tokens, we only need to consider the last mm input. - extra_keys, _ = generate_block_hash_extra_keys( - request, start_token_idx, end_token_idx, -1) - - # Compute the hash of the current block. - block_hash = hash_block_tokens(hash_fn, prev_block_hash_value, - block_tokens, extra_keys) - block_hashes.append(block_hash) + block_hash = new_block_hashes[i] # Update and added the full block to the cache. 
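            # (The cache key pairs each block hash with kv_cache_group_id,
            # so different KV cache groups can cache the same token block
            # independently -- see test_cache_blocks_multi_group above.)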
block_hash_with_group_id = BlockHashWithGroupId( @@ -184,9 +137,15 @@ def cache_full_blocks( blk.block_id] = blk if new_hashes is not None: new_hashes.append(block_hash.hash_value) - prev_block_hash_value = block_hash.hash_value if self.enable_kv_cache_events: + if num_cached_blocks == 0: + parent_block_hash = None + else: + parent_block = blocks[num_cached_blocks - 1] + assert parent_block.block_hash is not None + parent_block_hash = parent_block.block_hash.get_hash_value() + self.kv_event_queue.append( BlockStored( block_hashes=new_hashes, diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py index f3a16d64e19f..a0ea4d96015a 100644 --- a/vllm/v1/core/kv_cache_coordinator.py +++ b/vllm/v1/core/kv_cache_coordinator.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod -from typing import Callable, Optional +from typing import Optional from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock @@ -23,7 +23,6 @@ def __init__( max_model_len: int, use_eagle: bool, enable_caching: bool, - caching_hash_fn: Callable, enable_kv_cache_events: bool, ): self.kv_cache_config = kv_cache_config @@ -40,7 +39,6 @@ def __init__( kv_cache_spec=kv_cache_group.kv_cache_spec, block_pool=self.block_pool, kv_cache_group_id=i, - caching_hash_fn=caching_hash_fn, ) for i, kv_cache_group in enumerate( self.kv_cache_config.kv_cache_groups)) @@ -99,19 +97,17 @@ def allocate_new_blocks(self, request_id: str, manager.allocate_new_blocks(request_id, num_tokens) for manager in self.single_type_managers) - def cache_blocks(self, request: Request, block_hashes: list[BlockHash], - num_computed_tokens: int) -> None: + def cache_blocks(self, request: Request, num_computed_tokens: int) -> None: """ Cache the blocks for the request. Args: request: The request. - block_hashes: The block hashes of the request. num_tokens: The total number of tokens that need to be cached (including tokens that are already cached). 
""" for manager in self.single_type_managers: - manager.cache_blocks(request, block_hashes, num_computed_tokens) + manager.cache_blocks(request, num_computed_tokens) def free(self, request_id: str) -> None: """ @@ -184,10 +180,9 @@ class KVCacheCoordinatorNoPrefixCache(KVCacheCoordinator): """ def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int, - use_eagle: bool, caching_hash_fn: Callable, - enable_kv_cache_events: bool): + use_eagle: bool, enable_kv_cache_events: bool): super().__init__(kv_cache_config, max_model_len, use_eagle, False, - caching_hash_fn, enable_kv_cache_events) + enable_kv_cache_events) self.num_single_type_manager = len(self.single_type_managers) def get_num_common_prefix_blocks(self, request_id: str, @@ -213,10 +208,9 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator): def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int, use_eagle: bool, enable_caching: bool, - caching_hash_fn: Callable, enable_kv_cache_events: bool): + enable_kv_cache_events: bool): super().__init__(kv_cache_config, max_model_len, use_eagle, - enable_caching, caching_hash_fn, - enable_kv_cache_events) + enable_caching, enable_kv_cache_events) self.kv_cache_spec = self.kv_cache_config.kv_cache_groups[ 0].kv_cache_spec self.block_size = self.kv_cache_spec.block_size @@ -250,10 +244,9 @@ class HybridKVCacheCoordinator(KVCacheCoordinator): def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int, use_eagle: bool, enable_caching: bool, - caching_hash_fn: Callable, enable_kv_cache_events: bool): + enable_kv_cache_events: bool): super().__init__(kv_cache_config, max_model_len, use_eagle, - enable_caching, caching_hash_fn, - enable_kv_cache_events) + enable_caching, enable_kv_cache_events) self.verify_and_split_kv_cache_groups() def verify_and_split_kv_cache_groups(self) -> None: @@ -386,17 +379,15 @@ def find_longest_cache_hit( def get_kv_cache_coordinator( kv_cache_config: KVCacheConfig, max_model_len: int, use_eagle: bool, - enable_caching: bool, caching_hash_fn: Callable, + enable_caching: bool, enable_kv_cache_events: bool) -> KVCacheCoordinator: if not enable_caching: return KVCacheCoordinatorNoPrefixCache(kv_cache_config, max_model_len, - use_eagle, caching_hash_fn, + use_eagle, enable_kv_cache_events) if len(kv_cache_config.kv_cache_groups) == 1: return UnitaryKVCacheCoordinator(kv_cache_config, max_model_len, use_eagle, enable_caching, - caching_hash_fn, enable_kv_cache_events) return HybridKVCacheCoordinator(kv_cache_config, max_model_len, use_eagle, - enable_caching, caching_hash_fn, - enable_kv_cache_events) + enable_caching, enable_kv_cache_events) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index ce333dbe61a1..bfaa7ab08f5c 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -1,16 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections import defaultdict from dataclasses import dataclass from typing import Optional from vllm.distributed.kv_events import KVCacheEvent from vllm.logger import init_logger -from vllm.utils import sha256, sha256_cbor_64bit from vllm.v1.core.kv_cache_coordinator import get_kv_cache_coordinator -from vllm.v1.core.kv_cache_utils import (BlockHash, KVCacheBlock, - hash_request_tokens, init_none_hash) +from vllm.v1.core.kv_cache_utils import KVCacheBlock from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.metrics.stats import PrefixCacheStats from 
vllm.v1.request import Request, RequestStatus @@ -71,23 +68,13 @@ def __init__( kv_cache_config: KVCacheConfig, max_model_len: int, enable_caching: bool = True, - caching_hash_algo: str = "builtin", use_eagle: bool = False, log_stats: bool = False, enable_kv_cache_events: bool = False, ) -> None: self.max_model_len = max_model_len - if len(kv_cache_config.kv_cache_groups) == 0: - # Attention free models don't have kv cache, - # thus don't need prefix caching. - enable_caching = False self.enable_caching = enable_caching - - self.caching_hash_fn = ( - sha256_cbor_64bit if caching_hash_algo == "sha256_cbor_64bit" else - sha256 if caching_hash_algo == "sha256" else hash) - init_none_hash(self.caching_hash_fn) self.use_eagle = use_eagle self.log_stats = log_stats # FIXME: make prefix cache stats conditional on log_stats @@ -107,19 +94,12 @@ def __init__( max_model_len=self.max_model_len, use_eagle=self.use_eagle, enable_caching=self.enable_caching, - caching_hash_fn=self.caching_hash_fn, enable_kv_cache_events=enable_kv_cache_events, ) self.num_kv_cache_groups = len(kv_cache_config.kv_cache_groups) self.block_pool = self.coordinator.block_pool self.kv_cache_config = kv_cache_config - # Mapping from request ID to kv block hashes. - # This is to avoid recomputing the block hashes for each call of - # `get_computed_blocks` or `allocate_slots`. - self.req_to_block_hashes: defaultdict[ - str, list[BlockHash]] = defaultdict(list) - @property def usage(self) -> float: """Get the KV cache usage. @@ -161,15 +141,6 @@ def get_computed_blocks(self, and request.sampling_params.prompt_logprobs is not None)): return self.create_empty_block_list(), 0 - # The block hashes for the request may already be computed - # if the scheduler has tried to schedule the request before. - block_hashes = self.req_to_block_hashes[request.request_id] - if not block_hashes: - assert self.block_size is not None - block_hashes = hash_request_tokens(self.caching_hash_fn, - self.block_size, request) - self.req_to_block_hashes[request.request_id] = block_hashes - # NOTE: When all tokens hit the cache, we must recompute the last token # to obtain logits. Thus, set max_cache_hit_length to prompt_length - 1. # This can trigger recomputation of an entire block, rather than just @@ -178,7 +149,7 @@ def get_computed_blocks(self, # could slightly improve performance in the future. max_cache_hit_length = request.num_tokens - 1 computed_blocks, num_new_computed_tokens = ( - self.coordinator.find_longest_cache_hit(block_hashes, + self.coordinator.find_longest_cache_hit(request.block_hashes, max_cache_hit_length)) if self.log_stats: @@ -296,11 +267,7 @@ def allocate_slots( # at `request.num_tokens`, ensuring only "finalized" tokens are cached. num_tokens_to_cache = min(num_computed_tokens + num_new_tokens, request.num_tokens) - self.coordinator.cache_blocks( - request, - self.req_to_block_hashes[request.request_id], - num_tokens_to_cache, - ) + self.coordinator.cache_blocks(request, num_tokens_to_cache) return KVCacheBlocks(new_blocks) @@ -373,14 +340,6 @@ def get_num_common_prefix_blocks( return self.coordinator.get_num_common_prefix_blocks( request.request_id, num_running_requests) - def free_block_hashes(self, request: Request) -> None: - """Discard the block hashes for the request. - - NOTE: Unlike `free`, this method should be called only when the request - is finished, not when it is preempted. 
- """ - self.req_to_block_hashes.pop(request.request_id, None) - def take_events(self) -> list[KVCacheEvent]: """Take the KV cache events from the block pool. @@ -397,9 +356,7 @@ def get_block_ids(self, request_id: str) -> tuple[list[int], ...]: def cache_blocks(self, request: Request, num_computed_tokens: int) -> None: """Cache the blocks for the request, if enabled.""" if self.enable_caching: - block_hashes = self.req_to_block_hashes[request.request_id] - self.coordinator.cache_blocks(request, block_hashes, - num_computed_tokens) + self.coordinator.cache_blocks(request, num_computed_tokens) def create_empty_block_list(self) -> KVCacheBlocks: """Creates a new KVCacheBlocks instance with no blocks.""" diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 626aa35a770c..6a62c55fb2d5 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -547,41 +547,61 @@ def hash_block_tokens( curr_block_token_ids_tuple, extra_keys) -def hash_request_tokens(hash_function: Any, block_size: int, - request: Request) -> list[BlockHash]: - """Computes hash values of a chain of blocks given a sequence of - token IDs. The hash value is used for prefix caching. +def get_request_block_hasher( + block_size: int, + caching_hash_fn: Callable[[Any], + int]) -> Callable[[Request], list[BlockHash]]: + """ + Returns a function which computes the list of un-computed block hashes + of a request. + + Each request holds a list of its block hashes (request.block_hashes). + When a request is created, it calls the below function to compute + the hashes of all full blocks of the request's initial tokens. + The hashes are then stored in request.block_hashes. + Later, whenever new tokens are appended to the request, it calls + the below function again to compute any new full blocks of tokens. + The returned new hashes are appended to request.block_hashes. + """ - Args: - block_size: The size of each block. - request: The request object. + def request_block_hasher(request: Request) -> list[BlockHash]: + start_token_idx = len(request.block_hashes) * block_size + num_tokens = request.num_tokens + + curr_mm_idx = 0 + if start_token_idx > 0: + # Set curr_mm_idx = -1 to indicate the last mm input. + # Note that since we reach to this branch only when the block is + # completed with generated tokens, we only need to consider the + # last mm input. + curr_mm_idx = -1 + + prev_block_hash_value = request.block_hashes[-1].hash_value \ + if request.block_hashes else None + new_block_hashes: list[BlockHash] = [] + while True: + end_token_idx = start_token_idx + block_size + if end_token_idx > num_tokens: + # We only hash full blocks + break - Returns: - The list of computed hash values. - """ - token_ids = request.all_token_ids + # MM and LoRA requests need extra keys for block-hash computation. 
+ extra_keys, curr_mm_idx = generate_block_hash_extra_keys( + request, start_token_idx, end_token_idx, curr_mm_idx) - req_need_extra_keys = need_extra_keys(request) - req_extra_keys = None - curr_mm_idx = 0 + # Compute the hash of the current block + block_tokens = request.all_token_ids[start_token_idx:end_token_idx] + block_hash = hash_block_tokens(caching_hash_fn, + prev_block_hash_value, block_tokens, + extra_keys) - ret = [] - parent_block_hash_value = None - # Only full blocks will be hashed - for start in range(0, len(token_ids) - block_size + 1, block_size): - end = start + block_size - block_token_ids = token_ids[start:end] + new_block_hashes.append(block_hash) + start_token_idx += block_size + prev_block_hash_value = block_hash.hash_value - if req_need_extra_keys: - # MM and LoRA requests need extra keys for block-hash computation. - req_extra_keys, curr_mm_idx = generate_block_hash_extra_keys( - request, start, end, curr_mm_idx) - - block_hash = hash_block_tokens(hash_function, parent_block_hash_value, - block_token_ids, req_extra_keys) - ret.append(block_hash) - parent_block_hash_value = block_hash.hash_value - return ret + return new_block_hashes + + return request_block_hasher def max_memory_usage_bytes(vllm_config: VllmConfig, diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index dcb9f4dd36f5..981023409045 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -155,7 +155,6 @@ def __init__( kv_cache_config=kv_cache_config, max_model_len=self.max_model_len, enable_caching=self.cache_config.enable_prefix_caching, - caching_hash_algo=self.cache_config.prefix_caching_hash_algo, use_eagle=self.use_eagle, log_stats=self.log_stats, enable_kv_cache_events=self.enable_kv_cache_events, @@ -1036,7 +1035,6 @@ def _free_request(self, request: Request) -> Optional[dict[str, Any]]: def _free_blocks(self, request: Request): assert request.is_finished() self.kv_cache_manager.free(request) - self.kv_cache_manager.free_block_hashes(request) del self.requests[request.request_id] def get_num_unfinished_requests(self) -> int: diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index 8f310023a8cd..82e0292522b9 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -3,7 +3,6 @@ import itertools from abc import ABC, abstractmethod from collections import defaultdict -from typing import Callable from vllm.utils import cdiv from vllm.v1.core.block_pool import BlockPool @@ -25,7 +24,6 @@ def __init__( kv_cache_spec: KVCacheSpec, block_pool: BlockPool, kv_cache_group_id: int, - caching_hash_fn: Callable, ) -> None: """ Initializes the SingleTypeKVCacheManager. @@ -33,7 +31,6 @@ def __init__( kv_cache_spec: The kv_cache_spec for this manager. block_pool: The block pool. kv_cache_group_id: The id of the kv cache group of this manager. - caching_hash_fn: The caching hash function. """ self.block_size = kv_cache_spec.block_size @@ -52,7 +49,6 @@ def __init__( # data for reempted ones. 
self.num_cached_block: dict[str, int] = {} - self.caching_hash_fn = caching_hash_fn self.kv_cache_group_id = kv_cache_group_id self._null_block = block_pool.null_block @@ -130,14 +126,12 @@ def allocate_new_blocks(self, request_id: str, req_blocks.extend(new_blocks) return new_blocks - def cache_blocks(self, request: Request, block_hashes: list[BlockHash], - num_tokens: int) -> None: + def cache_blocks(self, request: Request, num_tokens: int) -> None: """ Cache the blocks for the request. Args: request: The request. - block_hashes: The block hashes of the request. num_tokens: The total number of tokens that need to be cached (including tokens that are already cached). """ @@ -147,12 +141,10 @@ def cache_blocks(self, request: Request, block_hashes: list[BlockHash], self.block_pool.cache_full_blocks( request=request, blocks=self.req_to_blocks[request.request_id], - block_hashes=block_hashes, num_cached_blocks=num_cached_blocks, num_full_blocks=num_full_blocks, block_size=self.block_size, kv_cache_group_id=self.kv_cache_group_id, - hash_fn=self.caching_hash_fn, ) self.num_cached_block[request.request_id] = num_full_blocks diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index ed426f8ff452..1e52f93a581b 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -25,9 +25,11 @@ from vllm.tasks import POOLING_TASKS, SupportedTask from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) -from vllm.utils import (decorate_logs, make_zmq_socket, +from vllm.utils import (decorate_logs, get_hash_fn_by_name, make_zmq_socket, resolve_obj_by_qualname, set_process_title) -from vllm.v1.core.kv_cache_utils import (get_kv_cache_config, +from vllm.v1.core.kv_cache_utils import (BlockHash, get_kv_cache_config, + get_request_block_hasher, + init_none_hash, unify_kv_cache_configs) from vllm.v1.core.sched.interface import SchedulerInterface from vllm.v1.core.sched.output import SchedulerOutput @@ -140,6 +142,19 @@ def __init__(self, self.batch_queue_size) self.batch_queue = queue.Queue(self.batch_queue_size) + self.request_block_hasher: Optional[Callable[[Request], + list[BlockHash]]] = None + if (self.vllm_config.cache_config.enable_prefix_caching + or self.scheduler.get_kv_connector() is not None): + + block_size = vllm_config.cache_config.block_size + caching_hash_fn = get_hash_fn_by_name( + vllm_config.cache_config.prefix_caching_hash_algo) + init_none_hash(caching_hash_fn) + + self.request_block_hasher = get_request_block_hasher( + block_size, caching_hash_fn) + def _initialize_kv_caches( self, vllm_config: VllmConfig) -> tuple[int, int, KVCacheConfig]: start = time.time() @@ -417,7 +432,8 @@ def preprocess_add_request( request.mm_kwargs = self.mm_input_cache_server.get_and_update( request.mm_kwargs, request.mm_hashes) - req = Request.from_engine_core_request(request) + req = Request.from_engine_core_request(request, + self.request_block_hasher) if req.use_structured_output: # Note on thread safety: no race condition. # `grammar_init` is only invoked in input processing thread. 
For diff --git a/vllm/v1/request.py b/vllm/v1/request.py index d1f1c7f98755..562925bde669 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -3,7 +3,8 @@ import enum import time -from typing import TYPE_CHECKING, Any, Optional, Union +from functools import partial +from typing import TYPE_CHECKING, Any, Callable, Optional, Union from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange from vllm.pooling_params import PoolingParams @@ -16,6 +17,7 @@ if TYPE_CHECKING: from vllm.lora.request import LoRARequest + from vllm.v1.core.kv_cache_utils import BlockHash class Request: @@ -36,6 +38,8 @@ def __init__( structured_output_request: Optional["StructuredOutputRequest"] = None, cache_salt: Optional[str] = None, priority: int = 0, + block_hasher: Optional[Callable[["Request"], + list["BlockHash"]]] = None, ) -> None: self.request_id = request_id self.client_index = client_index @@ -108,8 +112,18 @@ def __init__( # indicates that the output is corrupted self.num_nans_in_logits = 0 + self.block_hashes: list[BlockHash] = [] + self.get_hash_new_full_blocks: Optional[Callable[ + [], list[BlockHash]]] = None + if block_hasher is not None: + self.get_hash_new_full_blocks = partial(block_hasher, self) + self.block_hashes = self.get_hash_new_full_blocks() + @classmethod - def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": + def from_engine_core_request( + cls, request: EngineCoreRequest, + block_hasher: Optional[Callable[["Request"], list["BlockHash"]]] + ) -> "Request": if request.mm_kwargs is not None: assert is_list_of(request.mm_kwargs, MultiModalKwargsItem), ( "mm_kwargs was not updated in EngineCore.add_request") @@ -131,6 +145,7 @@ def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": if request.sampling_params else None, cache_salt=request.cache_salt, priority=request.priority, + block_hasher=block_hasher, ) def append_output_token_ids( @@ -144,6 +159,9 @@ def append_output_token_ids( self._output_token_ids.extend(token_ids) self._all_token_ids.extend(token_ids) + if self.get_hash_new_full_blocks is not None: + self.block_hashes.extend(self.get_hash_new_full_blocks()) + @property def is_output_corrupted(self) -> bool: return self.num_nans_in_logits > 0 From 465478196b797e77822864941bbd95ae324ec5b3 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Fri, 15 Aug 2025 16:54:10 -0700 Subject: [PATCH 069/231] Support multiple attention groups for KV sharing (#22672) Signed-off-by: Yong Hoon Shin Signed-off-by: Duncan Moss --- tests/v1/test_kv_sharing.py | 189 ++++++++++++++++++++++++++++++++++++ vllm/v1/worker/utils.py | 40 +++++--- 2 files changed, 213 insertions(+), 16 deletions(-) create mode 100644 tests/v1/test_kv_sharing.py diff --git a/tests/v1/test_kv_sharing.py b/tests/v1/test_kv_sharing.py new file mode 100644 index 000000000000..6b01b7d3e1d6 --- /dev/null +++ b/tests/v1/test_kv_sharing.py @@ -0,0 +1,189 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from unittest.mock import Mock + +import torch + +from vllm.v1.attention.backends.flash_attn import ( + FlashAttentionBackend, FlashAttentionMetadataBuilder) +from vllm.v1.attention.backends.flex_attention import ( + FlexAttentionBackend, FlexAttentionMetadataBuilder) +from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec +from vllm.v1.worker.utils import (AttentionGroup, + initialize_kv_cache_for_kv_sharing) + + +def new_kv_cache_spec(): + 
return FullAttentionSpec(16, 1, 1, torch.float32, False) + + +def test_initialize_kv_cache_for_kv_sharing_different_attn_groups(): + """ + Test initializing KV cache sharing with different attention groups. + Layers in the same KV cache group might be placed in different attn groups + if they have different attention backends. + """ + shared_kv_cache_layers = { + "model.layers.2": "model.layers.0", + "model.layers.3": "model.layers.1", + } + + # Layers 0 and 1 both belong in KV cache group 0 + # However, if they have have different attention backends, they will be + # placed in different attention groups for KV cache group 0 + kv_cache_groups = [ + KVCacheGroupSpec(["model.layers.0", "model.layers.1"], + new_kv_cache_spec()), + ] + + attn_groups = [ + # KV cache group 0 has two attention groups + [ + AttentionGroup( + backend=FlashAttentionBackend, + metadata_builder=Mock(spec=FlashAttentionMetadataBuilder), + layer_names=["model.layers.0"], + ), + AttentionGroup( + backend=FlexAttentionBackend, + metadata_builder=Mock(spec=FlexAttentionMetadataBuilder), + layer_names=["model.layers.1"], + ), + ], + ] + + # Only layers 0 and 1 will have KV caches allocated + kv_caches = { + "model.layers.0": torch.zeros(1, 2, 3), + "model.layers.1": torch.ones(1, 2, 3), + } + + initialize_kv_cache_for_kv_sharing( + shared_kv_cache_layers=shared_kv_cache_layers, + kv_cache_groups=kv_cache_groups, + kv_caches=kv_caches, + attn_groups=attn_groups, + ) + + # Check that the KV caches were shared correctly + assert kv_caches["model.layers.2"].data_ptr( + ) == kv_caches["model.layers.0"].data_ptr() + assert kv_caches["model.layers.3"].data_ptr( + ) == kv_caches["model.layers.1"].data_ptr() + + # Check that the layers were added to the correct KV cache group + assert len(kv_cache_groups) == 1 + assert kv_cache_groups[0].layer_names == [ + "model.layers.0", "model.layers.1", "model.layers.2", "model.layers.3" + ] + + # Check that the layers were added to the attention groups + assert len(attn_groups) == 1 and len(attn_groups[0]) == 2 + assert attn_groups[0][0].layer_names == [ + "model.layers.0", "model.layers.2" + ] + assert attn_groups[0][1].layer_names == [ + "model.layers.1", "model.layers.3" + ] + + +def test_initialize_kv_cache_for_kv_sharing_same_attn_groups(): + """ + Test case assuming that all layers in the same KV cache group have the same + attention backends. This is true for most models. 
+ """ + shared_kv_cache_layers = { + "model.layers.2": "model.layers.0", + "model.layers.3": "model.layers.1", + } + + kv_cache_groups = [ + KVCacheGroupSpec(["model.layers.0", "model.layers.1"], + new_kv_cache_spec()), + ] + + attn_groups = [ + # KV cache group 0 has a single attention group + # as all layers have the same flash attention backend + [ + AttentionGroup( + backend=FlashAttentionBackend, + metadata_builder=Mock(spec=FlashAttentionMetadataBuilder), + layer_names=["model.layers.0", "model.layers.1"], + ), + ], + ] + + kv_caches = { + "model.layers.0": torch.zeros(1, 2, 3), + "model.layers.1": torch.ones(1, 2, 3), + } + + initialize_kv_cache_for_kv_sharing( + shared_kv_cache_layers=shared_kv_cache_layers, + kv_cache_groups=kv_cache_groups, + kv_caches=kv_caches, + attn_groups=attn_groups, + ) + + # Check that the KV caches were shared correctly + assert kv_caches["model.layers.2"].data_ptr( + ) == kv_caches["model.layers.0"].data_ptr() + assert kv_caches["model.layers.3"].data_ptr( + ) == kv_caches["model.layers.1"].data_ptr() + + # Check that the layers were added to the correct KV cache group + assert len(kv_cache_groups) == 1 + assert kv_cache_groups[0].layer_names == [ + "model.layers.0", "model.layers.1", "model.layers.2", "model.layers.3" + ] + + # Check that the layers were added to the attention groups + assert len(attn_groups) == 1 and len(attn_groups[0]) == 1 + assert attn_groups[0][0].layer_names == [ + "model.layers.0", "model.layers.1", "model.layers.2", "model.layers.3" + ] + + +def test_initialize_kv_cache_for_kv_sharing_no_attn_groups(): + """ + Test KV sharing set up when no attention groups are provided. + This is the case for the TPU model runner, which doesn't have + support for attention groups yet. + """ + shared_kv_cache_layers = { + "model.layers.2": "model.layers.0", + "model.layers.3": "model.layers.1", + } + + kv_cache_groups = [ + KVCacheGroupSpec(["model.layers.0"], new_kv_cache_spec()), + KVCacheGroupSpec(["model.layers.1"], new_kv_cache_spec()), + ] + + kv_caches = { + "model.layers.0": torch.zeros(1, 2, 3), + "model.layers.1": torch.ones(1, 2, 3), + } + + initialize_kv_cache_for_kv_sharing( + shared_kv_cache_layers=shared_kv_cache_layers, + kv_cache_groups=kv_cache_groups, + kv_caches=kv_caches, + ) + + # Check that the KV caches were shared correctly + assert kv_caches["model.layers.2"].data_ptr( + ) == kv_caches["model.layers.0"].data_ptr() + assert kv_caches["model.layers.3"].data_ptr( + ) == kv_caches["model.layers.1"].data_ptr() + + # Check that the layers were added to the correct KV cache group + assert len(kv_cache_groups) == 2 + assert kv_cache_groups[0].layer_names == [ + "model.layers.0", "model.layers.2" + ] + assert kv_cache_groups[1].layer_names == [ + "model.layers.1", "model.layers.3" + ] diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index e7079235d651..b138f11af1eb 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -225,26 +225,34 @@ def initialize_kv_cache_for_kv_sharing( Note that layers in shared_kv_cache_layers.keys() are not originally included as it only contains layers which have its own KV cache allocation. + attn_groups: Optional list of attention groups. Layers in the same KV + cache group may be placed in different attention groups if they + have different attention backends. Currently only provided by + GPU model runner. """ - # Record index of KV cache group for each layer that allocates a KV cache. 
- layer_to_kv_cache_group_idx: dict[str, int] = {} - for i, kv_cache_group in enumerate(kv_cache_groups): - for layer_name in kv_cache_group.layer_names: - layer_to_kv_cache_group_idx[layer_name] = i + # mapping from layer name to tuple of (kv_cache_group_idx, attn_group_idx) + layer_to_attn_group_idx: dict[str, tuple[int, int]] = {} + if attn_groups: + for kv_cache_group_idx, kv_attn_groups in enumerate(attn_groups): + for attn_group_idx, attn_group in enumerate(kv_attn_groups): + for layer_name in attn_group.layer_names: + layer_to_attn_group_idx[layer_name] = (kv_cache_group_idx, + attn_group_idx) + else: + for kv_cache_group_idx, kv_cache_group in enumerate(kv_cache_groups): + for layer_name in kv_cache_group.layer_names: + # attn group idx default to 0 if not provided + layer_to_attn_group_idx[layer_name] = (kv_cache_group_idx, 0) for layer_name, target_layer_name in shared_kv_cache_layers.items(): kv_caches[layer_name] = kv_caches[target_layer_name] - group_idx = layer_to_kv_cache_group_idx[target_layer_name] - kv_cache_groups[group_idx].layer_names.append(layer_name) - - if attn_groups is not None: - assert len(attn_groups[group_idx]) == 1, ( - "Only one attention group per KV cache group is supported " - "for KV-cache sharing for now.") - # TODO(lucas): I think in the future the layers that re-use a - # KV cache will be in a different attention group so we can - # remove this code from here. - attn_groups[group_idx][0].layer_names.append(layer_name) + kv_cache_group_idx = layer_to_attn_group_idx[target_layer_name][0] + kv_cache_groups[kv_cache_group_idx].layer_names.append(layer_name) + + if attn_groups: + attn_group_idx = layer_to_attn_group_idx[target_layer_name][1] + attn_groups[kv_cache_group_idx][attn_group_idx].layer_names.append( + layer_name) def bind_kv_cache( From e67c5045beba70f102f587dc59278449369fc614 Mon Sep 17 00:00:00 2001 From: Yichen Yan Date: Sat, 16 Aug 2025 07:56:17 +0800 Subject: [PATCH 070/231] [BugFix] Make `run_once` thread-safe (#22978) Signed-off-by: Signed-off-by: Yichen Yan Signed-off-by: Duncan Moss --- vllm/utils/__init__.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 72857ee2abc7..40f41893abb6 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1640,15 +1640,19 @@ def weak_bound(*args, **kwargs) -> None: return weak_bound -# From: https://stackoverflow.com/a/4104188/2749989 def run_once(f: Callable[P, None]) -> Callable[P, None]: def wrapper(*args: P.args, **kwargs: P.kwargs) -> None: - if not wrapper.has_run: # type: ignore[attr-defined] - wrapper.has_run = True # type: ignore[attr-defined] - return f(*args, **kwargs) + if wrapper.has_run: # type: ignore[attr-defined] + return + + with wrapper.lock: # type: ignore[attr-defined] + if not wrapper.has_run: # type: ignore[attr-defined] + wrapper.has_run = True # type: ignore[attr-defined] + return f(*args, **kwargs) wrapper.has_run = False # type: ignore[attr-defined] + wrapper.lock = threading.Lock() # type: ignore[attr-defined] return wrapper From f2a8c6f27f5c4b2fc58bb25bd08373bc51e003e0 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 15 Aug 2025 17:00:36 -0700 Subject: [PATCH 071/231] [Misc] Support passing multiple request ids at once to `AsyncLLM.abort()` (#22944) Signed-off-by: Nick Hill Signed-off-by: Duncan Moss --- tests/v1/engine/test_async_llm.py | 77 ++++++++++++++++++++++++++- vllm/engine/async_llm_engine.py | 5 +- vllm/engine/multiprocessing/client.py | 10 ++-- 
vllm/engine/protocol.py | 7 +-- vllm/utils/__init__.py | 5 ++ vllm/v1/engine/async_llm.py | 15 +++--- 6 files changed, 105 insertions(+), 14 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 484640233f52..df04a14af70c 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -212,6 +212,79 @@ async def test_abort( assert not engine.output_processor.has_unfinished_requests() +@pytest.mark.parametrize( + "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) +@pytest.mark.asyncio +async def test_multi_abort( + monkeypatch: pytest.MonkeyPatch, + output_kind: RequestOutputKind, +): + + with monkeypatch.context() as m, ExitStack() as after: + m.setenv("VLLM_USE_V1", "1") + + with set_default_torch_num_threads(1): + engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) + after.callback(engine.shutdown) + + NUM_REQUESTS = 50 + NUM_EXPECTED_TOKENS = 100 + NUM_EXPECTED_TOKENS_LONG = 50000 + REQUEST_IDS_TO_ABORT = [5, 10, 15, 20, 25] + PARALLEL_SAMPLE_REQ_IDS = [5, 15, 30, 35] + + request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)] + + # Create concurrent requests. + tasks: list[asyncio.Task] = [] + for idx, request_id in enumerate(request_ids): + max_tokens = (NUM_EXPECTED_TOKENS_LONG if + (idx + in REQUEST_IDS_TO_ABORT) else NUM_EXPECTED_TOKENS) + n = 3 if idx in PARALLEL_SAMPLE_REQ_IDS else 1 + tasks.append( + asyncio.create_task( + generate(engine, request_id, TEXT_PROMPT, output_kind, + max_tokens, n))) + + # Let requests start + await asyncio.sleep(0.5) + + # Use multi-abort to abort multiple requests at once + abort_request_ids = [request_ids[i] for i in REQUEST_IDS_TO_ABORT] + await engine.abort(abort_request_ids) + + # Wait for all tasks to complete + results = await asyncio.gather(*tasks, return_exceptions=True) + + # Verify results + for idx, result in enumerate(results): + if idx in REQUEST_IDS_TO_ABORT: + # Aborted requests should return partial results + assert isinstance( + result, tuple + ), f"Request {idx} should have completed with partial results" + num_generated_tokens, request_id = result + # Should have generated some tokens before abort + assert num_generated_tokens > 0, ( + f"Aborted request " + f"{request_id} should have generated some tokens") + else: + # Non-aborted requests should complete normally + assert isinstance( + result, + tuple), f"Request {idx} should have completed successfully" + num_generated_tokens, request_id = result + n = 3 if idx in PARALLEL_SAMPLE_REQ_IDS else 1 + expected_tokens = NUM_EXPECTED_TOKENS * n + assert num_generated_tokens == expected_tokens, ( + f"{request_id} generated {num_generated_tokens} but " + f"expected {expected_tokens}") + + # Make sure all aborted requests were cleaned up + assert not engine.output_processor.has_unfinished_requests() + + @pytest.mark.parametrize("n", [1, 3]) @pytest.mark.parametrize( "engine_args,prompt", @@ -460,7 +533,9 @@ async def test_abort_final_output( token_count = sum( len(output.outputs[0].token_ids) for output in outputs) assert token_count > 0 - assert len(final_output.outputs[0].token_ids) == 0 + # This would ordinarily be 0, but could end up > 0 if the + # final abort is coalesced with another chunk in the output queue. 
+ assert len(final_output.outputs[0].token_ids) >= 0 else: # For FINAL_ONLY, we should only get the final output assert len(outputs) == 0 diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 73726eeab5fc..84ad2299b065 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -998,7 +998,7 @@ async def encode( await self.abort(request_id) raise - async def abort(self, request_id: str) -> None: + async def abort(self, request_id: Union[str, Iterable[str]]) -> None: """Abort a request. Abort a submitted request. If the request is finished or not found, @@ -1007,6 +1007,9 @@ async def abort(self, request_id: str) -> None: Args: request_id: The unique id of the request. """ + if not isinstance(request_id, str): + raise RuntimeError("Only single-request abort supported in" + " deprecated V0") if not self.is_running: raise AsyncEngineDeadError( "Background loop is not running. If it was running, " diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index f69f72edf6a5..eca29af50055 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -5,8 +5,8 @@ import copy import pickle from contextlib import contextmanager, suppress -from typing import (Any, AsyncGenerator, Dict, Iterator, List, Mapping, - Optional, Union, cast) +from typing import (Any, AsyncGenerator, Dict, Iterable, Iterator, List, + Mapping, Optional, Union, cast) import cloudpickle import psutil @@ -404,9 +404,13 @@ async def _wait_for_server_rpc(self, socket: Socket) -> RPCStartupResponse: error_message="Unable to start RPC Server", socket=socket) - async def abort(self, request_id: str): + async def abort(self, request_id: Union[str, Iterable[str]]): """Send an ABORT_REQUEST signal to the RPC Server""" + if not isinstance(request_id, str): + raise RuntimeError("Only single-request abort supported in" + " deprecated V0") + with suppress(MQClientClosedError): await self._send_one_way_rpc_request( request=RPCAbortRequest(request_id), socket=self.input_socket) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 671e9648a3d0..c610fb5eae60 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -3,7 +3,7 @@ import asyncio from abc import ABC, abstractmethod -from typing import AsyncGenerator, Mapping, Optional +from typing import AsyncGenerator, Iterable, Mapping, Optional, Union from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function from vllm.config import DecodingConfig, ModelConfig, VllmConfig @@ -229,11 +229,12 @@ def encode( ... @abstractmethod - async def abort(self, request_id: str) -> None: + async def abort(self, request_id: Union[str, Iterable[str]]) -> None: """Abort a request. Args: - request_id: The unique id of the request. + request_id: The unique id of the request, + or an iterable of such ids. """ ... 
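For reference, a minimal usage sketch of the widened `abort()` signature from V1 client code. This is an illustration only, not part of the patch; it assumes an already-constructed `vllm.v1.engine.async_llm.AsyncLLM` instance with requests in flight.

```python
# Sketch only: cancel several in-flight requests with a single call.
# `engine` is assumed to be an AsyncLLM instance created elsewhere.
async def cancel_requests(engine, request_ids: list[str]) -> None:
    # Any iterable of request ids is now accepted; a bare string still
    # aborts a single request. The deprecated V0 engines keep the
    # single-id-only behaviour and raise RuntimeError for iterables.
    await engine.abort(request_ids)
```
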
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 40f41893abb6..64f7426bd65d 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1315,6 +1315,11 @@ def common_broadcastable_dtype(dtypes: Collection[torch.dtype]): ) +def as_list(maybe_list: Iterable[T]) -> list[T]: + """Convert iterable to list, unless it's already a list.""" + return maybe_list if isinstance(maybe_list, list) else list(maybe_list) + + # `collections` helpers def is_list_of( value: object, diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index edc2e235c3c3..664fec31a4da 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import time -from collections.abc import AsyncGenerator, Mapping +from collections.abc import AsyncGenerator, Iterable, Mapping from copy import copy from typing import Any, Optional, Union @@ -27,7 +27,8 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.utils import Device, cancel_task_threadsafe, cdiv, deprecate_kwargs +from vllm.utils import (Device, as_list, cancel_task_threadsafe, cdiv, + deprecate_kwargs) from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError @@ -431,14 +432,16 @@ async def output_handler(): self.output_handler = asyncio.create_task(output_handler()) - async def abort(self, request_id: str) -> None: + async def abort(self, request_id: Union[str, Iterable[str]]) -> None: """Abort RequestId in OutputProcessor and EngineCore.""" - request_ids = self.output_processor.abort_requests((request_id, )) - await self.engine_core.abort_requests_async(request_ids) + request_ids = (request_id, ) if isinstance( + request_id, str) else as_list(request_id) + all_request_ids = self.output_processor.abort_requests(request_ids) + await self.engine_core.abort_requests_async(all_request_ids) if self.log_requests: - logger.info("Aborted request %s.", request_id) + logger.info("Aborted request(s) %s.", ",".join(request_ids)) async def encode( self, From f1e219cecd7262aa8771923ef2aba2fa51384f5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Sat, 16 Aug 2025 02:14:08 +0200 Subject: [PATCH 072/231] [Kernel] Simplify `get_kv_cache_layout` and cache `use_trtllm_attention` env-dependent bit (#22735) Signed-off-by: NickLucche Signed-off-by: Duncan Moss --- vllm/utils/flashinfer.py | 46 +++++++++++++++++++---------- vllm/v1/attention/backends/utils.py | 18 ++++++----- 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 0d7d4b694f07..2e31b7bad747 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -148,6 +148,31 @@ def has_nvidia_artifactory() -> bool: return False +@functools.cache +def supports_trtllm_attention() -> tuple[bool, Optional[str]]: + """Cache result which only depends on the environment""" + # This is a lambda, call it once + env_value = envs.VLLM_USE_TRTLLM_ATTENTION + + # Requires SM100 and NVIDIA artifactory to be accessible to download cubins + if not (current_platform.is_device_capability(100) + and has_nvidia_artifactory()): + return False, env_value + + if env_value is not None: + logger.info_once("VLLM_USE_TRTLLM_ATTENTION is set to %s", 
env_value) + # Environment variable is set - respect it + # Making the conditional check for zero because + # the path is automatically enabled if the batch size condition + # is satisfied. + use_trtllm = (env_value == "1") + if use_trtllm: + logger.info_once("Using TRTLLM attention.") + return use_trtllm, env_value + + return True, None + + def use_trtllm_attention( num_tokens: int, max_seq_len: int, @@ -157,9 +182,8 @@ def use_trtllm_attention( attn_head_size: Optional[int], has_sinks: bool = False, ) -> bool: - # Requires SM100 and NVIDIA artifactory to be accessible to download cubins - if not (current_platform.is_device_capability(100) - and has_nvidia_artifactory()): + use_trtllm, env_value = supports_trtllm_attention() + if not use_trtllm: return False # Check if the dimensions are supported by TRTLLM decode attention @@ -174,18 +198,7 @@ def use_trtllm_attention( "Using TRTLLM attention (required for attention sinks).") return True - env_value = envs.VLLM_USE_TRTLLM_ATTENTION - if env_value is not None: - logger.info_once("VLLM_USE_TRTLLM_ATTENTION is set to %s", env_value) - # Environment variable is set - respect it - # Making the conditional check for zero because - # the path is automatically enabled if the batch size condition - # is satisfied. - use_trtllm = (env_value == "1") - if use_trtllm: - logger.info_once("Using TRTLLM attention.") - return use_trtllm - else: + if env_value is None: # Environment variable not set - use auto-detection use_trtllm = (num_tokens <= 256 and max_seq_len < 131072 and kv_cache_dtype == "auto") @@ -193,6 +206,9 @@ def use_trtllm_attention( logger.warning_once("Using TRTLLM attention (auto-detected).") return use_trtllm + # Environment variable is set to 1 - respect it + return True + if has_flashinfer(): diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 1c7d08798964..5e6bc331835b 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -248,19 +248,23 @@ def use_cascade_attention( @functools.lru_cache def get_kv_cache_layout(): + # Format specified by the code. global _KV_CACHE_LAYOUT_OVERRIDE - # Override with format specified by the user. + + if _KV_CACHE_LAYOUT_OVERRIDE is not None: + cache_layout = _KV_CACHE_LAYOUT_OVERRIDE + logger.info_once("`_KV_CACHE_LAYOUT_OVERRIDE` variable detected. " \ + "Setting KV cache layout to %s.", cache_layout) + return cache_layout + + # Format specified by the user. cache_layout = envs.VLLM_KV_CACHE_LAYOUT + # When neither the user nor the override specified a layout, get default if cache_layout is None: - if envs.VLLM_USE_TRTLLM_ATTENTION: - cache_layout = "HND" - else: - cache_layout = get_kv_connector_cache_layout() + cache_layout = get_kv_connector_cache_layout() else: logger.info_once("`VLLM_KV_CACHE_LAYOUT` environment variable " \ "detected. 
Setting KV cache layout to %s.", cache_layout) - if _KV_CACHE_LAYOUT_OVERRIDE is not None: - cache_layout = _KV_CACHE_LAYOUT_OVERRIDE return cache_layout From 98b4d43c893649ea036dae346f0c259d54ad0005 Mon Sep 17 00:00:00 2001 From: Benjamin Chislett Date: Fri, 15 Aug 2025 21:25:06 -0400 Subject: [PATCH 073/231] [Bugfix] Fix DeepSeek MTP (#22934) Signed-off-by: Benjamin Chislett Signed-off-by: Duncan Moss --- vllm/model_executor/models/deepseek_mtp.py | 13 +++++++------ vllm/model_executor/models/glm4_moe_mtp.py | 7 +++---- vllm/model_executor/models/mimo_mtp.py | 7 +++---- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index 2e026d582a6d..0ad001be71c1 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -158,14 +158,13 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, - previous_hidden_states: torch.Tensor, + hidden_states: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, spec_step_idx: int = 0, ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, - previous_hidden_states, inputs_embeds, - spec_step_idx) + hidden_states = self.model(input_ids, positions, hidden_states, + inputs_embeds, spec_step_idx) return hidden_states def compute_logits( @@ -213,13 +212,15 @@ def load_weights(self, weights: Iterable[tuple[str, # for mlp.experts[0].gate_gate_up_proj, which breaks load. if (("mlp.experts." in name) and name not in params_dict): continue - name = name.replace(weight_name, param_name) + name_mapped = name.replace(weight_name, param_name) # QKV fusion is optional, fall back to normal # weight loading if it's not enabled if ((param_name == "fused_qkv_a_proj") - and name not in params_dict): + and name_mapped not in params_dict): continue + else: + name = name_mapped # Skip loading extra bias for GPTQ models. 
if name.endswith(".bias") and name not in params_dict: diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py index 0624640054d1..322c5619c178 100644 --- a/vllm/model_executor/models/glm4_moe_mtp.py +++ b/vllm/model_executor/models/glm4_moe_mtp.py @@ -180,14 +180,13 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, - previous_hidden_states: torch.Tensor, + hidden_states: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, spec_step_idx: int = 0, ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, - previous_hidden_states, inputs_embeds, - spec_step_idx) + hidden_states = self.model(input_ids, positions, hidden_states, + inputs_embeds, spec_step_idx) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/mimo_mtp.py b/vllm/model_executor/models/mimo_mtp.py index 19afc5be3fb8..5a2079bf5121 100644 --- a/vllm/model_executor/models/mimo_mtp.py +++ b/vllm/model_executor/models/mimo_mtp.py @@ -164,15 +164,14 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, - previous_hidden_states: torch.Tensor, + hidden_states: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, spec_step_idx: int = 0, ) -> torch.Tensor: assert spec_step_idx == 0, "mimo_mtp only support predict one token now" - hidden_states = self.model(input_ids, positions, - previous_hidden_states, inputs_embeds, - spec_step_idx) + hidden_states = self.model(input_ids, positions, hidden_states, + inputs_embeds, spec_step_idx) return hidden_states def compute_logits( From a60d0c73f81698a1bbe1655f4969ea48b6c6b6c1 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 15 Aug 2025 19:06:30 -0700 Subject: [PATCH 074/231] [Frontend] Avoid list copies in `serving_chat.py` (#22947) Signed-off-by: Nick Hill Signed-off-by: Duncan Moss --- vllm/entrypoints/openai/serving_chat.py | 29 +++++++++++++------------ vllm/reasoning/abs_reasoning_parsers.py | 2 +- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index b4231c6d10c4..12349234c320 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -50,6 +50,7 @@ from vllm.transformers_utils.tokenizers import (maybe_serialize_tool_calls, truncate_tool_call_ids, validate_request_params) +from vllm.utils import as_list logger = init_logger(__name__) @@ -670,10 +671,10 @@ async def chat_completion_stream_generator( # avoid the None + list error. if previous_token_ids: - current_token_ids = previous_token_ids + list( + current_token_ids = previous_token_ids + as_list( output.token_ids) else: - current_token_ids = list(output.token_ids) + current_token_ids = as_list(output.token_ids) if self.use_harmony: if is_final: @@ -703,11 +704,10 @@ async def chat_completion_stream_generator( # set reasoning status to end. # Only keep 'content', remove 'reasoning_content'. 
if reasoning_parser.is_reasoning_end( - list(output.token_ids)) or \ - (res.prompt_token_ids and - reasoning_parser.is_reasoning_end( - list(res.prompt_token_ids) - )): + as_list(output.token_ids)) or ( + res.prompt_token_ids + and reasoning_parser.is_reasoning_end( + res.prompt_token_ids)): reasoning_end_arr[i] = True if delta_message and delta_message.content: # This need to be added to next `delta_text` @@ -771,6 +771,7 @@ async def chat_completion_stream_generator( assert reasoning_parser is not None assert added_content_delta_arr is not None assert reasoning_end_arr is not None + output_token_ids = as_list(output.token_ids) if not reasoning_end_arr[i]: delta_message = ( reasoning_parser. @@ -780,7 +781,7 @@ async def chat_completion_stream_generator( delta_text, previous_token_ids, current_token_ids, - output.token_ids, + output_token_ids, )) # When encountering think end id in prompt_token_ids # i.e {"enable_thinking": False}, @@ -789,9 +790,9 @@ async def chat_completion_stream_generator( # to 'reasoning_content'. if res.prompt_token_ids and \ reasoning_parser.is_reasoning_end( - list(res.prompt_token_ids)): + res.prompt_token_ids): reasoning_end_arr[i] = True - current_token_ids = list(output.token_ids) + current_token_ids = output_token_ids if delta_message and delta_message.content: current_text = delta_message.content delta_message.content = None @@ -802,11 +803,11 @@ async def chat_completion_stream_generator( # Remove the text and token ids related # to 'reasoning_content'. if reasoning_parser.is_reasoning_end( - list(output.token_ids)): + output_token_ids): reasoning_end_arr[i] = True current_token_ids = \ reasoning_parser.extract_content_ids( - list(output.token_ids)) + output_token_ids) if delta_message and delta_message.content: current_text = delta_message.content delta_message.content = None @@ -815,7 +816,7 @@ async def chat_completion_stream_generator( # handle tool calls only after reasoning is done, else: - delta_token_ids = list(output.token_ids) + delta_token_ids = output_token_ids # First time to tool call, # add the remaining text and token ids # to delta from previous @@ -899,7 +900,7 @@ async def chat_completion_stream_generator( self.request_logger.log_outputs( request_id=request_id, outputs=delta_content, - output_token_ids=list(output.token_ids), + output_token_ids=as_list(output.token_ids), finish_reason=output.finish_reason, is_streaming=True, delta=True, diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index 4f4522d726e8..df9e84163f16 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -44,7 +44,7 @@ def vocab(self) -> dict[str, int]: return self.model_tokenizer.get_vocab() @abstractmethod - def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: + def is_reasoning_end(self, input_ids: list[int]) -> bool: """ Check if the reasoning content ends in the input_ids. 
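The change above relies on the `as_list` helper added earlier in this series (in `vllm/utils/__init__.py`). A short sketch of the behaviour it depends on, assuming a vLLM build that includes that helper: existing lists are returned as-is, while any other iterable is materialized exactly once.

```python
from vllm.utils import as_list

token_ids = [1, 2, 3]
assert as_list(token_ids) is token_ids   # already a list: no copy is made
assert as_list(range(3)) == [0, 1, 2]    # other iterables are converted once
```

This is why the streaming path can call `as_list(output.token_ids)` without paying for a fresh copy whenever `output.token_ids` is already a list.
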
From d854c2ab63f0d09ef862da0b753fce9420158a9e Mon Sep 17 00:00:00 2001 From: Calvin Chen Date: Sat, 16 Aug 2025 10:28:10 +0800 Subject: [PATCH 075/231] [V1] support min_tokens for detokener (#22014) Signed-off-by: calvin chen Co-authored-by: Nick Hill Signed-off-by: Duncan Moss --- tests/detokenizer/test_min_tokens.py | 50 ++++++++++++++++++++++++++++ vllm/v1/engine/detokenizer.py | 11 ++++-- 2 files changed, 58 insertions(+), 3 deletions(-) create mode 100644 tests/detokenizer/test_min_tokens.py diff --git a/tests/detokenizer/test_min_tokens.py b/tests/detokenizer/test_min_tokens.py new file mode 100644 index 000000000000..887e83342536 --- /dev/null +++ b/tests/detokenizer/test_min_tokens.py @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +from transformers import AutoTokenizer + +from vllm import SamplingParams +from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine.detokenizer import FastIncrementalDetokenizer + +PROMPT = "Hello, my name is Lee, and I'm a student in the " + \ + "college of engineering" + + +@pytest.mark.parametrize("min_tokens,stop,truth", [ + (0, None, " is Lee, and I'm a student in the college of engineering"), + (0, "e", " is L"), + (5, "e", " is Lee, and I'm a stud"), +]) +def test_min_tokens_with_stop(min_tokens: int, stop: str, truth: str): + """Test for a specific min_tokens and stop. + + See https://github.com/vllm-project/vllm/pull/22014 + """ + tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m") + all_prompt_ids = tokenizer(PROMPT, add_special_tokens=False).input_ids + + # The prompt is "Hello, my name is" + prompt_token_ids = all_prompt_ids[:4] + params = SamplingParams( + stop=stop, + min_tokens=min_tokens, + ) + request = EngineCoreRequest("", + prompt_token_ids, + None, + None, + None, + params, + None, + None, + 0.0, + None, + cache_salt=None, + data_parallel_rank=None) + + detokenizer = FastIncrementalDetokenizer(tokenizer, request) + + detokenizer.update(all_prompt_ids[4:], False) + assert detokenizer.output_text == truth diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 2f5504ea14b4..04ad51aae0a8 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -74,6 +74,7 @@ def __init__(self, request: EngineCoreRequest): params = request.sampling_params assert params is not None self.stop = stop = params.stop + self.min_tokens = params.min_tokens self.include_stop_str_in_output = params.include_stop_str_in_output # Number of chars to hold back when stop strings are to be excluded @@ -111,10 +112,14 @@ def update(self, new_token_ids: list[int], # 1) Detokenize the new token ids incrementally. # TODO(woosuk): This method becomes very inefficient when the number of # new_token_ids is more than 1. We need to optimize this. - offset_before = len(self.output_text) + stop_check_offset = len(self.output_text) for new_token_id in new_token_ids: self.token_ids.append(new_token_id) self.output_text += self.decode_next(new_token_id) + # Support min_tokens, see https://github.com/vllm-project/vllm/pull/22014 + if self.min_tokens and len( + self.output_token_ids) <= self.min_tokens: + stop_check_offset = len(self.output_text) if stop_terminated: if skipped_stop_token_id is not None: @@ -125,10 +130,10 @@ def update(self, new_token_ids: list[int], # 2) Evaluate stop strings. 
stop_string = None - if self.stop: + if self.stop and len(self.output_token_ids) > self.min_tokens: stop = StopChecker.check_stop_strings( output_text=self.output_text, - new_char_count=len(self.output_text) - offset_before, + new_char_count=len(self.output_text) - stop_check_offset, stop=self.stop, include_in_output=self.include_stop_str_in_output, ) From 5b5d22e365be7816867afab65feee349b6a2d819 Mon Sep 17 00:00:00 2001 From: Grace Ho <146482179+gracehonv@users.noreply.github.com> Date: Fri, 15 Aug 2025 19:52:51 -0700 Subject: [PATCH 076/231] [misc] nsys profile output kernel classifier and visualizer (#22971) Signed-off-by: Grace Ho Signed-off-by: Duncan Moss --- tools/profiler/nsys_profile_tools/README.md | 175 +++++++ .../nsys_profile_tools/gputrc2graph.py | 426 ++++++++++++++++++ .../nsys_profile_tools/images/csv1.png | Bin 0 -> 148416 bytes .../nsys_profile_tools/images/html.png | Bin 0 -> 72163 bytes .../nsys_profile_tools/images/html_tbl.png | Bin 0 -> 36615 bytes 5 files changed, 601 insertions(+) create mode 100644 tools/profiler/nsys_profile_tools/README.md create mode 100755 tools/profiler/nsys_profile_tools/gputrc2graph.py create mode 100644 tools/profiler/nsys_profile_tools/images/csv1.png create mode 100644 tools/profiler/nsys_profile_tools/images/html.png create mode 100644 tools/profiler/nsys_profile_tools/images/html_tbl.png diff --git a/tools/profiler/nsys_profile_tools/README.md b/tools/profiler/nsys_profile_tools/README.md new file mode 100644 index 000000000000..75ae0811cc54 --- /dev/null +++ b/tools/profiler/nsys_profile_tools/README.md @@ -0,0 +1,175 @@ +# gputrc2graph.py + +This script processes NVIDIA Nsight Systems (`nsys`) GPU trace files +(`.nsys-rep`) with -t cuda tracing enabled, and generates kernel-level +summaries and visualizations of GPU and non-GPU time. It is useful for +profiling and analyzing nsys profile output. + +## Usage + +### Command-line Arguments + +- `--in_file` + **(required)** + List of input files and their metadata. Each entry should be in the format: + `,,,` + - `nsys-rep`: Path to the `.nsys-rep` file. + - `engine`: Engine name (e.g., `vllm`). + - `model`: Model name (e.g., `llama`, `gpt-oss`, `ds`). + - `elapsed_nonprofiled_sec`: Wall-clock runtime (in seconds) without + profiling. Specify `0` to use the elapsed time from the nsys-rep file + (this may inflate non-GPU time if actual runtime without profiling is + less). Multiple entries can be provided, separated by spaces. + +- `--out_dir` + Output directory for the generated CSV and HTML files. + If not specified, results are saved in the current directory. + +- `--title` + Title for the HTML chart/visualization. + +- `--nsys_cmd` + Path to the `nsys` command. + Default: `nsys` (assumes it is in your PATH). + Use this if `nsys` is not in your system PATH. + +## Notes + +- Make sure you have pandas installed. +- Make sure nsys is installed, and specify the path to the `nsys` command with + `--nsys_cmd` if it is not in your PATH. +- For more details on available engines and models, see the help string in + the script or run: + +```bash +python3 gputrc2graph.py --help +``` + +## Example 1: analyze a single profile + +To analyze the GPU cycles for say, gpt-oss model with vLLM engine: + +1. Run the following command to collect nsys profile, for vllm serve config. + + ```bash + nsys profile -t cuda -o run1 -f true --trace-fork-before-exec=true \ + --cuda-graph-trace=node --delay --duration \ + vllm serve openai/gpt-oss-120b ... 
+ ``` + + where: + + - DELAY: how many seconds to delay nsys from collecting profiles, needed so + that profiles aren't captured till vllm server has come up and load + generation starts. + - DURATION: how many seconds for nsys profile to run before generating the + profile. This should be > the duration of the run. + +2. Run again, this time without collecting the profile, and get the total run + time in seconds. This value will be used by the script to calculate the + CPU(non-GPU) seconds for the analysis. + +3. Say the run elapsed time is 306 seconds, from step #2. Run script to + analyze: + + ```bash + python3 gputrc2graph.py \ + --in_file run1.nsys-rep,vllm,gpt-oss,306 \ + --title "vLLM-gpt-oss profile" + ``` + +The command will produce 2 files for analysis: + +- result.html: this categorizes kernel names into different categories in a + stacked bar chart. +- result.csv: shows how the kernel names are mapped to the different + categories. + +### HTML visualization with result.html + +The html file shows the number of elapsed seconds due to different GPU +Substages or categories, which consist of moe_gemm (Mixture of Experts GEMM) +kernels the biggest category, at 148 seconds, followed by "attn" or attention +kernels. This lets the user prioritize the kernels to focus on for performance +optimizations. + +![Example GPU Trace Visualization](images/html.png) + +There's also an appended data table underneath the bar chart for copying out to other post-processing tools. + +![Example GPU Trace Table](images/html_tbl.png) + +### Kernel to category mapping with result.csv + +Suppose the user would like to focus on improving triton kernels. It's not the +biggest consumer of cycles at 9.74 sec but perhaps it hasn't been optimized. +The next step is to use the result.csv to dive into what the kernels are which +compose the triton kernel GPU cycles. The following image shows that +triton_poi_fused__to_copy_add_addmm_cat_.. kernel to be the biggest +contributor to GPU cycles. + +![Example GPU Trace csv](images/csv1.png) + +## Example 2: analyze multiple profiles + +Suppose the user has multiple nsys trace files, captured for different models, +say llama and gpt-oss in this case, and wish to compare their GPU/non-GPU +time, something like the following command can be used. + +```bash +python3 gputrc2graph.py \ +--in_file run1.nsys-rep,vllm,llama,100 run2.nsys-rep,vllm,gpt-oss,102 \ +--out_dir results \ +--title "Comparison of vLLM Models" +``` + +The analysis process is similar to example 1 but now there will be multiple +stack bar charts that can be compared. The categories for the different +kernels will remain the same, so that it's easy to compare the GPU cycles for +the same categories. + +Once a category is shown to have more cycles for one configuration than +another, the next step would be to use the csv file to see what kernels are +mapped into that category, and which kernels are taking the largest amount of +time which would cause a difference for the overall category. 
+ +## Example 3: add new classification for a new model + +Suppose there's a new model ABC that is available for engine DEF, and say there +are 4 kernels to be classified into "gemm" and "attn", where the gemm kernels +have names with "*H*" or "*I*" in them, and attn kernels have names with "*J*" +or "*K*" in them, add a new entry like so: + +```python +engine_model = { + 'DEF': { + 'ABC': { + 'layer_anno': { + 'Stage': { + '.*': 'layer', + }, + 'Substage': { + 'H|I': 'gemm', + 'J|K': 'attn', + 'CUDA mem': 'non-gpu-H_D_memops', + '.*': 'misc' + } + } + }, + } + 'vllm': {...} +``` + +Basically Substage is a dictionary with a list of key/value pairs, where the +keys are regex's of the kernel names to be classified, and values are the +classification bins which one wishes to compare across engines/models. + +The last 2 entries are common for all engine/models, consisting of CUDA memory +operations and a 'misc' for anything that's leftover and can't be classified. + +When invoking gputrc2graph.py, specify a trace file with this new model/engine +like the following: + +```bash +--infile new.nsys-rep,DEF,ABC, +``` diff --git a/tools/profiler/nsys_profile_tools/gputrc2graph.py b/tools/profiler/nsys_profile_tools/gputrc2graph.py new file mode 100755 index 000000000000..8921e1f20f3d --- /dev/null +++ b/tools/profiler/nsys_profile_tools/gputrc2graph.py @@ -0,0 +1,426 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" + This generates gpu kernel analysis output from nsys rep. Will call nsys + stats -r cuda_gpu_kern_trace, get non-overlapped gpu cycles, then generate + csv and html output for analysis +""" +import argparse +import logging +import os + +import regex as re + +logger = logging.getLogger(__name__) + + +# helper data class for annotating kernels +class EngineModelData: + # engine + model mappings + engine_model = { + 'vllm': { + 'llama': { + 'layer_anno': { + 'Stage': { + '.*': 'layer', + }, + 'Substage': { + 'gemm': 'gemm', + 'fused_moe_kernel|GroupProblemShape|group_gemm_starts': + 'moe_gemm', #llama4 + 'moe|sigmoid': 'moe', #llama4 + 'CatArrayBatched|prepare_inputs': 'prepare_next', + 'flash': 'attn', + 'ncclDevKernel|cross_device_reduce': + 'nccl_and_custom_ar', + '_norm_': 'norm', + 'act_and_mul_': 'silu', + 'rotary_embedding_kernel': 'rope', + 'SoftMax': 'softmax', + 'elementwise': 'elementwise', + 'fp8_quant': 'quantize', + 'reduce_kernel': 'reduce', + 'triton': 'triton_kernel', + 'CUDA mem': 'non-gpu-H_D_memops', + '.*': 'misc' + } + } + }, + 'ds': { + 'layer_anno': { + 'Stage': { + '.*': 'layer', + }, + 'Substage': { + 'block_fp8|gemm_fp8_blockwise': + 'block_fp8_gemm', + 'fused_moe_kernel|_group_gemm|GroupProblemShape|GemmUniversal': + 'moe_gemm', + 'gemm|matmul|nvjet': + 'gemm', + 'moe|sigmoid|expert': + 'moe', + '_fwd_|FlashAttn|_mla_|_attn_': + 'attn', + 'CatArrayBatched': + 'prepare_next', + 'ncclDevKernel|cross_device_reduce': + 'nccl_and_custom_ar', + 'Norm|_norm_': + 'norm', + 'sbtopk': + 'topk', + 'act_and_mul_': + 'activation', + 'compute_position_kernel': + 'rope', + 'elementwise': + 'elementwise', + 'fp8_quant|quant_fp8|cvt_fp16_to_fp4': + 'quantize', + 'reduce': + 'reduce', + 'SoftMax': + 'softmax', + 'triton': + 'triton_kernel', + 'CUDA mem': + 'non-gpu-H_D_memops', + '.*': + 'misc' + } + } + }, + 'gpt-oss': { + 'layer_anno': { + 'Stage': { + '.*': 'layer', + }, + 'Substage': { + 'block_fp8|gemm_fp8_blockwise': + 'block_fp8_gemm', + 'fused_moe_kernel|_group_gemm|GroupProblemShape|GemmUniversal|bmm_' + 
# this section is triton_moe_gemm + '|matmul_ogs_|_topk_forward|_combined_routing' + '|_sum_bitmatrix_rows|_compute_writeback_idx': + 'moe_gemm', + 'gemm|matmul|nvjet': + 'gemm', + 'moe|sigmoid|expert|splitKreduce': + 'moe', + '_fwd_|FlashAttn|_mla_|_attn_|_flash_|flash::prepare_varlen|fmha': + 'attn', + 'CatArrayBatched': + 'prepare_next', + 'ncclDevKernel|cross_device_reduce': + 'nccl_and_custom_ar', + 'Norm|_norm_': + 'norm', + 'sbtopk': + 'topk', + 'act_and_mul_': + 'activation', + 'compute_position_kernel': + 'rope', + 'elementwise': + 'elementwise', + 'fp8_quant|quant_fp8|cvt_fp16_to_fp4|quantize': + 'quantize', + 'reduce': + 'reduce', + 'SoftMax': + 'softmax', + 'triton': + 'triton_kernel', + 'CUDA mem': + 'non-gpu-H_D_memops', + '.*': + 'misc' + } + } + } + }, + } + + +class GPUTrace2Graph: + """ + Parses output of nsys report, generates csv and bar chart output + """ + + def __init__(self, nsys_cmd): + self.nsys_cmd = nsys_cmd + import pandas as pd # avoid importing till needed + self.pd = pd + self.pd.options.mode.copy_on_write = True + + # helper functions for generating trace->summary csvs + def gen_nonoverlapped_sum_from_gputrace(self, in_file, out_file): + logger.info('loading %s', in_file) + df = self.pd.read_csv( + in_file, + usecols=['Start (ns)', 'Duration (ns)', 'Device', 'Strm', 'Name']) + df['End (ns)'] = df['Start (ns)'] + df['Duration (ns)'] + df = self.sum_non_overlapping_intervals(df) + # get ready to print table with elapsed times per kernel + df['Instances'] = 1 + df_sum = df.groupby('Name', as_index=False).agg({ + 'Elapsed Time (ns)': 'sum', + 'Duration (ns)': 'sum', + 'Instances': 'size' + }) + + # generate csv + df_sum['Total Time (sec)'] = df_sum['Duration (ns)'] / 1e9 + df_sum['Elapsed Time (sec)'] = df_sum['Elapsed Time (ns)'] / 1e9 + df_sum = df_sum.sort_values(by='Elapsed Time (sec)', ascending=False) + df_sum[['Elapsed Time (sec)', 'Total Time (sec)', 'Instances', + 'Name']].to_csv(out_file, index=False) + + def sum_non_overlapping_intervals(self, df): + """ + returns new sorted df with Elapsed Time (ns) column using + vectorized operations + """ + logger.info("sorting %s trace records by start time", str(df.shape)) + + # Sort by start time and reset index + df = df.sort_values(by='Start (ns)').reset_index(drop=True) + + # Initialize elapsed time as duration + df['Elapsed Time (ns)'] = df['Duration (ns)'] + + # Get numpy arrays for faster operations + starts = df['Start (ns)'].values + ends = df['End (ns)'].values + + # Keep track of current interval end + current_end = ends[0] + display_units = int(len(df) / 100) + # Update current_end for overlapping intervals + for i in range(1, len(df)): + if i % display_units == 0: + print(f'processing trace: {int(i/len(df) * 100)} %', end="\r") + if starts[i] <= current_end: + if ends[i] > current_end: + # Partial overlap + df.iloc[i, df.columns.get_loc('Elapsed Time (ns)' + )] = ends[i] - current_end + current_end = ends[i] + else: + # Complete overlap + df.iloc[i, df.columns.get_loc('Elapsed Time (ns)')] = 0 + else: + # No overlap + current_end = ends[i] + + return df + + # functions for generating html files + def make_html(self, df, output_dir, title): + """ make html graph from df """ + import plotly.express as px + if df.empty: + return + output_name = output_dir + '/result' + if not title: + title = 'Model_Engine' + x = 'Model_Engine' + y = 'Elapsed Time (sec)' + color = 'Substage' + """ generate kernel mapping table """ + # Sort Model_Engine categories by last field after underscore + df['Model_Engine'] = 
self.pd.Categorical( + df['Model_Engine'], + sorted(df['Model_Engine'].unique(), + key=lambda x: x.split('_')[-1])) + df[['Model_Engine', color, 'Instances', 'Name', + y]].sort_values(by=color).to_csv(f'{output_name}.csv', index=False) + graph = px.histogram(df.round(2), + x=x, + y=y, + title=(f'{y} for {title}'), + color=color, + text_auto=True) + # wrap x axis labels + graph.update_xaxes(automargin=True) + graph.write_html(f'{output_name}.html') + """ + Generate data table with columns per Model_Engine into result.html + """ + pivot_df = df.pivot_table(values='Elapsed Time (sec)', + index='Substage', + columns='Model_Engine', + aggfunc='sum', + observed=False).round(2) + # Add sum row at bottom + pivot_df.loc['total_elapsed_sec'] = pivot_df.sum() + pivot_df.fillna('').to_html('temp.html') + print('got') + with (open(f'{output_name}.html', 'a', encoding='utf-8') as + outfile, open('temp.html', encoding='utf-8') as infile): + outfile.write(infile.read()) + os.remove('temp.html') + + print(f'Finished generating: \n' + f' {output_name}.html for stack bar chart \n' + f' {output_name}.csv for Kernel-Substage mapping') + + def anno_gpu_kernname(self, df, mapping): + """ add "stage" and "substage" columns """ + + def anno_gpu_kernname_helper(name, stage): + for kern_name, val in mapping['layer_anno'][stage].items(): + if re.search(kern_name, name): + return val + + for stage in ['Stage', 'Substage']: + df[stage] = df['Name'].apply(anno_gpu_kernname_helper, stage=stage) + + def make_nongpu_row(self, df, nongpu_sec): + """ this will append non-gpu time entry at end of df """ + nongpu_row = self.pd.DataFrame([df.iloc[-1]]) + nongpu_row['Substage'] = nongpu_row['Name'] = 'CPU(non-GPU)' + nongpu_row['Instances'] = 1 + nongpu_row['Elapsed Time (sec)'] = nongpu_sec + return (nongpu_row) + + def is_valid_file(self, base_file): + """ asserts if base_file is non-existent or is empty """ + assert os.path.isfile(base_file) and os.path.getsize(base_file) > 0, \ + f"{base_file} doesn't exist or is empty" + + def should_gen_file(self, new_file, base_file): + """ figure out if new file should be generated from base_file """ + self.is_valid_file(base_file) + if (os.path.exists(new_file) + and (os.path.getmtime(new_file) > os.path.getmtime(base_file)) + and (os.path.getsize(base_file) > 0)): + logger.info('reusing %s', new_file) + return False + else: + logger.info('generating %s', new_file) + return True + + def gen_sum_file(self, file): + """ + generates sum file from nsys trace with times per kernel and + returns the name of the sum file + """ + import subprocess + file_dir = os.path.dirname(file) + file_name = os.path.basename(file) + + if not file_dir: + file_dir = '.' 
+ # Walk through trace and get the total non-overlapped time + nsys_stats_file = f'{file_dir}/{file_name}_cuda_gpu_trace.csv' + sum_file = f'{file_dir}/{file_name}_cuda_gpu_kernel_tracesum.csv' + if self.should_gen_file(nsys_stats_file, file): + cmd = [ + self.nsys_cmd, 'stats', '-r', 'cuda_gpu_trace', file, '-o', + f'{file_dir}/{file_name}' + ] + cmd_str = ' '.join(cmd) + logger.info('+ %s', cmd_str) + try: + subprocess.run(cmd) + except Exception: + logger.error( + "%s failed, specify --nsys_cmd for correct nsys path", + cmd_str) + exit(1) + logger.info('generating non-overalapped sum %s', sum_file) + self.gen_nonoverlapped_sum_from_gputrace(nsys_stats_file, sum_file) + self.is_valid_file(sum_file) + logger.info('Finished generating %s', sum_file) + return sum_file + + def gen_graph(self, in_file, out_dir, title): + """ generates graph and csv file from in_file into out_dir """ + # Initialize an empty DataFrame to store combined data + combined_df = self.pd.DataFrame() + for idx, (file, engine, model, total_sec) in enumerate(in_file): + file_dir = os.path.dirname(file) + file_name = os.path.basename(file) + if not file_dir: + file_dir = '.' + sum_file = self.gen_sum_file(file) + # read kernel summary file + df = self.pd.read_csv(sum_file) + # annotate kernel to their categories + assert EngineModelData.engine_model.get(engine) + assert EngineModelData.engine_model[engine].get(model) + # remove nsys-rep from file_name for shorter x-label + file_name = file_name.replace('.nsys-rep', '') + df['Model_Engine'] = f'{model}_{engine}_{file_name}_{idx}' + self.anno_gpu_kernname(df, + EngineModelData.engine_model[engine][model]) + # patch in non-gpu time + gpu_sec = round(df['Elapsed Time (sec)'].sum(), 1) + total_sec = round(float(total_sec), 1) + if total_sec < gpu_sec: + logger.warning( + "Elapsed sec %.2f < GPU sec %.2f resetting Elapsed sec ", + total_sec, + gpu_sec, + ) + total_sec = gpu_sec + nongpu_row = self.make_nongpu_row(df, total_sec - gpu_sec) + df = self.pd.concat([df, nongpu_row], ignore_index=True) + combined_df = self.pd.concat([combined_df, df], ignore_index=True) + if out_dir is None: + out_dir = '.' + else: + os.makedirs(out_dir, exist_ok=True) + # generate html file + self.make_html(combined_df, out_dir, title) + + +def parse_tuple(s): + return tuple(s.split(',')) + + +def main(): + logging.basicConfig(format=('%(asctime)s - %(levelname)s - %(message)s'), + level=logging.INFO) + parser = argparse.ArgumentParser( + description=( + 'Process nsys rep and generate kernel non-overlapped cycles. \n' + 'Example:\n' + "gputrc2graph.py --in_file d1.nsys-rep,vllm,llama,100 \n" + "d2.nsys-rep,vllm,gpt-oss,102 " + "--out_dir results/ --title \"Model=gpt-oss vLLM chart\""), + formatter_class=argparse.RawDescriptionHelpFormatter) + + # Build help string showing available engine/model combinations + engine_model_help = [] + for engine, models in EngineModelData.engine_model.items(): + model_list = list(models.keys()) + engine_model_help.append(f"{engine}:[{','.join(model_list)}]") + engine_model_str = ' '.join(engine_model_help) + parser.add_argument( + '--in_file', + type=parse_tuple, + nargs='+', + help=( + 'list of (nsys-rep, engine, model, elapsed_nonprofiled_sec) ' + 'separated by space. Elapsed_nonprofiled_sec is runtime without ' + 'profiling used to calculate non-gpu time. Specify 0 to use ' + 'elapsed time from nsys-rep but that might inflate non-gpu time. 
' + f'Available engine:[model] are: {engine_model_str} ' + f'Example: --infile d1.nsys-rep,vllm,llama,100 ' + 'd2.nsys-rep,vllm,gpt-oss,102'), + required=True) + parser.add_argument('--out_dir', help=('output dir for result.csv/html')) + parser.add_argument('--title', help=('title for html chart')) + parser.add_argument('--nsys_cmd', + help=('nsys cmd, e.g. /usr/bin/nsys, Default: nsys'), + default="nsys") + args = parser.parse_args() + gputrace = GPUTrace2Graph(args.nsys_cmd) + gputrace.gen_graph(args.in_file, args.out_dir, args.title) + + +if __name__ == '__main__': + main() diff --git a/tools/profiler/nsys_profile_tools/images/csv1.png b/tools/profiler/nsys_profile_tools/images/csv1.png new file mode 100644 index 0000000000000000000000000000000000000000..bdeb47c3c2a3575c200ae8dee23bd14ca6dc491b GIT binary patch literal 148416 zcmeFYbyVET5 z{guZc)HhUKR_qPJI|MK=us0IoB8p&O&=_D~5J_+_pd7TOWiv1^ zgc>tpVR;E*VPbiE8)Gv|02r8fXhIUKywX0_K!=A7H$45fhM#FySb~y!;CU{HU&sZi znV}#;2sE_2>at1why5$Us2i~ae?f#8YGGnlF@_XT9~ufN)15*H+ur{0uvr;O=jyoM zUAoLTO#=dC!Ls+h2X;HEWBDQp)l-ko1`uN4i48=7@e2AP*@AIf$|gqRqNBsfr+xKl zxb*@{4Qm)JFn}yq9bYHUbs9vY)hZ;NXMxsb}KW{82odzz|o}?!zv8vIARDO zS4^y@L4B|hy@7iOc~;U1?L>`zH^sPdF5~V}4M{AkGyoVp%5ny=LX7(KQwXjKyuS0@ zwW+eOOY)DAk<5kYUBz`4GyyJ(v@sDIf~YZc+tJ;La{BUPu;k_**2=*}o+%d$Meic>Uxk7XA=YL06Kc$t-4E59nDsj9TN}VIe@?DcM^GiO2y^PL zu40}uvSg@-(+cAAJIqpOLyt--2DVk0)2EH2{-j~0TxXHe8v-ZxmywQ3I)fzu) zpma^MfcrQXYx}@>Gh2f`vl6-xws4zNO{0=rriKNXtpt z_^lwcakqIqhI!2k@@*eRHxcR%WT|WF6G?uQxui>r{auB@~ zyo9)*Idvj&^l>owV|-two%caF}&_y=uQ2sW9GFOWVrdcW}#qy zr>?x1e%!h8n1H8b8KULd`PfQx$6GT6ACik~8!it%&v5Ai$k ze{OefAMw(9s`3W+)_Qu-ct?j^n#i_uy4Eq-LFS8(N+^$Z#`%+69&@f5On0qAf6*US z=t0OoyFedx6k!GHk*Sx7oOP=~UfWdM6tFkNR9f}cAY*;{?Cz{*{gjNClwE8o>O&Z& zSiRVg2z^+|*VK@kFov&$A>TtZa2@DX*%FBs@q?qn!+20R+JAbiM+WwUA7Qgbgbp>=q?OHLPoM#E2tYy zxu;&z^uflg;is}XIsTyI@KOvHB?2nJpno-%37g5(PlNsDIq}B;SEd2BLY7@&Yvq$r z-X>n5m1_Xy2s@oe;%VH=i2bPj*u~A;1=?Q~INC$n8f^%g35_LQbtebiGd&ibv-p}) znrZbnw-6)#lm0G;IkR<`=9m>U_zzvT!Of)mqzVcz?z`720*2mOFLQNW#yu5p^un9^ z)THR7h9&uJwk%s8;%4Nw1-usHYl|&vo_5-_Y_*`Y9_pK|_>|wPrRs6lT6NX;A735! zEv_DitlH&|)|Z)6nH9S33cBgLT{5)kXgPaTh!^jbUlm!-mA8BJoP_c%@$q>q-~LiF zQ%+F!dPsS>%rIX)=zkZGTJ6>7#RbefiCS2_N8C(WfGmNPME2yO=H#@cK9UzJ7q&V! 
zxb8_iT;sM|9*hwcvnIUeUeLBt)mzSg7RKsR*_WJWKb|F?@za7; zFE7U3m8~Z>lq!@kqG5hIkG$ui9h6OhcQ8k=W617rbbauj@19Xj_O|27sIr0)vlKHE z`KH`eflCkfDQHz^5MmR zQ_I!5sNe_z_pEbo`<%Og;b37RTp}I;Q6G2q6d>+P(_{E~Qeq{z9+BtB-Hps3*7rg& za408E8hBq0Sn%v#2*EaZFthd12tKQyqc8qu5HF!7J7;D!uxo?wz`B>f;XZwS`+h%{ zVHrUXu0*sEa<2c=M$P5f)e%3SS`6%L@9eA|9^d^V#7k{T({oGv3$9<@!Rkf_n3xHC z2N;on7)Xm%1*l6H%gBJe2R*}qfd`v`L4lsYL7#V^4;UC^Y!DbM=ocOI5y^)5M=3N$ zHsn8^A(CDz3MmOoNPvEoKHCET)()mNj&(<0ctBmvnklP0s>?`of3~q=Ffg()1TeT- z*}gUb<8|c*Jz4=A4TxQ>EUg{5UHM4=sKE_-e$8eiCH|v|qXi$Sx{N%ru#G){n4N)% zfr*qKftZ+>*WSpOTT$fGU(G@P@sXN3I@)qGGP=09Fu1TX*w~veGIMcpF*30*varyD zYS25lSvwlI(px)_{n^Ps`Vj#*e6}~Ubu_cFCVuVLz|h9Yk&l%0b)bL#{W(v7tJ%Ls zvUd1uS)c_nzLqdDGcYm!vu{vS-q&1ic{5jlrG|)^6^J~bG5FcpS$Y4c|6e8l8u4!} z)&JF!nT3n%?@j+!^q);t902yhHddf99r^#YHGeh!d*NRVc^O}q{x?$mN#{RuL4@W< z;AQ;hrtu>HhcqI=zy!f0M1+)G!H+Va(^dMg2T&9t!l+66iOBPLpe6R>6XQ`up!QEN z^T?qvDe|=P-atV?Lnc^9%Qt(3zTE#f@oK%DQyV$CSZQCKA79jR*d6mU*;#4Y&E&$M zB`4o%mlyYi`fcij^rPa%-!(ud2LEkhg4)36_DrV!Z6~1qJIVai@ptGT0?kvWi}|_@ z!G7BtoG}Fc4*qEVcxxOOeDwcJz9>*d(pTVE0kQwH7?df_O7OQ2efjpFX7^z>q14|56r$T=0JF-EFz6J>z$IG`b`?obvM?o8iv@>b4BJn%k>x2c3^8gku z^xGQ5$?)41J($GW;zRA^5M&PFmK2f%sP@6l6kM zyL_K4ndWw#ASi7~1e%y8BB8@wk67g3uAV1N5~JSDdkTl$@uVzv>#O&cj?nMeJ->K% zI9oaDFnrS6jVjjy+-A1z1YZkAr*YVryWb8b(5iXLo&;sF54S=idh$n6K~eb$S`u|> zD2~?gK$H!f?49BKv5~$Vs38*Lv_3w&XDU={+G6!qs|qrug>r=MPT zFNihmfms5_bPDABl2{S`poHKkIjZS6Ezm{BV%dGyYuY8Uh{3>x+fP)g-Ptv&;lh|6 z+vlmya`n5CYK1n%?pRjTLEgfKfZIj<2cM3F-YBBgdd3n}UAKDxF28*|oksJA5{7k$ zR=;Wf)*ssVHjpgoCJ^Tv{cL{5qnTs@z*`4SgV~eii`msh{G0p3PmxFSfZnPo{Me)q zW{SbbA=bU21X}DBxhjSt_kk`e!R0F`HVv!Chm++kM~gm!iS$pJEC5%TXD)YTZl`^7 z%m|B)Yu)uHYTU(EK=>6Nr?~k{5rbXN!eMUjfWuRkV0LZPAmx3GrtfUAhVW0jZ&3Vx zf+Yh~av8BFZNR}^yU&nBybga#p*0LYL3drzZJtWGpQK#2#jzAyEh@CqWj@c>4UYRO z!a*nmq+AYLOh8@tb5XaQ6q7vf5(yIXLwb<>UB(0rjMck?f7 z$|b55baK4wMC^{Q1fhX>$v%*ggrpu-V>LzyvR@dH#x|~BElMCPQ>fk^BfG}qh;cgP zn9HKP3)3sq6&{MA%Bnq_lc>;b+p&Foegd+%kE+w?wC7-Y<`y{~PbgJrJ0NY^`eWn4 zV~|Hj;IeQ&#`3+ia@EMafL}ZZTLHRq`V^0TOe$6=0#cYA3WUo9Dzt2*92SuY0NV#M ziml4H_op3kl-k#tb%5@?@lo#f^tHKDAIpUIGuM3$wrKb29i-aLSDhCZ-GPWZl*t`G z@f7O~DR4RLVtcOry*Agr3HxVtjM(@*l0aYX##w5xNTqYykz1vHV6l62-Jhx{Tx$1P zB<~Q}XJIi}@zhd{B;c_=fR)}j20R62ZT+a;hIY#FIBiEhw%RO^DPRB$XHHFz10H7H zuozxZv3Pfj)aZUUXT#-m4bd2=gDtMJSvRDR<|(_C{oLh5FQnDa1XjzKXLH=_G`Y3B zb1_jBRZa|1nzlo)mEs(B)4H>GYB|t&;R+8W39KrFI&@mXUPsO`yWEMT)dSCU6|gR;YGh)Gi0%dA~;Y3!s6?_IG=x1zb%MzeR_D~z*~~la(lAO zdMXBxOk$a>dG=xkl*H31jYX&n6HR)5DWoFkyuZZGbC|WhaE~!mEHXV04zfz3QPCVr z=OXKPdTw!Q8fZt#yIFrj_a39|+m>+hfsIN+-Z-C0(1m8k+E3xGuV zX0N&W404&9)~kaRnGIgF#pGw}r%Of$d;*iDS_bV_Vk>@AitmPfEIZ9NF%gQQ8-*pw zrK)vd@Q@TzX()^E@ev9n@^IG5L=oXLMvWSAE?X(R61D3$tlsdpe676gw?Y<8&QGb1 zrn9OWec@D3Fi0%J6)i5pRB&AOSIOyoF2Bf8+KDELV32W1ZB&X$MG>%LBx4DD%j%ZiGr=IqwiNMJ=T$`>1?+bb;as<&$SF&|Lg(Xo}P#f5E;jHGipuGydvKKP_qCXPmE@?5`4$6__R@zYaXa**%#VjFr472DE0inLg%0+ydG8({b)J+mxjg#62<_(B)fgWP zT!mfj&nW52qNL@msWK;eN|SR?z7}Q{2`_(yaRZMk?yvPTwV+Rch^IiwO#s z&vUwhb9WoK(b-RfYEAlR;LQom6YwTR&j%>ee0>CnJkRog@KcRKd`m~o*W$z$CMXr- z^^*BHt9Qx^`-G8F=K14BZ{+6(c31Z7=dKq8i@N^tM2hf;X^#e{tZlD3FM z4B>avdJ!lw@~d2BqD~GdNRV&P15@pIk1*9Bgs<9B?lHb{p`0T!S>Qy=b-(Jf)Arok z@sc{ycnIs}9^{sbaYn%i=W%~QF*V!j@si2K20AYC=o*i(G_bHln1 zY}x=HE|m&>D4HC$lPa(qRq@XI8x}eoYKgdfzlTngYMon%`&6 zC$aRsNR#zqqpN$1iq!8(=`KX15XffmI0S8tqzxRkVAzWlct5A`-fzYATjHq3t$N=E zu+K$M{;1*>PR;VV(ZC?YN=UT-Nut>Drqy+8JZb!BJ|82Dn7@_SMkW78fu46_emK@U zN;q3#dPV`YK;;pvAIFAkB^?;PZeCMZ^Z;K?sEu3Ahbzt9QpsGH>+Su*tQ+q8eW)&o-*9%F?91Ipg2+8PGhagOt`W~_jXE=}2R&x1_|{Wz7^iZ)83$|{|fafO9W zO#!l7MWKOW2|6l-C@&q_IU8MMz1Fl&Ype7ATo*oL`EjG^5Uk$Jjv3FJCIV8Wjyr91 
zV+g}5j1h-zf&5Eko3|dj8q4I5L-L_G^aRVG?zUbJBzZ#-GhV_FiNL?e3oq}_n+FAFkNS(XUm|Mqo>RHpF zcLi0Tp&Fwmj>+tYaL*-P-)$S=g^SZ}ewGI#U$Fr;;8YUQw;(^AN>q0WO+rrvUUv)x z{t_2m-fKzW(+?!>Pp6NW&fD41>TL&yeoS0n`K_;Y+?6~LhrXlL(`noMu&Z&?Rnk$1 zY>D=TTd*KqWb(RB{MP%cCF&+QDvlN$)ey(&hm~lQ;Ww$w1MjNV&xbed4)ywr&sP&h zW!#Hko+$|oM5+`>p^>EN{8OE^zZ;FHHzT=GcYBfg{z5!YUTtgP&Hk~ACdF|}w#0jx zZNf?viaV5UlfR4NvvIQ6-f4AyWn=qzfhzMKE8F8FDX7$)Vi@z}nf`NAy&?=JE$z^C zUq018*VF&=XMbKgQ4`}dYO|off55+g_{57m0)g-@Gh{~m$Ey0Zv}e)b4lxN>|GBAt zjRyFlHLw4}-dhFL)va&73GNU`a19y=?hYYXa0u=m+}%9{w*Y~KySux)yDfa-?s{hS z>HY8B-Su^ys&ms9UB$(^SheQhoi@fhe$Vrid(!?#dH!=KZ!v#50ORfBRuuc^r25b} z;6S05trvw7zW*>9(lKDXxDW|Evj5+O$@(ah%wn7x@V~3h0s;lE?U~+p0}mPj^nX6$ z2Pkx-F+!WDKSy0b$dYSGcf;lFL)vCjJp_(xKmXxB@rC&q^CpD*P?Oi|!I7ekH!t}g zzqpwI)aCqF3McvhSW+Nc{=fbC4Nw9n03dAhQvdI-{JSVXb^Z^l3x-`^4?rWTw%zag zK4oiu>ms*3TPJfnJa$P9q}QrZyYYFhz2vPtP33oyZF_wQ>3K;~J-*M%$}$?d+JBwr z+e!&3U+NCVwfiC^_jAysqF7>mUe_l9?EN@;iPrD)r10|cXfw{lCkge64uzPR8dVHg zdjK-+O^%x?vK|3bsqO1eN;?zv;9w_hEtr?X@^+(}RPKxEP%OK5L|X_#>&@ii+oRkL z(5VpuEkHo(z5O`xIFst(_Uy!HfEV&fOJdTHZ}D~HCW}su%-O1lONQ3B^)A6z1V^jw zVfpR5^DSYg3i-{(Kl19yV4z+Pub&7-RS)=&Ixoedbf?S4Q^IO7!tO9JphHK zgU@cou+M55lc+;X9JZMQepD>-gdI%gEf2?iHZQph-RK8WU5o&*<=rFAvdc-d z!1Dv|{;A-P3eP6DuuI;|((sG-&Tymvqkx%Vdal=HAt@pgiEs<%fmVx8EIt&f8vo6KO--wseb(?E_S} z;_BjQ=bc)GZOlpv*%qY_fxy#5sO?ssjAFq8RxH0N@!%iU#P^KaDigld1_VE7+Apwa zDDX%;)ebnVKgxTOUGBDfNj}}A#eA1thU{=`y!#n%FHkl>uiYfg+hGz&xVlomD+O#h z8+Z3bEL8{+T{zi&I9q81=W0gjKpCoPNqEkW+r{kO{WL5;;MMlHx?+mE>#^jgXHLkHd^0(gryIp@{%d`fPl(*Uf{k zSrrh{JF3V0C4heaDhuy)l9px( ze2xM}!@h__yy`WH?>1&r@n#n{{tK`oS?(P$PfTZ5g*wG`%)usu7eZ>Ejsbs7%gHXE z)uIno7LOc*7ah=w<-+9LZWrIx&M9^~Eapn%+Mwaw1vZIXR>*XHu5%st7dit2?CK2@ z%c>0Lank0`DG0!3@*%%gk8P!h!*Rjacx-yjWG-6OB3wG_m13XtwcV9xyeV;> zFL{D*G4!}kpE2jkmPvuQ-B`D3aI~k}r7OJbx~aM#^%+jQZycofka9hT#Vj=GO$)nt zI59m|e^lJFhqKf$Bs~tu9UlB$s>$efz-l`FZEJUY2vm3)Po&8?EPgmYlqmxKcIbQz zYfRQtcQ5c_>-Aat@ZL0q_q>y?DI`Rr?8$L}iZZXWeU$56th zcGcgXbtq2TUfWuVI6lyIUQNoM7;bHHoMTDI_x02sB+OT8Yr1)?rAiGLN+1Z7yj?mQ zyxz;cFqmhEWfe!mf_3FPb`+;ab=#ftMRRfUP4n0CH^sp$~c zvq}`n>*Se_!wKJI%|P?C$w+*@#S9z2t*Z^Y`Fuh`u?oR>=*OkTH~TLJ=yBpB=+7Z8 zQ5I(}f6RAIZi=ez4R#5tM_V!SRKeb|qQEl33E75eTW(t!;R4s4{2gOU+sZvL_;Njy zQ8ML*Xw*&XFactTAN@L^8G4F)k10(f8PdX+!-!#jRx7fFN;wsP zo5ju)P0USrAq+7LOlE_n4gka%X=i9?loR|JkG+g~I5F}0v7KY+q_lnolkVh^)o5a@ z7tD-l(?UEPjz$$u!(6i@hwPTy z0^2SmJdBQ_d{|TLLgK3necTLc_1*z9C70pcU^Q`7B3|#$Dtr@Gwl>QGz9HsZ{vKp< zLO7kzk40@}*ILDfLi4q5mpkvscQlD6->%}Ai9JHOb<($|30R?!-8>o08m{^`bsnfQ zC6Nl$!#ra9@9-BNlF2@vv|QCGTg>$sgj;lUr&soZWSh~SEeE8!rQzAr@W7Z$U$CfT zSx#Y3O?nC@IT{w-na_r>Za{Yy7ClYEz9csf@?f)yIyv8E7(NsgTp})sYxN5MGBnvx znP{*En+>h;w?vk{M(oi)$+Ftp^ty{J#$n9FXap>3C#?|TEjAWcAs_W>>(&nPcNYN9 z38*yJ$dg%(f8|;Tn^s?BA3Ul9M%Z(1dtQUFl&r#lltg~6Op8n+Oi(yYE5kW(UblJX z#jna9u=1Pw7LfobLz;6W<1~PLN5iG;IxDZc=2K~c+sRb!h_|dhZ7s2!F4nUo+39a@ zQSzdK?MZyhAeB9mq{4VuAqV1%g(4n}FHMIAlYOo)t&cZ@hWD3)PRx*4YB>fVG?s~E z;FMv0T@9!2GU^7@H1P4cgSSAq#v)&q#e4Is(m?79z5AGd^-}R$!wotO^;)xgf5eb8 zP?Hnur{kGS5Z~2$z(vni)Nsd(7v;0pt((AGcg75a*>Y9YhpPiJ-dT_07;FJsAF&;c zd|=nrt#l-bNAzH}%&Mhp&gO37G8KY)ZFFO~2-DZCTs+SuJYUF+(aL&%x#JRfzg#)q zZgT7nxNntjkCX+IvJcf9E~qcnnN#vNyTnnFd`5K}h>oy1_c)cDA0x{3Y-H(^A^3LE z{wj8hNk)Va(uL*E-DBHUPwNzc&lX*0r7;WylNq11++XPZ{>-9(gxjb@u(dIuAAufA zuc-p5TDESMQ8%9DjzQz?kG+tsEs{${I>u(VCAGn!$FwSWen_!_x#x;fG1?M~VgcXQ z7uBTmyQlymne@+(H_QuZg_`~68UFTgHlmKS;kYkV+~`4gLpVLVVWJ14E3KNUh1$h= z(&^m8Ev8tc;XdhBVEvZPv*+(Lx??EyW##;y7%Pux$`}XxHYmpdPX%!4h;cfJ;MY6t zU=)14i+I(vZgwuPbbEldt|u8F@s6!B9>S&C;NZZ(p|nTrApL1i5W&HQT`3CP`3U^}4tMe0@0zw{@_#JKrMJ6t*hQRas!h0EP4 zNO8GQ^ufHGpt9(a^WKlm)+rz0se&7Qz-ex<%Pvlo%xR;uUj*`F`UQi9+mFxbF&BQZ 
zcW>Y_J9%IMdC;8F9Ojm&XgKCY%`t=w#`W((ooskI)EHttgd}zx7U)l#87!e+uDNT5l-{rzwzc z*fa%*UJ0y`IlAo0TMN6rA+Ij?EI#IToLlpn*B&brw>RQ^)I;Y6oJzUTiBN*A9ly6! z+h%f`z7lcUe~;XYx8CywT`;;xo?oP2S%VfHUmP4}LvSC#8`nDinh8kntZ zh1JG{YvDKH3H4Yp*o zna}&;ADrYK9<$+4hz+LlTJ?A9Rq`sQidrdl8&p^-BSRkf%mXJ*PlmoCW5i*DEILCa zLM&1YHAqto0WV+H_F18gKn2)gLZy0Bhmsps)GQvC!;Mt*8i$)BZU@kq-wupIKr(lo z`{+^uAnHhdzCUf*N}c~g`7Po5xPz=-6v!JIPE5Jzp$02UvU*pcblHii-LCz!dNnVE zco4+-?E}N*vOqg_@NPZBcY~<{*kr7>!D{-fG_TBr%VmrHzI^K)S*GJkPCNgognd>M zv7Jxy6Hng*BDmWDR^NlC$TYo8K`x$l>!}azLSvDdu9pXUFT|bl4_7Tq)CU+XM_2t4 zL&^L8q;ss)TRrY|Ue$n~&t`Q4f*fFTaew`0pnsam1cb_ln$KruUuq_l6@2WK`dX+O z&TVoT^qgml0r5KB`nq0iczKrS(Z=GYuKfzaIKb1}RI~viCtG2q1%%ZB5C0tX?}hd| z+0oW?ofoNNAmcQJ)}bRxg@XwF&Ca(e#1IM>OiDlVTJsdojBeGeJlXVj$~*f9yY*vE zTYQWc6m58qAvQb3tsbDu1!0ihxM3LJCDeH0JJ3vX3z%T+2#k1PHhhGNQ5;)#?iz1_U=%S(=8%fDsSU%N#@6b;jk|0rLj-@!euZm?lBBOv}gP5j<*yHj%xlP|=dow?`{1Zc8<4;9vak+KAxr za?ckGWWTaBk`!2dw6$5cX0oNFoL@QkocvKUhJ|;63cwuOZ*uE<6`h`;6o2x z2NABwqo6AK%I*@c%?^+l%-MRnCtT~lCW69Qf05)aoT zP!G5J{7Jhi=P73`637KffQv{O>T=l~cAD2IxkxNkBRdOE=4Y*`;#K$p5$PrH3sf6> zJV4jk@jxfy_N#hn+#Mw}N&2=P+8@UKkmT?Y&E=wyljxwj>I3aJN)T&Hj`#e|P&`NX zG%(Bem^DNjzw3Zb2EG5Fgx+8LNtY%Ya8mA_3i>5@9{8zWUarZB0_<4eH=2W!{k25z z>zfzS&ds`cn5oLJ5A+|6zv&Of&Wefa)S8r@zv+q=VQf?x9z{854B5AwVWP|T;kCNb z38(NsnseMrwrSXaqy95R`Hf2{`zQ>LU0>XYR6g=(5r&-oOAIhOP0@4M9f^xF2yRFZ zPhNsbQ|1H{u8LLd3$K4KRQyI|6`&^R)DMDc?W1>PoY!)Q;0~l;f<2{IHwd<|0aq%W z-A0WwdADQ(f++ZBYzFF6w|y%hL?*}`f*qzD@?rkUpuvTV!7VRFp|@3Wgy-S zi&~C@cob5L(jyT=g-XD)7NT5ZrE?dG>2DTO@?y$xa}STc_i5T>&$GEYRpF0-aBBL8 zp@jLn>S62aJy7P_WkPxxRdFICt5sau0U~0neZfJ(K)UYHi{it+Bc^NH8ox±ZkD zjhN1DrGA6V6AVM$XUwx|gR-~Zo08Cn=mnq70`A>(qc=KEEXw9yj+tQ*k;v%GqkdPL zjxD8|gh47UKd$={O%VpUN z?K<6W-biY>-z&F{YVDm+7h7`FFmc~BxgFIAUUQ|E(pTU}*I$UCVC>Ld36$*e8#b~g zv8k1`d_Zxh=L8Xa^%PMGNvr)|R{Qz*cO3UBHCK#fSC?vMe3OQ`K~H~-#@aln<)O4KY-B=Yq?Y`^yLg{fC)BS!j&laKfgl z%a5RgOb}XnN`c4`bAA@PC3zP-i*Y|Z4gAtbdyuCI^?jt1!wab& z7gaRI@V(GOK=bFio#s-HGT2HU+b``!mmMDrH5czlBCR;HY{G_!|Dy1z`P6_Aj;WT6o7NUW)L ztC!-luhgG4N0IGN4vP`6RsK}|jcW)gtrv5lczvEv-%MdFbCQ`4ynRzicqFsZC+;x< z#vVs?yW4;VnYm8o%H&^W<}!ow<~m~d8huu~byK;Vy?!mYK~GCOKaRJLX+;CRI#aSm zkEz3>_0fk4s907)=Uyha^|st@<5D(K2`ss2K?0B*Sl)w5yh9fnL8yJxw)w2cKo|pz z&vFZQE2mHPOZXdY(|T4UDP{6?&Kk@FH%&=Z+v#7k61(J)pl>QkA{WKkmb1luq6v9& z6U#fee0oeZkVkwF29q|tt4p)OEbft((3#eNfGthyM)hEHB1}4jV3h73PrW$J{1KtR z{MTZtIQ!vmLmb4#3XKaO&d7C?P(k5+a-7Uf;1O?yVQU~g3z_Bx7a@eoSY9PZ;F-uxqy}`9|j6G2a z3KWpZoRp#9>$dlsLAUUiw0ZTZ<~~f=mN}?>zi*FNW2D!A(3lKNbbn&LI@S}`XhNFg zgK|mnWxdn4SFEClb?2iWGSJj{de=wnen~3d)_GSwUYFt-VaNm|mxAwS9+?<+?qa%4 z)|!CkrA5@5qHvI|XnX##dAyx!jXdqDgJWLoS};OrMVyDn*J^3N0A|I`vat+89?w1HyRQ~{EvT=b`V({>(2TB?(9To&jDDX@JeOVZ>m z>zy0_$(eYdeOBQIskxp2mxXtrjUP~|eYd-di7^r8E1owX>#|yQoMiR7k>5niGfbpc z9@^WnVO0CED|LARQ)RYVbldOmt|_>|HHvY7D1G=$y z+slL>eJNYjErRT{nvlS~?ZNw{=0$Qj*GY%r|K#6me$x18&6@8?bAA`R*yL0!QT%&M z|MOU!5}4$IXNf{ig@#0Gp1aJ!Z7mBfNxm(pVa%hkrW+e%oFwwT3AYW|RZE*5v2T4V zO7MIHOO;OBZ?hHOiewue_pUSs*O-J4Hw)Fp^wlc~7g(^qN%(=pw?Y_;!Uy zo>Zx-eG1cXk2CCamVf*hho2S?SnXd4e2qo%qjMPiB`&?%}=`op?G&PrOB10)+@L=t{ z)t&rbs|nLw4)EbQ0kJLa$((X|Kdt?lj#=(`R!3JCThcna%L|kVdl^a1{LV1rEiR*E z&#BWGxb=ysSfjB@T^ddVrNrQvQOlFWgGs99+`2|qpwuC908^X&o%M#Q(nMImU>DHT zFa0oVZb*6pM2GLc>pHXl#^(3ANs&I@6s4EXiI$BLKjIzR7p5z?Q!K@N75Zd8tET;A z5fSKO+AL-jQGU41Z%1d@8;qz4%_nVAs@;}`*mVUtw;dBCD_GrOXKhZ_ACpg5KL~T} z)7+_fU#<$2(^(sI0uG_<2QvvcOJC9B5U3e~Bc!4BGCIGm@!aMW;>eq{=b=)UV5$k^ z)i+uKaxta}Aml~3O*{lPuhQ3Ev(B*wax|@6gy0U!mrl2)niJ#Ca=5`gJ@;oQ@7ZXg zrD??^XDz3=`Yx`>eLBpdcCXWHB9nh2EPfW&`Vd_ECh+w6j8}7R4^0AOG5q;YAU*zX zm5OyWG!PH8CTEX67k2R92I!DpLn|^eQ%@%VQfh0J~-WRr!|6o_e|#+ryMtu!Xn(H8x?D8--+w? 
z7gf+CihTHt^mAho#5T%Pp2JM+JoyY*!)pI_No!?1NCY;~k{)0*Ybc9ULT^gCKB5Bq zR5&ey+x?%49Cql{)Gl>vV%QeyG3MRSL8TMDOoZ>lH1uMsis^Mw{C7f&K^)+_aB6^t21En5PKIrUxOkOcxFUt;O0-Q?JnO==4cJ{uoYNmCAvuBxz#%_0pyqX+C<>tGgYx~7TO(SWpk<2stl}FzsD$l=x6U&1I)qeB+kL19 zk;<_X3bra`JuM6V1zob;T1brP0d@cN2>tI_vwlc3E1u#kUz0L{QL= z*Q~@_YD6pFUIkh~`ix18=2bwu_m=@d5e=A0*J_P#vipelK;F!?M5yz4*(W0u4!rUU z{WsU8TPNK+{vCPWPZSvQFdy?$ZnrFXjeIq1Xfq-S%p!COM#QD!b?j?3=|=@0$dV|xi~=x)EWMhoI08g+g9(LL$j!N`o0lLk|py>CQrS}%Gi?H-z=#EYJ_ZKm+o z{{XX45A_V!W4Bvu#_D$n28pY~% z$$GE$zQft&zxq?K1Y)|v(4IK+3y#*qrHW_c4eawOeI(p9PG*0Ux!sq#DgIuz{k|$O zHM|@8Y~^dzH&V$&j?x=NaR-2lz5lA(!9tw6@xJF}6N#GM)Cn!Ix`;Gy_DbOiOGqGq zpp^(}NAUYmwrhI9$ZGfgIVe?OPCn~Pg6*1d1((wfd!ytBTV5CnwLAH-Wzk$Hui?y| zPNo={^^wJD`^_H2LD$!3?_o<8m6fy^!ei+R?SuAMQ)NyvjJ4{n4QJBJ{uxU9QPaGP8UtqeTKm7iE^4w-uk zN$n*{s@Et>wbOf}I)oVIvmTj=hPXMZjmWLj?w>X^bUV+q_!677(8L*q*Pv5`%+6YM zY1%t4ut5%^vMC#Fa88Izh^|yVJSpuqonHJmUYBX7_wwF803TDLT`#+Iu+nj}s98XE zK9Lzv<-CK0NNp2jFX)2H2-kTL$V=SplWQCl(GTW`ual!G>D!ZEYM!kKzJJb$TZ#S{ zlq@ywCATsRGF?$(7t-x_v1#|ZbwacwF%v8bWBYPypO1G+?9=8(qO-|aXc##f);1pK z>BsUu>=8zz1kP^#u~Fkv#qfZYi=z=|(k>rs+*2_`0PHlcUAdOcD^2t8qvEk)Ys-6+ zBASUGi(f5!y~;tt4JPbXiN>>Hpu3u}hD}FQ-j~C(VbaxrzpU8t0ci|gF&0BL!=deb zoiiLy2QETrh`P)vz1enyvZZ?0}S3+TfTMoX*c_NB~EEZ6tIaSabHaV zZ1#E*oGZvKU)f4QwZVLBO}%~Sn<(5lS?H=Z6rg1U!Roh!hpzARqKFCAw*}v=V3wSm zd8O$+5T`*@=)GAKKd6nt+lN~ro# z=X@i%#nH?M>~EvdvU!V2uy+fPD8o-K*h%eQf{^wdS(is!w$J>70^{nTR>>#q{&n`- zxyoxw&T#VivA-=-zgOW9u!P*FMp8t@0&t<;pdoz4B}$2qzE-PqsXO@^X%tTL3?5ZlT-S2U8$=3Q z05?5A{v~B-0ObRgCjLiwQ;WlIJJE}hk>bB#%Kznqgd)rjX=%ms6m+S7M0OM9eEWKa zP5bF2{}El4e3!w@EZJ%;@cEwyBk&dKxINN(* zye*#2EwTRygT}rG4iqJ`gl7&H9gD?4bkdvh{(}M(p3k{I9D1`$LQrZ~}nDb!qPZ`z!x>3I3m0 z7etmFcy!{}`Ze!<$>lE5Fyq&GUEX%A$FW5R$oAu@kHk@w+iV&mE$ugelz8zOWo>i2 zE8@5O`Hs5gDlo|su2~wY{rM)v6M)Ttly||mhC*pvJ};gfmv@mEy8wENiW_XKEX2lV zmw-qSX#et!zpYl*?)hg@VMZ2F~-tbX1MBSHa-24H-qcqhAdh$)SA=Bbx5$uk z38B6H>%`Mim5inv%_I4RT1?i3Q9}OqFu%u_Hi1ttPv)lr_j-#gPKyCamqCN=q@WAk3?t)Xa> z!L0eG??;HARI1{)2NJ~L(7pCCcU*wxP;9xpJ{DDTuvv-I4!~L-^nQXbw9l)Zb0OH&)93%cfc@5BK?wN5s0W>cG5v@(9Iy-Zfmej0W%~6K!7t^z;Tp19j{k7=@NqLpE#QnEcDd-rEKh(LOR$UaU$5>vG7T zIfKw*J&Uwh>y={Uw*IzoJ|iZT&yBXk71LoIpg|?UZ=v}~&LeqGqRZPK6q_86^tw75 zmMf)H7b_2u#}8d&VLpO{9>+jCV4s(Szc}iK&5aZgSdini>}5#QwO`jus ztWjli%aAN!TqT`!)zp&bqq3&nAn$&C%n1E(!wjD6Cn-zB%9s5H^d>AZ5c5miD^+b8 zkLNT&7RQnF^BAwx1e{zxH(|zRqP>3 zM2?ZzUgRB6{cfM;!_CPub!~bfWA3-#Ovc-3<1P}N{lsxR?kGBQ@BhA65E458TEnPk zZ<59CiEFXlw7$mkg*)5DYO(1$wt&NW$!N?;arG#r{AS>0G+lz$s(Qv^Vc?t%T~x06 zT{bz)!-C=c#iEGGa=xmZcz)G`a>7so1%SHSkFw7vp>1#V`@|1wL@it{$a)_eK*pl= z8Hbxt4~=%FDD8i|>Zn(_#Q zqb+fg!BjVg(R;l#tG}QvtQgR!kkKOIaf)QWf)?4a6?9$fY&III*^k5to9B*8dh^=0 z(7K#6%3=EsMThYx7+dHqgHswww^}E>50Ds&>1w#?o^#o#QVPpYmicps`=vH?_kOZD zP@hllkoZfD-p0p5o#x0;Op!7>;xwBx;vW9<$n{OH&234^LIZU zwr9SOh$Yu&t4yKwwSi)c?aDm*G}c=&L)=9vEqLxLDGGQoM*V$M7vPjXUjR`5gn7Q z%l-&RP3rMR&mQ6nD;XGpE`c5b^Y%zGD^x35Q`Rib2EAo(F~b+!o`^en_%6B~Pu#WW zYLj*|M+(P1bz$w0rn5v{?-;V~;J?kP<4@Tl_r=|3sSUd#vsR@Ynoa4&hIV5u_2Fga z0m|GWb9$_R43o>AVCWe0mUI+=rU338X9Juh?L|HZDjn zzZ+#MqgAF&wm257AuE*)Iz}x*XSmlZY+0xcqmcS7Et@W2pef&!_W2<7UwSkdQdk5h^%GQNahvFoW7i#Ab@l9`3~rtm zvkI1mpH>BRtVm+QVCFDae%l%9+f1W!I*`)ont1!18^@7R)_&3?HTK+&UUNDlify)4 z9DvW;?0i501dql8&(Yz1@K{m~h~Hxl7g<*UwnSzaN17|}bfz?g8(b*D>$cYkzZ2R| zlIZvR_y>LEFxmakcGJ#WqFA@Q>}r2j#JT8Jg`6HC(=uy5Q}!8dXkR0l+3D9zvG%fS zP$~E<6T)5oxS<7*tl{SvK}88p<|tn?;{j2w$H3{F!0VsdLu0-6wpi9?eE%9ItNF@m z4qr{l@sm|)KfD7bLIUvS?OI$Ji1!;{L==w987}|&;-T~#`|?Hux5nRzYx|X zCUA>!@d$`hmv|&rlq#Hv+{GMSn#*2oE5K+vxD6<9#)i+{a`%mV?3zmPLwr1)W}hoQ z`TxYP+5`M*?BP}Oiwb|`SQ;wcwx$NXgMfYN8c 
z%b7ol6E+jp=uuy$*sLOxEa33lZJf#$_f=rIO@yBWKj}38%e}oxgVS?xgbYf_NQ{bke?-rm4{)B;RcF;rEE7$+G2c;@SE0A=o;yf z6zasw4ic}Y(8ci>{^mtuF4qZ~;K4ZEk~$JEw*37XA$XMF_H_@yGU@YFt|t1ldcY~v z+7pw&moN0i>$$7Na)SHC{M&d_`JD4juOKcv#gnHbZGTp4wY>MZhJ&VY3P( zghs^s6+M2@^>jMoazIn^ah(-q`fczh*+BHyz*%C*;QUf!c(6l(Vnv7HMdLwy`svcK zmVXmx#DO@|Hqq;OU}OuI*_hj`{vK{4>nzd5Aa(km3fDD&jr?^*Br7xgBJRuI<&_pT zKZy~bO;1xi};m$SKbY&O>R;$Hu9pV z?CA&P%!ug_fJin1X@61v!b$WvNiGuWxl)uxMbd$?>Eo<{xL*BKB}D)ISM)<{Gop}0 z?ZU7H8=MwyAR3m`u1G2Mt?2tFnQ|7so+r>{6ZM$z+u(Cd2f+AQ3mMCEv1&&#TZ4VN z`rVOxYQ2c(#rBXDQoXe{5BGS*`Bs!uS!el|&^|;~U4B8O{I=<*U}`lQmZ#jz3!|V5 zsy!fzp5oN!%+K*;K_Fg=5z%3LK*0fpSk!tjqm|#8tL@QMaYfwt`K%j#IA60)E4Dt} zVomy?7Z+*O(zvo|M)p%FIM3$M-Vx3p>{`-f=)61##WOrGv6AF5$ z7_~9@sebFGb1FF^Du4Sq0E=%%fVbxCy|e00p{ zv<;O-lHispzm8;s_kS8Lx@JG9QnpY@kwtSg^UiQ!SV9xH-RV52z@+>&40k*bY^ktC z!!tU0UTO0>AA7wKVibp#O~Bse@p8#HD`nChLLigfROt+pKE;9%JPUnrjR=bU_kFUA zZhyP#(%a&6}D{ys8wuKRJ^_@mlRu3g`D<~lt z)$z|yOv)oU)~cQ9(Ujb>NWB_=w$-if?lk0@>-=&!q~Wd5I3a&FDzT3Tt&g|KMk43c z{$wV&NV{}y%49YEU5FR8_!R3)M|68ZAZ81dlJ2nG_S`j^Nq(##o&J( z{C&m*#F1yewVE6oVn%keITyeU>kYlM`bo!(BV5UGqZq{<)=bkf3 z%(Ax9F>lT_D(4YLKwm}Q==iDWp(s)5Aw+y|glP=n+H83C0!XBR_E*u{X!6ssWcF^y zCpG}1vt)jc-`68QR0Me29>(Lc8m%3v_bDx`fm=r}giNOKX4|X?9B^-i7daJBpV_FI zB<;YM5W)VSaErW3XoqE{<=?;0vyJ^_S zkk`9s^v3HtDabD|YK*G_hE?78mxJDr3nrH`ahYVxd?Vz%e6p8J#&*kH$r7Dov!|9iPX3A-RMdcxw2QIpi6<*xf zm25fP%MeL5W*7f?ZWqs&8;8R)MO=E-L-_=@!s(>U;@))E72OG`NbSFHpbjSK2(;g3 zn7z!0$MiOM-O3d^KDVmvw*2k~RP1OHA0GYS*M%^2B7q!Od&$SR1fUm7%6oR4b{h|rWg)i{ zplQLUw|Ic9EB>Bs3Y9j|eYYyCyOOUd(?d( z4aNFjGSH3DYsCGx$G+MLq!J2LS}01ZI<9lZHRU2yyL`5`VJYZ4;vKR^fXhd>tyrkK0>>x zk!<+w7Qm53l)e#(VTKwWlLOjvNq?ZoiC2}l`C`dKk`uv_Nhf+nYk~!e`eHPqb;(x;W?%&{g;Xi#2%ffXJklr1o8r7v|#&v zcbg^?8%)(<*s2MO`H`W$7Yb1)%xn_xhAJqh7C^7_HaNc`cK{Kz2T$r@PGya$Z^t_) zaHZul42YX@hgFN9l9rAP%bEj8asR+{YKbMeVqdJroQ~X>KTD+xEL8yMf8Vm-Zwbi% zVU8#Kfc)RNyEV6C{(z1K|KNVpbSoDW{2$>~{{h}jHEDZjx0be+=(K%jF&UvzzVcuZ z5qDzeq)Q54;}_58CqCd1kKE%vXUdr=OAX^b17!NDtR&wH&Y84~usWXj0qysE@T0eS z-FB-KUm3~wD2mSpFxeR(=<|SFsZmcsdA9P&{{DJD(VP{3eNLu79pr?UVZtp#wDfDR zIzrifkxt^-)2=1)f-eutxLB1L?_iK+)H~&IXm;5sG(JkY5PaHcNfdCNmYD>9BPF6^ zSUVXtckC~CPv?C1=lMAst$R3yI?4>eU~FAQgG6Rp;XMbdIHFN(QAXd&XPZ*-& z87XX%@c}2E7>Fme%4N{!t@@ho{!%VBK5sr9mJ)%xN5%g}q#{c&xHsdg6KLvBEmu$6 zD&4qE><+nwb3cnAT$En?38?^4g1tQ!V8)ZqUxX3?1DxnLh%gk<+K?P?n?cO#0wkzT zLz|rgm>;?j;UCj*72ZM_!lcDR|u)5RN zU+h$oM1qs?TgqBGp$Ji;qWPn#W-34Gj5j7pvcz?aCDMgoW;W)d37m>(>Q^wf0s+QN zA2$oN@w{(u-_EiDnDjt&v&99L3}5Oxuate}&(r{J)R;j%G*N0nE?5j-<#$TaWvafx z-jLiO#d1K%pS}beBPMyzi+z)70CICm?5jH4cZ`G^5NSdf-~nmE1^q5RtmP%XLKERW zESfLZZ0{^#LKek}X#4_;oYYup4`I{`9_a9Zy&TQx`0|v*NjJfL%inTSeDIMw>FGnU zmxl2you?=+qpE52pg@@aJT9#1tyDUDTS_NJ|o~j%+BE z?`(TT*R_H6{liRQx=0t)DF)jw`Nb`-F9$P4#=G=>^Y2yoNHWr}a38dC6?>yMQ7AFO z88Jp2esW``Tg>)-Z%eo$UUoTRyZu^DjIRzGCcrT5mG=>0hnNF}Z}99d;+AaKiBmG6 zo-M2GmML?Ep-2$9+#~a*8*g%PS_22dQxI0FxAXle^Ad52$%LUR?Z7w^vDUOe;jNOi z-ian1XsMpFbxp1Jx&ZC4zdm>)$Z5s{8zht&ul(P^sJjqUyrU7B-+$36b{O|5=2h61 zi1p5^@8KI=c_GU=;-s-dKnS+bgBsJ{%lLDPZUrtPe#n{H{@mr}6&}-{gi`s*&x#?E z&y_z3RW@rP=JcH21=ARMGtR_KNa`HT&VP=_v2FrX*bH0LEhYyiQfCl>kEW!s>7g-|L`Jw{7K;b;LleC)q(;! 
zZA{ZI3$P;dW@-tGe?qZxZu@~^YQ2zC7>b<@6@uci_$1ukwe+_2Vc#GPm4Iu_tMiJD z>*(%4_3VA1-@E<|AsBAYyNa<C({wBC6FxCky3S>gF-j&YQp$vsYr|Y}Te$$+3!_{g^)=W_@U?RlDLsmP zm-QaZJCNNzDqu>SrG=g~obi5rcY;Mb`HI!o+^MC2&YvV2VVSaxPh_IfEQVTGdNN_{ zcM+)b|Frj>K~XL3zc!nUL`8yRB#VGblq?w}O3pdw9EJe_0g)^qAW4$soU?#}WXU-* z40*@{4B=h4ANRBO`May$Q|HV1aHy#%ie7ZMSFc{(cVGRxwMemPHcq~wE_q-o@9J>( z?;H1<8->z0m3hLVsd=TaV@hAOITN~)hBx+ujsQK;ryBHr*84P7GmLjIF+&t5#<8@G z&v4uZQE`^3=dzwoUle30ZI*BPrfW`FVjowg9zCs~SE5?cJt4%Q5}wLWd)M8YE;By_H>d6b&(&{ z9i1#jm9d49lHW6-`{f6o?r>uEhe6c`cV1+V0$xKs5eflDebaTg$5EjAW?!m+VnysNi*yp@>v%PK7*j{tDV3 z-g)vLH#9?!NHC&-UHh2VDM%V`r9r6S&vDhoX(N2Bnx?<)wvOVZirY*DF;PkRMLgi{ z=pViKSXz!#kndUqcW1jKeV4x8>i_BLfRG7s0z^uWfG=`5Zg5$Y%v5uRzgZ@oNgZWs z+lvpb`m8kGKneOd$ebFg=HFO|Lrzv=m{rmY5HTf&jWU2pM!z?o$|u*xVB1oCSFsy? zSA|0@g)^gz!j`DnoEJ37c%hMUWYJ`2EX(J{AP`FEp>R; zQh$ExKvRsM(6~&^Wt9NQ0vigcViT-$3F$#l#^yofzXSta2ht{t7xvlrE&C5V6+W3R z=RkHET(|!42|0qJ1{QS34lN}cL30i4rp|NG@C{bpaHOqi#}RWT5Zr?=T+j7ig6J#m?~tu zKIcA^xizS5;C}aWb2~l4L7&ioqiCYoUHYyqHGfw$hfw@dUUcp>g|0VL#pV&pTE@cDuLLwOF@Z8m7uAv4md0;CWZj;u8z62V9p5N%pJGfAT~|o6kVKFyf*=W~j*9{#e;q z^3zutV`bCY@90Z0VxBNf3dyD4;AxB!^dEKI6aQEY#r+DY_qv27esAet zphmz~085boXE;h)3|>t3to@P}DRF8{P9?nVr|D$N&WM^a*+Tt!e+|;{Ykf4Th4 zM*o-D`_p4$JC#if90C7xQ)OJ>0bK?%hoVb^(#XuF;{Pn+@tc7T`` z{cps9AjL(ezA7pz&&)55Z*0>K20Ar&jm0DQ0AS>+8RwKb{XU>i*X!7JweB6dDss(y zyPIiAnob{ZgAwpXdC+jv)o_sW5{P762`JSipIw-p1B6Z)7nZSM+ey`s*}#muu^|)| zm{6qIt*v*x0{yuQ`IZBjfS5(>EyN-h&%zWZ(ale9$nqkMJzDJIeymJB#dT_}S@F&Nzpf216$vsJ z?oAUOp2*TJX@cNA7=0hSg&#%GgAw@g@DILAZ#oftz*PPSVFNo`UoHZ-865yL6Y5Kj zAw#MFG@Z8j--y}iZ?_lgND~8Ec1ASiCb$&;zqIVG<$BKlwf07#xW=QT@%;WgXBXAdT<> zW8ge#)a@FJVV?NV<77N?>iftTmpUQXt7Ge+&rL`$)h8yzEV>^jt3}0&ikkkP2kYi+RWPI z_bzwawIA0lBTbh0-lgWY(-p3XSfjd>E`<~M41oIHw%G(8^3W*&fB^BFl*q0e7Weeu z(}C~`blU<}1Nd&)YV$q~?JAQ7iaB_~V;@j!GtZ`bvH*;Y5PnK2kPFUZKiv9$*JW*q z_wXQeJ2KR?Eg-1hyUfGslUp+OIfv{|(K~AY3t2z)pp{AnSYI4bZ@14=Y4;7Q&G!ts z00n_8J_NW#Z$l@o0dkvnB4V&!D9gCP#&rS0E3bt(ID$mxFCxc%i}yaxRj?;k2d!Xf z(5E`nzY{qd)};J9=BsIe_55yIc5@#U%r0OlC>Z@$OSyqmc=ugul@IwX%iwi2Ef1F* zAkYkdo`WVKtDWcY31N|tia`8pjbc59{Vs`%m_evc4YAjG@I#)~gE9rK$HPsi9F*nm z__TrDN`U5_d_1X3yBk|N6=U4*^F z3r6+4hmTObaav;ElC*^H`9{OM%TJMOtrv$=yK4-*2N$T%@zWU2qGCDP-@LEJOyzSH zd-hB7&T}=AmyDo0ijsF0*mRc#;2HXwN7dYJ)Pa@QF2@`2_ZNc#xnd2KE=L2rFQp>4 zdk!JyH`is6m7s05y07mL*EkQVk8A>f4Nm2e&kD9HOCtB`^^jxL00EkZF~aER?QV~C zHPKdsmtB7d(9lV@*_=Mb!OMPNFd6b$S1z%PhExnEK>E%x{}{H0At0q`1-=Vy*?O2N zJ@*#hT|L7OW$hPwv-NORM#?vTTbabfYA(jO(+m)cqV9=@=I|RC*`ED-}BikO=7pYfy9&Fa5hiKW0^L~h3 z0NHRKosh8P%F&8_cz9vEI{rOL;dbkqMKAx;RzkAdMTr&u9uQ_zChv4N;>97ruf_Ez zKa-t)l{=<;?dsa1DZr=P*dKhEn1e;nY--YlfF(j+F2toiuQWV-Ws}d_)o-mz5iN;x zce;4O)At83*=?r3yjevmz6u{xp32nM-JB_ii^$*`zW*%GKingrE5`mOf`b$MM+dEM zp2NJni-5Zo6TZ;dC&7cW)o11YhO+*P;PmJKgTQ|hIK)1JpX@rUw4_0-0PXWX2%I7E ztscNqSP`K`8fDIbrK+YhK9+~jJ{ZMOooR;uz|6_Bazn$O0-;Aw{-=~2Y1BWYnZ=z8_J$?%G2E&$=^Oi#oED!s{YioKU-b_l#x z+>oCHj?3A;alvAFMHo7~RXmda4__7`Agmf6n`eO#-f?(V!kg-Jw;BK?Ovsz2uh()-mzzt9 z%3%zi^Wek+I01UvANeT5KfXHgZm2rE(04hNKRy=TC!TJ0PXT;Nuh4C>i7r-(OuLhp z0C<2aOF8?615a7V3gML+4a~klyut>}ShuSc{9!F{ zKI=d_jfqfdoR~xtJoJ&zEIN_{^Erw?}K(Wv@hWEcHJ-e3MbCNs1=^4}3!>JO)VI zCatp6SY|4OM!AS#WBRVa=Q!0P-)`onlHG4cU5IeB8amjxTPCBwTkzV5kAf_7e?+X4 z(7=epWB1{=cf?QqbH}PU)eGZ4Yd-(+ybQELao>oP<0*U|d#L6~dm@`I`|U|@K(OQa z6lA@bawU#Q(?L5|&b*F2E!A==zlFze@BC4Ym))oHt@C7^7B`xGRBXwRHmo`44r2Q{z@8R;JVN zEO#JNPp2V~jj~0=`p~SPWRwy|Ifx4tJ^nqUwRZmXP75H`na%m=Jv=2kAD67Ye=_FtUa@sa^TSSXuW!FDYl}q-Czf3_5KKj?s>W}g|7KYuueNv_0w?eZd^2^6)}>`H&`g$Tm66&RZ2~6ga&DdDt>wk>o)^CZw=8cF?Hn*p01 z0I@q5xfF%!Ra5%z@db_>GsxXf#GTm){ts#sTdySaxk~sl#%|`gB7*Gc93_MSk>pw| 
z4HJIV<{-3NKgbGD9f$9q2>v{MlX*|{7idF&3)+ZS9LFQR(z-K#`F^)ft8-zwb5YjW z9n}T9cZ+A2Kk2Y^W>hIfHOckvg&*jI&%gCsRTs#qaO)z&;au%Q3%vBp>=QNZl3mNG zKNyu#X#j21aQ6ojX?!HB--JP3ZrS2pcLO?IS9ZL+4D>Jjp;-aC$QkO4Tq(n5w6-aAOj3r-bO={c)buNtcb2#Ge9W$E4eu zWzRx3zS+EfLwnbq?laoCtWYR=5s6YzmxB4Sz)|EMbI5wNCc!EXEr)PQc4{Y`Um_=p z)6jl6@`@6@NZOq{9jRas73Z5YA`411xHbfNB6GN&+&Etb$(&%b^}hFGp_Maxc~=bE zT2kfRL){O=)4iqlRpi3&O;^>(EGO0)niW+uZ;`x`#iV_=;=cgBh-#SZpKECJyxwam zTXmj?`L->d55f_t4V;$3(9|LCF^z^VDL%zT2XX;?30fJFLUYXcVD^Gcc4j${lWgQx zW}|7j@C1x8lC0qZ%D2c|BB*zvr?#r!`+mk!%UyGba)pdNzH6i0R0iRvKDxR+=wo-U*(C!5|kun2zCLWbC zv|FYkz08m_wdMua=WDi#4|CbHtz_oLEIJC1BRB?KPv?O#9+s#&isicbbe|C5@Y7j? zi?g__>Kfhd+Utu}8V8|=?p*X97qmj_LyUMGy_;Z5$~V2N?|1W#?+K@`NP!C{>Cgf? ziN3qFe`Qh4`}lA((y@L}(+LmWF^cdz$8V0^P+(^E;lv6lLfsyVqxa$*ZWMIT%9E@9 z6yU>)-u|d98==HTyN!=JcV7mJ} z$o&2*G`8_aulCmhTRr6!c<+h|GAQk>LCJ@}CcA{YUI=^t4F|;BQA{Q3o=yg5cV_3! zt3B%N+Q(`D$9el}tU>#w?g)oLWKwLGL*=4~aXBz}1s;d-MKh`W?{) z#sy3bnPyu;Qax12y{3c)_xtm=kptV6p*FH%Zh6=acP9ez+q;_AUzLqCCSlTqrt*}He$aMza@EhDZ$ zbqqg=bQfG_MeSDzoYC?ZL}+{o_=gWBI~$cVRl$uE zV2vUSGp|*{-qr1^!o@Eygz=~5ypJv|aH-XXb9icO3`~dyMa~}V@a|J1W6}_ai$1?M zeqhw{lJ~9w;1pwG1r0u5F1mDao_t}Y|4!{L`P|!k37m>bR1qMv4b$ru*~JEK6ME)$ zxFk0fhGYw6!sPx-_%*506BW)W+rXhBJpz@uI*CVQ16ja2fh}YGWeas@P?%hRt6|-q z-_$YWWJ{7Cb(`R=E{!T0A<_NqnU*{RN}*$PO9Fri8dO zmu#eqoXs=x6Bm`DZ=e3SD$$tt;aN@4^xX@wn&tAPugT&->lNoklr)qgdsP~0ls3_2 zT3Nk+=kg#yfP+Q)8+^NU=!Yt+#txUiFN9h(S4%R%0+(U3crZQ)83f-E@}k3i3DO%9 zAwttKYqp#2&Ya)_KhWKB&=ib0qQec1Y}g*!p15dsr{woIWeYDP;Bxa#P0!jL^fCiUr^)JG7MpJE#ajQ}y^g}ijGrd=A-H50=@OB11lON7 zsvdgb1FlaYVHqmado@bs6xGE=HZ>eRD(| zQJ|@69_lWPrUdOgU7*vUWECopaxE}!bTR{|cRjjOSC|(gMS?^Av;#)g++{$)IT7Uw z^F}2!#j3>p1N9+RJd0|V!uRKWQ2sHG@!a*sSgMPtl*)L%ZTXwD>rOOmahAAs3adG0 zzpnPX9v|IK!k98DWmmg^<@|Cti+*E#TSFIavWA}+(KuY4T2=XJP1f`WN0-e+w*@63 z;m2wiV^{GUy4l)Xf5^uW%WV&NEQ+5{QQZfNjIYIdHlgnb=+t6r9FYff5`Xueg3Iqy z5AR*`tuEZ=Q7*x{Jha-F{ElTevLwXSVF}z zBpr4_;#d8BD8jSM_Cda0MOKPr{oBQo9FVJu*f<)yw);NXQ^=LOZb@fhs=GT}6FbPK zEI+G2;{$e)#zE(cUd_Ul(C^GW@I=GX{T6z6UtFTFTDB2WNgdAtbgi86N6m(R*Fhj8`j#C?cP zv{Y2m#UV1yRag4gL5%^Rh-i`XvEwAU!eac5LXydH64jiCGJW1w>wH)9wa%Q$wjCr_ zY>Q5-3KOTL;zx72kouhzZ`3i-Y>9;}aP|Y$lk}2hCbBG-b932}u9L0tgE(mx?YfZq ze6f5k;yZ80ic}{JcWQ9SXU?z)=jk;#E>Kr4D6AJN;$T7s+{KzJ*~fDIf*euu9-osp z!3NdCTc!o$6(g(GDligAWV*2FXKFl8|Dp8E&PQjOuy+@J@A)CqBlIpJ^$T7vI$dot zB+0TUd=(27M_M!;Z-Tjxk(}AN8n1moHfy8TRk{U)4->qEJI6rEEaY6V$z&NP-?UrY zELtxx!bunq$s4WtMa`Rx_RUq}lzz!K1foa*)tGbJjKccOZpj(_ zY6WIK4L#3^;#F5$(X6`x5k_R4vhfTJPfI_{ltCm^NA)gEv4wM&Mxg9Cjx4 zKDmVr zAmQYzV3#YE1Moz;b?Ghg(25DP$0yTS)n=N830(Je2*2qq^0AOK`LtlbCmq4-$^?8A z0d%#NCQpvJNXd?d-!o)xldm-5hbFo}(`+Tf>Csj@CYgNBKUgTtr$}Itm?3_+H1sOP zkvo(#@-C+CaH)B}Vl56tdt`H>j=aCbpTgDJj!*5d_Ff)C{CylH%}$r9lCZJ<-sYAr zX|+y!Q4e9)`Se?Ex`R#>j5BSa1Wy%7@fnB4iZIyjefyNfr%^Y9E^J(KwXNrq1Er8$ zlYS8Dq{Z`xFya|MBiF%H6%ToDIA#-z_pUOpYo;*LdVd;}wTPG3ejT|Qb_UAxeOwaZ zsoxul-&WFNrO`M?%BEX$TEwlJXkmoogBJ`-(NE;QZxC>F>gisM;p0|^=OA_D_0EO( z)Tca}y0Bm&@M7f?z9KH2veT>ft~@I0JtmsN?(qG_@5ywUg81&=g~9CSM>B%TzrG32mvM_zJZ_$) zOd}E0q!T&ge(3>zLAkWRq{d~#Fku-Rl^b$eigAdrxkrKZ9*fo z@q>{$aUHyKQ7dxoTzY|0VhU{%cX#KGnge+z{;)23s%hRXjeVt5{$dep@#9J3jUr!H zZlU!y@>{aVtgJ!Qvij7C3&D44fiMDA&62|arQOj+dxTk&qOWyvM*jAJvG~sQo{(?4 z#O|l4yS-_k8!-Ih3fhQA(Ehe(zwtAV-_dwt&qyI!^fruixoxagx*ofzGG7#rm%_uJ zxTUt96sdgJbsR$wWnGjf09d@R)a%*p2}&CYYeI{E4`+mN}92AP9XG2)Y=Emfgb_qY>A zf|Y+?#a>@+F>N`x8*jGWIa}#1<)#o2>0vTmu6nZdn&&1o=k2qsTQ5! 
z?Kq_5o%F+gJee!|ayh>997o{&R(h!Gz#PUOsL$$NlNl$3D-*`P@wZgdmlI)WE4$or zVW8SxD9aviUl~3-=+4mdxVh4J{1^iy2oJZU)*oWz{UgJoP|~l0sk6$uVd(?tUgy}mp(4S*K%iVFW1l{w;sa3leCF@M>()Ds1QU6<_A8+HI z2-N$Z)wY0cDtYzF60{_-ZV2zn>p^skOwz&`o8J=2t3`{R49Z8p|KO?;EYv)bP8FRo z9u@mx(!*)jM_BdraBNXlN96nHp&oiP<*ug=wZM#X$0VTy&gA#>KDj%k8CncAlj+wy z_YQ<9w<)GSw4g;D!_W7_Ub>Z54D$YILD`bQeWeBbF(i@&%sfqq6r6!m<4O)e;qc_Z zNw_mAr!lWIEf5KL`lTptU91anA$x8{JcH1dX0L3DisYnfz0-08zhJi{O)`=YA*Hm> zu{@4iLiSc1q;FR#(KR!oFI2T|FqgO=JwM&!BV%g?`L+BDBw!^ddiy2QjC4}*HRb3n zCe3RkV0s=?gNn)62;~$C3e(<(w2%M!*PluM{c=*KfwoJ2_%nx}+5G)0U;`PIFnhKf z?N=FpPy7S511KQGxcu#HY5&dg*KfmUn2b_&#@N42#C(HAAdk~&boci;XJHe_M>up6 z|EFpHES3P1kS2*Vr2XOVbCw_hQs+b1epe@8(ZVSD-OW9u-y7@<=w@BeuLl3f@mJ%`KtQdguNjbj{o$YOJ^;E| z#86nv@61Z@{S>HG)zwQRq~BI+<+ht;5Q1ob*QA(TpnF>&5|NO8Tdl85F$FTHQ4 z;IKLw5f z>p{zCN1_^QPqzKb4S7nV81m1zK74TQxyR`= zLzcs2xaVP?>TdsvhgZ)c@BwJsp~QL@g!T`U@ayquyX_46jtjAHK0!s}Z2 zO)Dcw=Y}$l*gUJNG03@G-f-R*&-4 z{*-BS71b4W4OtBb_PpbIHP&gz+ssS>E>KZo&Lho#SdYTtNj+-2R}wX_&0wo zOF3GGet;*LUb+ZE$@vMewWVn01H)0l$T*(nz;gHNKbrjv+!;jcJ=m8S; zvu*x{R*5CEel?19yhn9qjn`7FV29@^M;ADVjUdmG#Z$%B_eGH?)b-uMzHob0$Y~|| zjhBpe!wTp2aBigi{Gsi~TFOdJ z)?ZIr04jD)u5gYjtKJ${5lFJr%(7u?n*;oMFJDiVHsR@z&@xI`p(%HLZ5X)5q=C)L zv9+e9CUyX^@v<)ty7A-6bbND+v;1XI%2lfkpYTBrq5XOv;SS`0V@5XbekJ38&PN}S zQ`p3}4{QcTfTB`OxZdi4m?l)!=m4H3VE_F)$<Q3u*HWQQ8gUAo;~_q! z%rEM4ZSC3n9{HFq#-aV|A(T3zmP4m<3XO1JYpm3qDb+~~({?5znFrj(dU{Thp$m~q z*jfL~I}J%3uX5k^D_>`Ggd+xgOl#Q;D#xURI7j3X9$Qnh>sfV%h&TYlggKyCRvUu5 zr3;Pn)E-Pt_Icy+{SDgfuoSSvR=x?v6^@yzhNPX2HrAZBUwVhWwwmf83_{FcF{PW~ zI@b4e35h(L7AC7K&U&{5Zb83*yviM{ncy`vHDtf}db&4l>c4k2igB;Td83L$JNDr> zy}q->iNnq(wln)uuW~p0RrFD&Y&Q_2SSXKw&*tBm~QD}jHLE4Dfm9Ad-0gQ!_N z_etgPYg-q%@ltTKwH>z^J4BS-Sf}B?Db=azz#t@PhSw4yJe`|r)IsF&S((q%k4`Wk zZI795@9CJC0eIKFDR2AA<3;eHNB1txlTzIUUaZ*TbeQ_ohP{}2p?CYJ4>gps8m&P2$1I5c3}b;Va)3!juW zcj1RkS05}XC;OfsA?$(3xrYG~^IE&e?_Qj*Z*%O7TNCa^aK6;awqV1}&0(N!lLa3hATqs+0-^Ygar$Imrg(NGviA*P26he?l zGf;_c6~9|k%=R>C{MeS;xa}n86*KAi?5Tb2G_HI4&4lgs-iva_-J8~vy6y3{nFVi9 zEt{`@YZ1nE9~uouZDZ*|BAqlI_5P&=UIc+A&H#hXqJNI zXf#DSbbcU!FsXRYOk=ygEY+?dHO>cbks} zYvPpndm@(??Q;i(hCbAPdf5K8_~_x9+|MWLvGT@P*5?c>^PVhx@OHV*H`|Gpj?YRD zJinLrW*p6(B~KK%bCO>xXZ-Mh9BfYQ{^8$LUbJ6`rYjyty?yhEyY<9F_h33N6AT=l zgC=r+^?74^rLj3ul|kt~mZ^@0#*v+%HJddbdh*vZ^AL5V1Yc4hvF-YzuS-y%0~qz( zL*Jg{u<7#72I&i&fw|P4f7ho}N@Mq$bGMy*<0E3DpSK4tHn_6ATCXW;J7~*OBwh8# zxEV;fDl9R$DyeBGDzbMXEJe-L&6~(np${i%cC50HuZZPPILcNF1TTCP>Vz7N*+;s0 zn;g@#dUPfG9Wf+Q^_P+EWE^@HA1O3&)K!Iuign$AdCc2!A00A`+n$MLZ7A{|NECS( z6X`bEX{+^!1uNEMNrq3Y6^!}{OP<)NP1kHk=fjOr{_I%VlA@hf>B2u)D>}hK;FqpB zwzK^?`V{?PLiI^is-4aN@6tSF#Gt!i93RZ3-vo^Y$`4h)A#|28Nd8(zaOZSX1iI zA*k?HvtaAnMD5F0ICKmA`cBQhXotFSx;i@J;kFm*A9EZBi*tTDuyRckJSn9XDg~1syXcLzq%}bKap>ZZOAbu zlh~>+FjL3cc4v`c-KF>_(p#FxLMK4HiQMc1)caTnaiDK0gr> zx|l9f?LrV+4)kBHZLoY`)nw)`iUWD&dDUs-Y>m`{DCK<0j;9Q^Sc97ag zYqPJrqt0WYp7~aqE&4*A=(x>widY9X!g7D+IF)+ znjnB8qbXLOdcu$4xE66#SN*Gg$Sqml6!Fv2OC=#6$v;l7;=@n+fMbl6_bfA36Hg51 zFEpnQZhB$=q@!M%_UAtVqSI1#c`E_7U@0-%+dlG}5FyX1~ecVJ% z74wA@N3HfiGZ6 z>pLq{hO5)42;Ji@NJ9<_l5SqwGUT{ci{szcIz9kaibA zgK>3Mz6+t-c0Pi#E-}42V4e5ROEZ|mc*?CK=}pM%z?Z~}C_-Uzc$dgG3-i;>X7yF- zTfoP~hWPh``YZfq%TjnSLL)8&(vJq>LXAw+zQ{Mz(zZ*Sb2V*Bkpml#^B{lTBK@k* zE!SP-wda$DCEZuDJ?qgfe@H1*AD&*G^Z0d0h+w@m=Iz&poRf5G0msqoj;9hY?XUk_ z?a~Nfs=0NVGBzJ(m?d6pjfL@m54D<(E72zkS6lpM8Qe>qIL{R7l=PxIsXQ00EfXKD zS_G$)HyVDkAvvcpT`-M4dC~I?d$I22mYzj8iSJTK$K_t=4^-I}ZoTy~Az8XRZb<5l z>ceU(*A!^9EPsuw7Wb9AKaXvRZ3etVR zqfpuBzY&1&T~cIiJNF5$vG)U8A?(@fOlrv7R|-c9NHPP%Y$Obh<{$MLE}>=!=whhO zb10=}?{;i)UpbjYKDPGkJJHw+lq+4}9TG99d1-@K%5tc-oBihFW^JFHLFKU6n4COb 
zU)-1_7m4CcK(06YHJ$oA=>+B-eFZ?Lv%6AJ>^EMrBf{?6=Zh8Bz@p4_KEpm)lSVu+ z?D^wR5*6XVPt~4PPaPLq=YjZ##+(M;#ZcB~WWmVvJmeKl$S13i_5G|&o);`GGn;!_-59**{~b;E1Jt&~r6J|2cZg04EbgSH zs=@BS61=>xw()>iaOC)>*Q(+rBV(xMh{0WA$F$P9^Mu{Z<)P`AKZL+|+e@Uqq4Nc{ zuPz>p6Jp@`Pf%^n&qvd%;7Jofn)W6fot#KWz7(4aU5JSB};JI znH9xJr^`fz2XL|c>wKzcwpsUzkjaJn{2z#2)&%(=QiCWJ>A<{V`bB-LIjT#&ZwP|W zoBJb!%l3r5$4VHJgs+yp=H&do%ncq1rATj!6F0V-)U8C))HSlD+1JeHjDJ4cSVTXZ z5*KUjDpavauppbV-pDrklly5|->%c_Fco{R>i=sC^#H9C+Q|N?8c^O4=h=<>m2Rz_ zix8TpcILyp;E|~u!9@Eu`wBz$gqpi zZ&NEj0M4u86JHnZ-&)ix38`H|TKAJ^01nFfm)~DP6p+23+sa?0*C)~(RdTa`JxBT5 zMt;SPf28%y82_Vb8!Hxwiwg61R{;-u2~uY&=dNmZ7~MaN@SnnCv;dD;*vb0vua){g qa`|2j_{9F-KmYwk{a4$~UkWtBZ#o>rc z2j6zZ5p`5cjiBDU#@TBxPzl{qc?JvLyfSP<>f{1 z-Ka`jX$`{0hNZv@}%t&_8;^Vt;L;d~tq7!|quD(eXqt~6cM)F&&P?%FSE z(4UN-J-hEdF}hhFfZO}_3B<;VW98FqNafvhhUdOTk_@{L(Vn2sR>+JCher2Zc*Lbx8E17r%5OpJ*0ajW1;bjW4~b(Jqo5jiUwP& zrP|JN*ycr4*`)Oy-HaboB)JJ*VQh(GjL05{`e1FE>V3=+of@tHAO$=TO=R)c9YlXUw^RT77r=!nr)O!bnosyaYhEDE>CSI z-;nANsaQK%Bl_SjP-Vf`{My%a-d!XiG2m5ggcav2zjut;pdI$%D!1=UP@B)(YqM34 z6ZurEF%kW|zwqYiNY933ZdBmbetURmr!DLKVW6)sYdU^S^C$0JDG{cO!RId2afA0< z``3m`IZF{Z9~#^F%DZR8M!m^28T$d0PjLjhH1vfFc5mVc?>}sg6y%sosEuUt>I|3nA1FW*)ls;@fQmltvieyIMAg^ftXThd!P#-8CzmyWQ8=zZq%} zD1D=-z}F6Ds~~xVC*SsZh+r^S(3Qv-Ur2_#o`mq*Qy-$vZ~jk-WJqZ~Noy-y8wovl ztkwO{Lb@zoi2TM|nXejn$+wBI`pq%oxwnKL@2(Rc z5WO(xRS6R?$DhQr`KBU_6C3=Xy~mXqO!DA+#)4Ztp(LKrcc)(h^$%!*&Au<)kj^EQ zdrrk8`^P($dodl472n-3cuUS;tw5|9S524M&1r>0DJN;+`UhKhEKj?z<>}LeKkhJ- zRo$$*s}&;h2GHSQ?)-|7$MQqD{!LwKhbYhQl@&abR*qb?H;lq1f+gNIzeD`EV+mh! z6_x`(9>iUXi2jkcn7Smmc(m})wY1)Omel!CTvzde>Mqx@xj0q!d+zVZ@3d{%U8seK zUD_L(1e~D)Z4spMq@*`*Sifn1DtqpT16H zl;>q^h^JR5%&p0F)0Eht-@x4{O-@dZP5!L?h1|^ky>`PGPhp>Cm?mw{o)uXoi|<_n zmHFqA3eHS$rq(=-CxqnQN{^T_DkC+O8>D@oqv=#r1VJ{6?sz5rt^3LnOOquyXSl_<9Fd&ph$g--()a_ibtl zX{r;~YtabIKD~y1%=T_mFblbUiAk32AKvubka?e zA4W{7*2PLmy|OwnMUnPj?8`Py$D@lr+w%+93d98zgf{3>G$Ep?GU&L8}Aa} z7YGn|^AN(%Wfx=5X5Z+LZ|@8fFA(`8H|{pR=->nsEk6t`leQfh-B>xebMm3Ku{OI_ zA7Thm_SfvvV9_j)E|6LK$fXGSn2`%HsWb6bQCD^enZMaA+U$Sw92g!Lb*^@%c1DCR z8)g@FNT^B(B~BqW2uFu|hLglRdExvb?S;ULy7#X;Z!hsLNpuRSnv;noktZo7QRP<@ zHnYM#;s7Y|ZSgMOP+x)llYwpB!zdK7*XuPKZ;MLAmxrw#Ir zcIuj3m|Rv|bj-&s4m+ni(k)E@z?pE1CCjSm{HfKN(n3zCS2==OtY{7oa_EKdGTWx! 
zmODB;Vz}dXhv*K3`h=4I#SKe_rAmg0;r7=xOO#!4YDa2QYF(M$BB!lg(Qjr(8p=f zkY%Z5u}c^=lX9DtTaBdrXj?ti&;0Xf>Y+y)JVi6PlB6PTGHSp5pXr$5TeWY#eSK#@ z>CUMukV-rAC?YO8N{m#fWyt?$?A!L}ZHoJxn?{cH!Y;chOhw8gAnILQ06AcFhw+h0_^n^VdUrGCrkL4;qg+nv}ru^x_(guzYCylNL!!rQMX6$Y=Q{$ zO;YOa&`Jth0o&1`R~wC!3a4Q{PrC#Pc-Lf|wRa-L8^mSi53QN|1UdXukx3VQZvAe9 zdq?-D*(b{WFzzunXufHfQeOnD+1dO)-tG_pdt|6=m{EI#!s`ni3H9Fk1gjypCof}t zbkc^3Xr$kufBEvlZ|zV=$|7+2VzQ>qroD{VEV|)`o-&7Wuacz8vSZUp(zyDH6mTZF zy3nEOY_-|Q)ri38q_)vXLYqx5-AuIFsjapXae(+S14q1rzs~EgEwN_-6!@%3`&853fD^yzBo>nw)nU2L`uv`Nchj8CiDQ>DcZo3w2S>rd-!JO!?5&9U_DR- zleH5!4M*Qv0!`x=U01pjAo4_5*!9V_hJ2~46XMljd&bs+=9D$nASl8H@zWnbZ;egRy8)ek#j=O}j zgi~FUvq|GXBd}EHOC5x}5~+6a={!ViRt$>FhXaA!YxB!z0wfhAQMA319R9iIUsg$$ zq#hD(Umv{VM{FAO==}Jc)OLL(se~mv;#Rh1R;t9PpDt$h1f6y_J0XLcqUDc3)&LO^ z!-Im^Y={|UvnU#bF@tqkbcUNnn+Ew;?Uwe=gdrW_HE|I!QhwRHfo`9U!+IlRX(?&N zq~wD91k*5-7Y(P;yP(u^Tr=8$zT+d+Zi=r3^0)+^!mLEWLf3I~I`E{MiEscv`(q@W zhWal;0eBaYwyVDYRo54~AL4wU#l7*u@;=+fXRf)M(l?6a>hUK#Uo2Mb|2o)~w62uL z`L+J*S1r*azh`(C)oBgqkd_O|klao9Vk?fkEm12Dt&}_##p+t?E83{4;;><_Z{XlY z0B{JfSGd^EL+l3&b`!&Ku48}iVL#7v@c#Mq+IkNDKd#Q z=>KdXioL$P%}r1DXA=(x33`214LVsDH)}dUuBTj2=_PN{(b0*!S=oqcK7a9#?%4lH z(A#-kb-;B@zO_ITyP>Fmz%*C78K=ef1J zr5nK21K{FJcRB7W3l~oh33~cVLjU^r7oXNXfPYhRcK^q;uoL9Ie8SDc^_2TxV`ICD zU)~kf0Qgut8axL$Va0=`At@vxApU3jzdiXk#sBE3|L>kW0zAC`+4Vmj{dZR#cWXCU z7bh%D56OQo%|AN-=fi(=6z9I2`hSSxFFF5t7b|GVo8sL6S~SU)?9XvsVeThlMOLB|dQQzL z>z)>>PIHcS^VOg%_hwk;YunYq&BlGP3uq>WYYmKQf};jc_nX!vU555*GM2V<%b$=F zkV@m=Uj5Y_5c2Nb4We7Ok{0@<$mM_AP8uhq1CL;xEV?2V2k!=5@Ks;Z6gNn7pmYds}Siy4q?LI%qE0` zbh6Sqx<6YAPNN87*DbL#yU(UwfuxjpLoMbt$<>#_4q7dHz-eeTGf||Y{7hC>4l~=} zUO>*KtzbV<0zj1-RK2)EE6HKqmwI19*TbXw6oYK_GOMx~81ZmK$VMM+B_DtzM&)yw z#b8|`<;hK2M(_>Pvt}q*M2qmwqbL5T9o2DCr4%+MWDBOuV`GF#2XRl2Svp()jpvmL zGs2qFW(NIMK5Ak^7%cGgVjn6c7Ey_Ombbm<{wnI3yvR-S?4CI9bnRR_!+AYbLcrCJ)avvE+NUY@P zYT8v}3#A#Z#V+F_+w0p`@)AI<@TmUC(QCQ;E~>;8ir6qYq5!MU7LYYT-_ZdV-N(Mw zrdAVdc&;{4u+WtO?Sf~k?1qfZ)P@_?#B7a}kXcnbH9@wWEu(5s#ptYPvM=g=XyLQn ztU%ngbYWOqL&K-+Yis+7^PlZ<=2(BYzwK_(;_<3cv3ognANY+6^h8&Q{s^-9HvB=- zdiR+Z#MY=FZmfkbOtQve>`MKnqr_#=&s0gdJ(tI~oVkGc^|L)`+rK;hajof#T*z>6 zVGEdYB&#Yv>5=q3v8HQ=WG>20WhTBesU45+Ns`1CBuyy54m(q6py-J(E?bY`g@BHgI%VYHz7@bYCIIH}?%-@yK;$K( z_6rG2{y|VKD7mSra27qtvZwI=-VPnmuS+IavN^Q+D(1T<@FK&KdS2ND^AytAx*-y0 z>O8k52p|Ha+4VkzJD#=8tNAFTVjgZ$>-G*0rS8BP-R!-HOy_w`kGlv7dc9)o7WtJqs~eTw3De5H7c5lFpuBe<60I zy2J#NogQ`!;n*yuO=pD!%$4KHJ8e;}A8fb7tXaC6$`3~kxdqxb$HqKQ$2wr2WQ+?; zlmbO5I1AG}OM&(m4?Xf3$z3HOSulOj;r{lv^5l1v078DQCOh=ntwvgck`TYStSOn3 zl&(9I3r(@d3LB^N7Y&^)?kGWs>&Y0a!Rd+>tn>B>C0OQci1=z7`UfsC*QSb-YTKs} zD;DVz8MC(a<8z86jLclf<8zn% zoWpOTY8blHLIWoP+XJEQdyYsa2${JmARN+|Tf06GH&(0cH0Vd2GE^kK=JBP!o^Hr&}o%MmM{910pwLp@{`ftDex;K!8%_|V(gWMo0RIy zSlWJ;T-aVCr+|zF^oX)yHeWR(W_u8c8KHxm_)B_d+X5VNgbKG3C~?hOBN~J8$EfuE zBn~7RrgpPr{MIt=Qi^PA9kNAp2n)HxH^dnEK~*1qUeW|7-?_@4R8~nZE;LytXg{5t z%APj03MsMR(n9h$AD21r|{HL?}GBUmA$Qk zg0^<1>P)Yz`$K_|phNxiB*~gCh1s||PgRG{t?F5Fqq9CUI|eKgeFW>HpXH6N+%flv z?CA8vA|5t*;DOZWwD2=1P56JjAB?&{uVq_f?H1)+GlILOO+&)WY(|(bpPIo;{TpCS z8A#mk00eFb40$#SJ|v%@Vo`I|M{P+GrD)$`gPg}`4e(1lme>lzJo#jUj zhQ2w52**$$37X2{y;I_7?%tF7gi0wo<;f5BQ z8g+rH(qZOV*4m<@@}`7lV9b~aT#+$%vhe3guqKa;FjQ`Zr*3>#xA8( zFBQFF{97HxzUt~IFEvFM1qLkm1h6b~~^YFX}#iLI5wBsgL7fO4cC^vpUx9RR6{!KT z=)j3O#oExhn|al9M4h5CCkF&gnf!ce^WK?}rDsKP`=oj%FvQ?%eqm&_GMgQEOvJ8r z@HnZc{pc1${FtYrz5k1tN~@m4N^j&mzo$--b|HVwzT@-Yd95((E6I=0#i!wR@ZV;c zVpfYcW%uP`nd$CRgSsCXA8C%43yR2GlswY5ccIVqa!i|RwPnjnj$JVjs#$yecr%Vx z$`lKdK5Wu86CS~Vs5uW35j*tsz^-R8#O-AsL`d|v*bxhtt*+72(=&2#sDvh!{!qJ; zx4-ODH#rt(rhQi`xJv2%y2r?k#hJdeNA$l(ysTKn%l9@#?JE287aavtEaKgh+?Kn- 
zr2jP$I;~6WeUm-&f1%Ta#oo)n$>{&NG5>=v>HN!60J*N{|3b$XJL4Wq#e?4*O0d#X zxl1RV&069S^?N#d*cm6GUU6UTN-vRr8a-BQTKXI0zo)Z&?ZyxbvfcFe>%%LD6#Q1)VxQ=X8mC&(?|;o|)L1_|w95va<6v-od4uu%k$3az%}Ac0 zB)7FerUv(hbC(?mZ4l^-Y}fJx4T3w$-6H;PGl-YCREZJKmZo+b5gK>io_RArn_v%n z*fX!~X%@?_ATD(Y*DX2f94SBX&a4Q+&cs_o@FAqZRE4#CBG+K-^f+5@@-9>vWR#u! zt~+dRZF@IS%FNWZ&a4SJPsNIokkBpC=3toYJR;yliT%uwG@)Og;4J!rZ()YVA(mC zOmZFGU^i2sL0fkkB?`Ih5Zo7Znq!5*&tv9}G%je|Y_rvfj%-(YkwK-YQlQe*`MGJu zo<+H;y`cw)uT8*MfhQMZ(MZLMd00=F_hI$g5n4#9kJp_q3g=ADzmQV+zcm4u6{}}S z+{J_?%_$>H(f%`!e$5z~y@bxbdxDgNsxo&s`khe}YE>m_bLdSsH^&fBH~^+AQXh$v zbBeehn+fI*Bh_jw1Ge%aa?L{myhMtF2(Dl6tLQ?;r#J}pcEh9+%ePt?efJn|OZ!tL z1un^i?XUEu#=~BS)1|N{r-Tj5H|sRh%?x&Ae(ic69jCr`{CaWPo0*HNN3^khL)qoS z0KX8gpO!`L&fx&U4|$R$XK9>RARNsk3`%iKg&^z)0+i| znR%GG!F?0~b|K5jBMm2`=ed$ys;a$DfiJHBfk6-Gq>&MNcB`n?9uS-}-UPzn~R`YN~d6prI$NOV{#zoyhYfW?j<_uZsI_xImwq4Kfpgoh{&-$|D9c1T3 zb!NF)-L@x6hsrTixi>l4=DkA?=UkcEQ!Vc{+P>u_?$zGbt=h?2kkA*Vu^_XSte$&uAmGb;EL`4;0;=Wss5wj_D1>AdZc;yBZw z-g5SnB!gGpw(m|0hMCo}CGFSwvWxXR_eFQU69mzJYD?RqYOh@wZ?2zCvqC%yKxGubI__~GDWM9nswz=jctt=(F>Wt zb-9XP_L#u+ww%OJQTe+=274HiM~Ug41*X-iR z($-d%!Na27*}L%3MJcSWNHMOgbfp+c(c9~p!N?yJ=zIP(u(q{BYToDK)^OT@y3{Pp zxWIzQD@XwI0NrC;1;KbGO^-ti{2CFKV^RT8h3R5X$ah3ei>2vfRb4 zqMD2q&V7-SrR?!q9f3v zs&4*5)9(XSP^lsRFF|dTpji;f9I^ftsDf7m>x2j`Y_#>FK_w}9>_U?{ujl%Ty64|1 zWW-@WQmMT$kqmQuwE~lMuDeQ@5laWI1FZsRw|XmA!Z@?5uz#-|Cb*+?##Qnc@4@|0$cra{InGjFwYQTa}=>ddE*PHJWx$qQ88kpMCp)mi{fO)8soxw}Or#PqnN#TugY^nz ztfVC%`}D}4xA!f6{wi|=)^Z==hoq{AP}J_zeKPOtU33e5^Y%e3EF<_QSxxo z^c+cuwnh6Dntc8=R`|LQ^UI_R@Twdx)dy4DVo$uIv}wz@XW{GC3F>)9+o0?Lh(n^HZwgW!PU^Mm*GrYRHMqBG}Um;{$1roJ`Tda0n z)66Bs4g*iMb{cD{bnE283SEMP+vkNxw8heN4w&1q{*;qe<$REEUA;4_Qpo*4RF}<+ zc|7859DE_69`hB!|A*<&EckfNaWw6Cb0IlbvMQaV83|;t>dbwBsQV+9RthlPIc?(y zJn*v<;@xq68dSwt>0aUVkr6 zoXl-=K^wI(>gCW@$PaE6(c(R+X!QirdxJ zr)>nS_ES>wcqLNLz?h%^JqVOc(dwxq;AZ&)=-GnnuT zTPeTqkVHHL<0V<3K05`v?SEU#1EmQ_tRFe=yx!z+CLTmR^_@PLNOOwruC0 z^wAb3xN1bS=6rJZdrc4q&dW2`%qzfxo{mGRXy@nm*@`nhExTXHhSACoC0iicT13B$NZKW&%zp zs&)*1`GtMnFO7rXBeF6#!{QEoaR=+nC6Bkoe4W1YqnG!Q2w7*$QT>b943}>)83#4Z zl75*1oGxipkNhmn$Kqv1hG)q~D&3)h4ZE?n%RgX}se*d^$yk7DwRc;u8@l$or3;dYNqD*H zab7zfg6UKjz4@MwnJdh$W!8vrC&Az{-1R7C=?(B2leO}+3y0%mr7Zt(x^d(?j{#wB ztkMn5blwu0gv z&68~(kMJPFzBp#<#8iKkB1)Fb_k;u2=lz=JB%!(TyBvD_4JxKo=fV6%MgxI_WChRm zN{Xu*4?p&Zp`Q1Sj9)Z(1I19@kbtlr;_RtBTTty0X?pazEMJmo*yR6pA7QB4jDC3hvug-J zv(0)cmap+6aK?hPI;gt+VAloS_IIBO}ldH}4!MrL}f>|IowU(1=%Ai5i{djaks{6j`R`1H8gA(Y7bCxev z0NRks65i@y=JjJKVznJs>AB{NnXe9X!t8KM?M2i(`+#}iw|{Y5gE~vu*kn4qc6MD4 zlQ~=|-Ij9w)%0j1>+lY>*R}QNhzCcrjv%UuLi^DSJGDKEOypf+=?MY!8YFF@Dv7>Z zZwmo2hAmebuGy4idrQX=Hw+9=p7m1;`}1{Kf}qhED$aoj2JsS&P{9=TM$K=ILPEF2 zl_mMu#RT#)e7%D6F=($rmys`O0+AxGMSPOqwr*?j9o1!YS(on88 zpmfyM>whFSH6T!&CAHL|B&XM#W3oDfXUISPe!d=p-1sUepe?(9J4Bara|X0uVQaQI zIjFhIu!~)-!vAlpMVf=sVT<0Rj*(=$JZecymr9f;uxE@Vx#hL5bPq3$xPNmmd?+^v zE}NPa9rfE88{Ql2C|{9JnpnNQAFl*_kNL%pUVmTUSQP!a%LI2Db29ftu3Z)?rWTk~ zWvG1cN{@{cSdlB0l2b*(tt!qufY~vKO&5n7^4Ix=`RPb_H-MtSnTp%<@pf$u*W>G5 zO7pAYY=NS^;_=@3{b{HIRmEIDWT)z>Qm154`PoEh`9zoM=3UOmzHU>flH-pglw2RA z;Jy(f3#}9b23c7n-^59EvWb6ML%!Q>S1)UU6$UB#?rN}wmqWRe-ICjt0p;o+vYnq~HS>K`BNZz)qlEpu(A&>6|b z8Dm%zdX`%ag~8jCPhLs=_c0Wk_oTrFRyKzF-qiLD;H%Ww(VSdIZf;11^39AkKTdl~ z%}m>r>Sk*@vQ;T*tWEn%21-EL2O%U0idd+Z?^y7ehX z@$Nx)P6k4Gt6va`^T*jBc^bb{Eb$P=-lQyfEB)9HAplIsp8%NK>=x9ltM~#d$|R+A zY<S`3Svf_+ib3(|8?ubbjx#>!zboxnlFfqcfP4UesVe>{G}$rTe`KhKx_&B8PbSNTGl*FXOPjI})Tk7=Zb}jOvY$n}&&P=aJ*Xx>zRJx!ZHXErK zCTyt#LAP?f)=diFrr1fBYi3zArg15#l)SC0MJ^&|Ym5>^@9n3#(Ag~*MVC+Lur-YI z28bgt1nBLWxEiY3ETKS_yqCKcYX$Sef-T8}%%G-JmjQPY@xB9O;MZbL2^!L^1}T9) z>T)lM4(-;q-L*u4PrkEX&Y 
z+SGl9DXrQP7#ON?GKN`h-O-z-G20aG`n57Q1J7Na&RY#g++xWCUOJL763pM@pCJb~ zo`y+E__0=xPxV7%zza1K#F^!Z;h;>Z(T4dcob_Djn!?ym2F)VqVFud|2YH(% zEpD~=;&MkuUlq3Gx6z=PvHR~g(`H8owq}_5R7Kz*O#7P5K#G{>e64!raQgNxl}c8RO<1Y@=;*11V=UD^Hqv50gSIdAT|gqd z!&KwkM}t4Qxz`=td-#w(h}*yxe?7aIZ)~|nN{H_S_M_PG;{BF(9TI?k?H?D*3+R1B zd9?X_%7sSIAMkdB`CZF|sSEH1AVN@ebi^rC&~NmUNpa`i5*`d607;7|VAgrjDlzWR zOAY*^LXy;=FI)Ni@WfxYQTxL#gr~4+1c01EGYe=47fUHd6U48*XZk^#Usd% zpcUu+U^w2YHwuf85o_rDGN)W)N?09FmJsalJ<(|1Rt;yKCOXkZ{Ca;KV0H4f@#u8Q zvo4V64(^{gB$qM0K?vg~xkCuh|ApPl#3rKze~%RZG6=?LhH3+fh4&-iO@fXSler0B zaZld#6bPJk@{z!qxzCXdX`)LkGrI29aDU9gb;+Tw z=0KhO#!A0A=HawSwc&3$ghdjgQ%#}=&S}D)vA#f-^4N@R?V7f(T0Oyvt@I zY!-t+4t^N2!^aDf?4BMU5{>rzKuwacyd)>mwE32bwG&v?@B`vKqUepLHb7TVP}LnC-nG4mj+zAy7$Cm=iLHJfguryT(QU{k-K6HU@aEa?wC z3{R*VY0)8LHJVP@W1`>7@xe%p8mv6b@jh{ryg7g3Udx2a(JhGGpKn#SWi_BzX%T9@ z?vJ9b6tX_(oQVg`C&f3d1u;mwcdCOIPrRa!K4vHVRG(+U#t4N?lecYxkK3;+k-3yK|J5ik76G`WL4Gl#w?xdN(EJ|V3RK&&1&j3d1!l zA3Ih3IerA=$vTf#xm_quwU*wft|Q!9)3>85Z8wE~X`l1k zPBjYTEimG1*p9DT3~`RsP-~a(xcZ0dO`-rLB?^)Usxv=J zk1a|O)$=XgvTlpZivEbbWyMK29$9zGMH&q89xPMmz3=r!+hz0=o$&JdfUHS$^y+v) zE=Nvw0T%3jg_Hh)IR(~+>>Kz_%=Qmck=kq0lzryG-*Bj=`alM z%k)2*ia>87^VZx6z>?L-?k{#CKI;|e*$4a~=A+3g{4iN zC>5@kj4j$sw6r9$`L*XfYgmp0X&VJi?-VO8@|aCk$5som$}k6r_T>W$HG5KK<_imO z(KkWKURaNb`e8`r(0uPllUdy7E(@!U3rWF{1J@1MRwH9v$~1L))C)6mB;G?GScW!e z{0rmBKo~yr`q}FBQKyxyM1^a)LQt>=-@L!4)A$xbxw!+lj$C1~9i3vFUn_Bdvzi!T z{ni=N;Z7esH+Nce1}H*X;OsYl7@&2M0m?cDT#-{_WWr_^SSf26;zPeJp=K@i_Ai*c zr6gb6TMJYF2na{SN_ZO5(42(ZrGro}q+Vso#=F`8!>QlJ4w6kaIn~7k+V=X+U=HlK zos0xUUBsxa2!<1oVyoDqGCJrV44-UzW~{Uhh;8Fr0MjUg?q*KQ@r^MRpC>tq9Vw(l zt!%4omeADa%b@$jpVynWkS6K=IE;su(8zj8WaU>G)Y^aEj(u0nYOO@UO+U_V<~A?H za+IDdh%L}C+#E0ANMTDRvg0~uzIL#(vivtr=b7(hAVZM#94dr?&ohOOpuRFReftJ1KKE)xesc zfIGi`UXOz{a@;tg5zl{T!K8z6B(i^JXtABRg0aTwR`B5CzaLot*^5pZYWv&aaYAm< z#qZ-iN|OCuvCbM1Sokh`E2rsWY)n{bXSN~NXZRU5#oep#QS%g4=}s?0PE)9C#FEteO9Oajt|R{v%d;e91|JSJIHC ztHFEydUtPI898GE4?IoOuFtJDnJ0TKEtw^$r;JJ58x}Pc9Qkl|CD- zZp)Le3%vj$8&fs!`OEZVO#_?~OZV6JU6d8(jokomEB9tUpIMEJiSPm^0;Yl%;^)i? 
zR`V{aCFJ0lm4Tyzl!p^mXlzCk!x>EIS=1ZyMRxBp(`4LW{BhFkL3uJ!c7SD<4z>cu z16#tg0WXMN?oVsq2uC?GV(|x+iKE*6e=5!YaHK)xPvWJp6$a&8@P4!oSF!0!L%~w+ zx6k&og~qhL7C4%ViJtg`0w2DK6DQ9!pV=bI?@@)-rJQ&c^+Y7KCW+DbDF&YJ5%*+s zmx~KJg16`WrFRfPVaoyyI~S(4rmOuJEkgkpx`BrTzK6E(7|ecl=KJef=@WCs+`wcsR+@$~QI`2n$!G8cW5Ei%onR#FgY= zXyP2UD&+0JK3C=hz#xY5)7J6r_g4ovFSGcCe1VBu(*mG9Yvsl`e>k<_ z%l!$mH2y!Sr<#Cum#OD~lNA!Li)tcnu>TjQ_(@BV%u|f+L!>F;JL9{YleO4 z*|WWYSu>&`F@>q^OdOyfWC6@|rShIXaW#0abeY#gyZ0EPz2}(GsMgZG0aWW0%!uK7 zAGQ!9-+mJP?Ah$0gI>8g$Qu!xys%&nfHy`Ge=mnusB*dJj}A`4mRR)ofdlJx1?jt` ziW-QHNG&hmStmlX*$wU9h9;=#kgWDd1T^0N7C-;V)c0DvYt;6 zlumYKLEpF5YoVc`xdxxpkMbDif)*+~HupaS?6;Q_;T`Pm0{rb9H8d)&`K(OIto~7B z^^=x;c?kgTgJ7$=#GD({XlRB;*)|>*3Ru* zZxzMA#YsxKh(i7H8WsZP%_-+iylFRr@|cq^NEQ4{ZB4O{w=}UFzYK{e2ZD`uXqZv+!>aZ=#f)9CUELH(@}{Gsb_*@K zHBqɦXO{QT_f+b6?t`4krgYSVt?4!0y7qjG;i{%-DE(A7OLY32cv?`ycg|Vr0 zP|rO**h(bu#Acz`xSMw)V>l_fOs0i|sy3nBl{!>bKR>PT1|C=vZB{1!SwI-K)o*a6 zpIWHH`}yiEHnV!fi|LXgBd8Jha-Qg$WADFju2(QSTwE37w@*ej1aqoGZPKyB93{Dx z`y-NUr_AdoSdG|>=V=7=_m&5lc5OO=h4@rk~h& zKhK@h?(ElxVN*maR@G zEYElK9~id>Bep!eDtQ9^Od5ei)X^aCa~l6RHU;) zrR@nqwY1V2uWtD#wvXq*4#(G-V0twaxrAQ|DOjYY{rV;4l`i)3d>RsSS*^?ydk}h+ z8T#{{bP<7%f@4wrmzb-YJkX?*G_^Wxwbc}O6LsHBAq20`{+TsF<#TM1BK4^|>D9;J z?i(RFbP0e_OQy}7$T5t*=^qa*@DB1HU&ECmqx=j@*Lrfd?Tb*V?C>Bk+vwzAo+`^B_o&pNZjIk_eHq}hx$4b{N!Qeg0l zXEsobRunlKokU}G0#!LgX%FkWs{&6=0@0Rl$vNWHTQX1)$!wz;i=*|0$^Wc|myW_K){E}P%%i}ztX>M&{#!&`_d<%xi4so^xf9!XRR~6&RJaY; zL`I4*g*k6o_`L(kX9XFxr0o~qZo$}44iyyN_Vo=ND>8lQyzZ0YINPYT6SV5N0mq0^ z5nOEd(dJ$(=viU&=ZWmLYLv(8#hP_C_tOqKM=L$MXhvbeX?%H~k~wuariPt(+Rpv6 z9x+Hjs#70iZF578wv)QdeDo~TI;XuJYMa%)%7}UBR!5M=c-G#fmi+7^x_=;y;+`*^ z;p(6n$C;mJvXE(jet!S)OXt=Lxl2$H6X2J_VN#K}1@y6}yiPs7CPd?m5^DYhAVB@U+ak z8R`_zrggS^#>)idMUCfFoO!FpYK%woBTuUwn~8^@=nZu;M-mV=u?;}!JkioLoO5mE zFx@Lp8_3eDMa0Cf(LzRR%8gkIO-es;7*{{LOiDsH3u)mQIR5%SH7I$Hy^Qi+tn96x znLtXa)(W78bl4L1V&dMdPHgdRU(;J<$6!;KC&lSPh|tC-uq5zB;E88sjPH&q#B^rX z*ypm=YLYFP9Z*NSJ}2kr>EHkot9D|e5=6#dKS5P&@$m5M%@&g(c%?=z-`Y3}Klm$% z@V9+G^_-5;j3yGLkgq{!_701!zh#|eHNRwzHyhx5$le5xApPMet z2W=k{MZOca+iQ9dFW#`P)DkbwyL(u`Bp>@|N3?}wtjs{M>kS#>!WHclM>Hez+az#{?Xp z9sQ#Y?e$VlKsvR3*kfk}H_&HG{nmwI!bbr1k8xm2Mk&#qEWAii>q<|;(wx(? 
zTdSbX)wWXZ*2BW>NMUoW$x}#RbO1lX-BdjpnXX7DEUC#X)efgw-({Z zR9e22Zxti`*Bk!n*KswTtEIC5cZAGDdLG=Pe-mpv2`a0wf`?SzVpZ(UP+ zN|G%B(lU+ne{%BWO~idS*g&pU?u^IH=g*%j=<9!Y8g|aHJoJ*bJ4WQzTnRB2@MB+_ zV8o^iC!J0_le!|}_{=0;OH|7s^$)jar*}rnbHzn0nl7{VCgqlH&zc2KACq-^uLQYy z_u~Ca1*9wSY*}b}`V9)KW{%u1q^F#1dK?**u=hpX%<;k zcp$*af-aFsyUg!`06)wrU2SO^8f=}@#K$rC6-{0pFaGT=TFIVhiR$+zA3DGete&GGWZpQjbquCzns$ivAmB8R~%QuzJok}HiBw-VBP5Z>$}NnF&&Fq$MnLf z#HuWOt+Uv=4cWK+5nGYJA6~P~J)K-0m`I#a*Vn{tnskcZ>&2EZr`FA9bG2vHk6uXO z#*qFK68(}5L5M9{&7HidTJ&6vAAA=V69XE)5KCD6LM7m#`ZfDtS5d@|aelB8{_3Pu zODsY~?W@53f4hKF1M{KaZ%SV;<0P8PG-BJhQw`qP%l&sbunv+<0O&*K7?r8{GZbsL zr2L5p5B?u%R~Z)78m$#U6ckiS8bwI~X=zXq>F!jzo1s%g5e22Bk?xU>8A?UEW2j+3 zdWHcchGy=@<5AB!dY}6|_uocj&z^68Ypr*^?^^3ae`&ayE&pU<{%)v2`630Ep{-@l z4sv9~pFhjK^MP7}-T6K9eu8B`Q;`IbfKs={lFC{U%}-UR!TS1c-fgItOBswtX_c|` zr)!>8Om|G1botN~fgrieZ0t|Qj<0aM`?w{zMS;+fzj1tpe*Mk1q*b@iVx)!E_jAOjnLwdVO^>vcXdXA|9zUFRbsf z=uwKiXUd-2^+!d(RF6}3yd3F4Z0ULYd)9fx@>Es(R*zN zo+5)B?!K1WdyFXVd5fs-jg1!X!JSSL?6#)IDQx!a%_v_*crRv64vAG4yKdzb-c?Y@ z2v2>f^WZOvG)Ut-XAltilUDtQThGsI)fsM%68rRT zlaArwg_sRJyL8Z}1DE;UPZhL9_$gV`{tMmtH0tPMqvhr_+;9rnJJ!Q#pFitdI)@7w ztx-wF!b`ADLxtT~)x;`*2NX`QjXD#aQc1bB!`8d^KtqxBj>79wkUCG9Fk8NOmCc>mY$FKW@DUhC0lJ>Wv8H08=W82T?V?|Hd>~mlAf_^r&M_@ zMbf^Arv~@l#FmIL-jv6kNZ1bRE;_iQO=J92g8q7c|JMzq4s4E~n1qas`KH|2PTK1E zQmMoC!MCE$&78OKzrBEScTOJ(ayv%`LW+_v0d)I2Kx+&s&bOL6aLbwOn~`I+-Dm~x zITrfzE^617Ycu<>Ci3MuAaZ?=qPH4mY;@NTc*bg()kVmGbBcwVKkX=q8~!Le*DHWli@1TCQ# zjGjgK9!v^S-s`@##}Ju&YkM_TpH0)vzHjQH8-Chx&MfqMpw$AuC2QT7Fr^Xkzv~cSl2tTO(=GP71ZHsU9$l>=9}F6| z*T@lzpXvv%$1dX^$blW~(REb;zN2?^Ydo_J!2lqI3Ve||=^Qge&@S)mRYJH4Pv3lR zm(aGNdw5SGos*6ETe{{Xg||ic;%0XQh(o0N_B6bj4G{^Bvw$IUVCjS>*Ud zt&Gw!Gu;Dny}zOdk50O2E(I{za;PCY{v?Ql&6Nb&X2QMx}252N5F4prPWfsJVMMUhD;$G|V+XFVBP z%c|=q%t5!l^uC`FNyA&atSn!eG5NByfS9NC?~UC=(${qGD#g-uRP7YMV;63ZUv>S%g8C<|yk z^=Y$87Yk^}TFb)2$Cn4W~GVZ4^EF*neRx9>}_g z7;Y-vEC&40Ae+qZr;=Pc=NUB4&(^-p)Mvz3lzzBzK;SjE=e#k|rKeV{MRuYp01QZ$ z0Rpfg6$bEJN5VZOr34Oursm==jER5K@}lkCk||~^K3k47HkTf%=&>@z3?y@Jfaa0Q zJTAD0t;{VXHzd=pU;ZWB{$sr$1Y5`vtr5pbPJmx~Gnm_}^AsrGkuA0%yIJ(`8c8~F z{O9dL=IQ)L!J9o*_xh924KNu|1}P~i53j>kMB6|tDjU#A&X1SAIXdYTZnW*8{5`@4 zWSZ7jX;DBz%7GHFix=Ral54*rZCA+a>?~fPR43PrbM1=jhS>nwXte#wos#6cpZ}hl z0*D=QE?s+zI>#tPqu)fzs|BDHN4`(mH<%wLXCQoL*Qm}+J&*twf62>P@2`OKA5l=U z4)^ZejWG;RTo(0_DgJN{yl`^~tTh~7=DYpiBq(GM(Ul(XRd#OxJwd`W_V=V1gaF6f zzWn|V^%G^qP4gFR+J-r(+h2l+UbELtG6rz&!SrySv&Z{F3aL5(+Adr5;CQ1i3bxW9 zE?5!G%DW(YRJazU+?Z^;(c`n|y1(8KUpyww@O{dFb{FTOk5rE<$gB%bh!ek?D#*O^ zzD4f_#yqD;Nc`_;@*{Rn-#zcPLV9BxWt^AkI$3lHZ~NwzsT4#r&-T2btN_5_au^lqk9SE{h>3{eV#aS+HOW<-*q=R@ z81ru@h&8pas!Y$!P%Ji4^5ZdAy*aOKk~c_%m1kw9OuzOMdN{ z;OC@YP*(>vk!4Ngbu1WHp#=U8SPrrdnlnG%-#@fop8E`8xa~(10nzr%k$aPfIPZGD zlBAJvZe}I9RRf`u)Y}?;T@hV{o@9sY&_w~6Xm-Be1OD+kzv}z<>Eb%6huZQu%cUi< zYTC);6QX}FR0ma@6XkylwVNzs9LoVR0l8>f{I(C$>~{6U%mK_2w{(*2k|M@yJ(u;o zP#>2mhYb@CfLIm_hyp0S>fVxz=ld*S?7d#Uw|9UPD_=OerW*DptvF2t#F>DZx^1li zu0uP{k1GNc+bZcn4h~LM%?$5P82-;I&&yXuj$I+!wc1P%XZnH%Doy3BHL43PcrE*R z9Rjpm`_|e`o{e;%!R?aQ@=N8pe0;}PTF7;?eu+Z!6My|8LDd-W*IqNvMHHL-<0Cwe zOixtS+6rIm+G=A#YgoZ^bij~)nru(9klALO;YM%AHjtU0b-~iYvkVfh7(IwR%;~i} zzU3NltZwAL;J`{}q?6+(Vl18@p#6&G*kBm2!46tOS$L8&0Rd-P~#<(Hny z!j-n1>W#LrGavJTSyUt)br?w%M1gR$bR?^XCPCay7p^b;SI;B$#&xKdRd^| zNnDo7<3En-+qRYTAYU42s1-X(f>g#ghd9KMGxho?C=?YLHA>U@_$O-!1Yxw8-j|i9MJtzPa{)q^vf6^82YHscXq!FQ@yt; zAjr7i5ko1_0X4=dbzjO)dI+#F&n0z!D=h-{ppov<;zFMBAa-M*rh(nB``uEv$Aln# z zCP~C6gdJkEhZD3mASa$Au6&jl+Fp(_ix1FQkG}x)o%)bzNB(Apz#lvf!`|JE-J(mEy(B*ngpE%_2lI^31!b1T zG3ANV^(M!co(o-$Hqm!s2O@NwOjpDq06HIqV3puNmt8v$R?H4c>U)?(AlT`d%+>9f 
zo=lU|ym8UYKtIjH35(Awp-iDY*FgvC>(^J!$<eWB)|i8B<6d&#n(1EqI~AR&diztJ@!>cO$|BV1-jGwrOjiyRqszZkc3_DYqjVFqAO&y9MzLzUGNOmdK(X+=9!+=?J7kOUINzI zYk4_n<2yO%VHj|srn21716c!lKOOxeM_E!gXUkSJ0cEZ2Wej-W>KQ)#jv5XJ2102r zwtml-e?%Py6z(^cPPUb;+<}3Hbr4rBEX+&~vAkUKAzayfqTN&+RGMw%m>&fRJi!e^ zV~eU`uO9Yaf42&>e3Z=EA4zIdC|XDJS^{F^!I-^8T3C`Bx7X$ZX5>b*KuWg%%IHAW&|M?E*pifF>c*#IT@h71v`)@#M0AOHJY zQWA^4gFtDr-y?x_`XJ%G57NfEn!zC<_P2#ATiL0|6Vsn#)T=Z~aVOsB>sC6HL+veA znKk)fhr>)S73HKumTrcJW@~gW9(e)Eao@48xYH@E8R8BOWu@u$-7e|OIn49PRA3mmu;8W}K z6+=HJO{zz+E)P@xuj(U1G>O^#kh;sQf%{0mTd-FbF0)>mGqo?c7Ud2E_I6?0SASfc zz(9SVlsGB)KU&#Oe^jOvNGhmn+5P(GKmB?&eVt?!Ks(UgedZ0I3+;sUVZ0A@m(k5_NbM_aoB?Wb338)jn%Q2SMCWuB^IHSW2h{9nXQ7&u?dJh2OMi_*)= zgt$FgBp$l+RYAyeH*c%9w0_dv){>nE)ry!0<}W>L#@U;Lcm+IU9c+z~IcWb9pw+pE z=!(yfCTRBq8F1N`*txwv&oeo9;zW{(4t{;qpU{qBf*?6U3@GB~e$ex&rQIi!KHie9 z4#?UjnE|uXix!f=vYH|i*iMQyrcNSkl_?m?`_&f0#l)m_5@Fe44sRYs?g$1Q2=KTK z;qy3m(41fyuZHCYjmoJ?N>R)(VuKXUsM)l)#YCFcH_*|dglUX~#nPb`s12d~16;imB$1Ck_ zk#)qDP3Q>}vb$2;isQBHsX;~1dZblYsp;=e4K+JZV|Hl=N;FB9fqE*9d>y5@BS^nt zwe`nD9?xk-7q1YiflOCwh+miMPTlyBHhB-Mb2VpgW$%bu7%+-bi6?c?aqa-4i6nucA z2%Z%-lyf;6!MyE?cJ`ITrnoo(Go*lRKd$)08c5^t7ivWTGaqgO+QfN|w^(Qz`T_Co#yEM5mDt`egRJ4voR2BEks#{JXJhb%+248&aXalNCyVj zjF}JewOk>!TWC)!&Rc7ilI_j)KxU7!l5w&HrO(?6w!(^yzGhHuwFr#DUvpDTL{U1j z;l^>0B#5C5R^MyzW&bC;qNtQ?RgzL`JeKw&xFC*qQgzEZRC62B7_k3@=0(Kj-ZbDP zUi;37-|wp5$|Lsee$5yLcLlV*3iNS0RLJE$U%oNiP#*>KN5$R6BYYOkK2;x$hYLOT zldd8pO8RbF)7X;%+TFw0)~`EVZ~b@)KaTF)lz(z}EL`n%j1@4^XkMo9dPYD#3&(h- z@wLmvyU!m|-+XebUHmEKL4X7w!Rk}n1wYSZ4K24yP#{$-5HW`y8u=9HZJ9(z*EP#hT zX8qwZ{$t)wM7R%~U5WZ?Z~i{I#t;Q1;YSYNq(}uzclr-b9hKJQEZ#&8*Jm=-A3YZU zURfao)OQAC431)ud39heQ9Q!F)-_UP*lz}(HA>1sQj zOCD(Q+iJ*k2JGDv+^+8u{~Wo&47|ugG1muwD2lRVnMN%ZcXg>#;P39Sd1J2~G&LNB zDKa#^-Zay(in{*cUIiEC8ML1E-KaL=Xvk}7aPVHcg@;)7A=EolWhswI!Hi5=C1nZFy0 zePUGMH=LXyapQHSk?p#TYON%1wq}aJ6}J-_Vq< z!;kHZ173RP$fg+9VRb|~-!EyCZ|+rt#a9?Brh(tof1~DIf!)BQ&U`Ti zeSh8}VmP9gT%=b*&?TzAJ92|{y2@h^(-rp;71P@v-9ji%QcK^K95RVQM^(c z#9Ru)kMpD7yeT*n_Nu_ z8iqu9g#R$dvr5wGV!L;()mIEgZM&^w55vjQCxLKHW-$i*W+!aZ;`aeL#`|5_5^+lmp zS6iEk9P<3$z9+>= zg&Q5iNkgmQQo>h2&^DM$bHa%n?a(yS?E4nUmyGI9a?|^(y zkMoWz;JcG+X|@00^(ZMuA=p4BsKCdr+t^=Sra@*;SJzOkF~_h>8mGqL*Bjfl8&}|? zi)xzrG&zl+2g}dDt}S)Qa}8|uk}u!at9uHcP@XbM^BwR-~bsZX=Jx)q@&-713X6l=$SBnhcA4J0j?GB&k?mR_)S-@)828@ z;4jqdj}ifE#uF9CBqtZNuXT?cwy=|4p-K0zhb-3>3U{`)vd5ptSKRa+;07Wb|`KM5LKI=^O%?82jRX|41=Y);xj9=KMxf+`Ff)2oQ+!UsQO zv(KYY#qG0U`ARQsysbMF4(g`Fgh$uTl%U_u>|9i76(ZeEc~?wrNP_?wMN^zZ@xV7JXjev zzOB#yXz0lM%ex6a*Qsi~!;g*_A%9XsVBhz5*WmoiO@05dBdDoBqRL8JBcg;q$g6Ou zL{+n!YPf&H__&RKUwMH;Ye&f*Ej-e$gx0s2DT|d;RtAlv@L^87+rbp7x7V!csSlP? 
zkPa5JJ1Zkb7m-7Ki4xeV$h4B;7o1iFmzhpE)+45{E+V@Dw>YJ?6{)qX2h!ult89v> zhOZBOhU!c04Ut&u?GLV5kV7+#N5Y5$4r>)@x$G>AqmRP)<)_#h8X5}JdDR5pI$qS_ zT`!(>Z7|9)iZ`~iSU+H8>C;9}2n>FiZ?tSoMqfV{^(E(=j>hl#&!7#)OSH3ZS+Ph+ znlL_kdHE5}Fl*0I(MvLdz&K^71T;LM4Kl%dY6cznc+x?&)kVPoEUyT1DdMV+4 zQ1HGg5iR$*qknZ~Ms+%mJ&D2Hk+=MC#9z#1c!FZ~A+am+$wYp)Xcd7CdnB@!NVj&O zJUrJD&v3G&)}s5MM||kLk)ol!+Kg}MG`>B&RHir))?0w|tuCSDKlF(Hl9L7`!#VMY zV-0nMy3|&B=vc{SM8X~->yf>spFocRz#LdQw8sA70iWWP1;hRh%;p?@b!K}72AQk< zwZ&6O$FU=h;|^S(>wFv|T>D21VwGK|5f$dyRl2q=ZThTsMHPZtwM)Sk`!4;Fbp;Sc z8NcdR6ia}kn^^V^#yN7`UI-&Y@ddPVVXjwEB=QMwqqGE<9iR$w)Bz&yJ)7WwTc~^Z zD7t_|=r+CoexrfCQ@RsQS2|ZRN#(!tjqg9EC!g0jykHw^&SxeWzb6S_15}f8Fb&&`2g%34kbA2Y zdG+nln(CEJ+HpSKe5aD~>NILEa4Pqr_ZGdwn!dzY_;}~HJux}rFrGSA=A4pYD2={; z{yx0|o0=b6JZBKUZPuP~S_g$zowu^%&pM{RzTX|!Aa<~adZCu3znFGi0{}f38*2-< zD<*P0rE|%}>B_TY0}H%T2yD{YJprh(5~}s_Zg6t*I&{30JXFkc_igZ*Gjp@EZTxB4 z^&ern;J%JKt)}hkGH)Nyp5v78(@= zg$UqSot6vxcG#;$n$L)7#c~Q_Kh8Udm$KOirAB3Gl&jpeaxzin$`=29hCr&8{yH*@ zYLB$cgINfkQWV^yCz#BnY2z_D?p zM{Ij{w>kw@ZKbCTj-bwwd$fl=)6;)RE zxw;&88xN0jp8{E|m~f@~CX;(guj5h*EMErZ(Q`CCeagpYVe;mQoJij5-*Vj1I^AkC zfLiPBA8lQA88QDui~?S1&Ti#>2zD*^Kv3A2-y^@6#nhOlpfn75|rWt2VBfob{t)6WLkO z?rLVV(E(e$Do^vPSFnX)iwiEltJ^5U6lm6m`}yxo9Tlk7mz6ubY&+*8a_pHwkd)Nn z#ZCSDr=9Nzl38eO&f5(%-OHpReyG1wSYi-Je~oduQZcM_1n)dYILC5#Ih5 z+;S!6&U0DT4UBN*o6`9BkYPQ(5rRJa{pOV?w?e4JQz2^1FeRs`=$xvos90WI>?WvHswJ8eV}3e#(|m84N8q1$F(?>!KymV;f>qYH zL^%01P>3IGrzM<-pXM#yE}NgYKOWuD!Ic(Y061Vxg2KGHIkN+|ZX3-CpS9DL!(#p1 z7!Q*`Q~u?eCnPpGFM}C|>Jf&VGLOmk~QT2f1xh)`lf0zkU>VB-`5-Wf`g- z&29iix&2ix5yZn7MY&i~^JL%LLiECoYRNz_;Zx#bcY49q4)|E`t05lN{4+Gs)`SJ0 zg>dEcn((lM?AAD}tU}V&M3lLFo7z)ZrNk`FSvjRfbQWj*hM(O=hmV7Se0zA_aDq*= zBICH)EmXj6?A_&i4izFk((xXNo7_s7WcE>&u~_(th^;J3GZ z-=^F_b8~#OqtOfq6E10c@uEJPV}fF&e7F>akCGeyiJ7La5PGgoaSSs{8e`Jj_UPWl z9(x~l(%ZDGQEL`@KVb{i*1%4|-K48u@ExGcnXDR&8CPjd7|}0;vSu=k2O-gP>Vb`5 z@x=Ov^gW5-w;?iS=BTTExk;=;=0Z z*>xL3?_O&@jkMhNhHIwG&(6kXii?y0wBZzg>X~|$7Mz){uWk665AWXnyGZW}*)BJy z^vi&C?Ur51l(sIF9qHo@kKQAW)+=aI!M|p*0GNXYsdcg235yioK+y99?TD$CD#Z+uFt1dy7yEwI0(!nPQfWMrY%~hZU_MSm zg*M~wON2pI`5|FnDx8+(;Nt1IOp)|Cp@`xkd36gLj8AV~wJyq~;CvHtO;uQ;KwdUm z>^)LPi#~nSW4#1%y<}KZt@y&sM;ZRdU8IH{veXBuBt~f^-TENw4|xY@Vw}&4i8kW6 z>x08^+4I(CYDbB2wm318OG?Wmqe)TkJj+*4{aq$0rB_-Os+H3*Opt zgYPWKBOl`&gC^Z##eDo<~XJ4SvXxC62ad*e6FSf;K%Ui7E$O6JrtGYBX*d`#an8ySH+rz z51xp!d$MfKt?UO>0m)t4PFhT|U2gr};vN)}tB{nNx1`hUy-3LmK`0)W5|0lrbQ?QZ z%wfMkZ+rv%Pi+B-5mdY=<@lE!{kwReNg5Xk%P;wqMXF0!p(iwt)tyufZz0KNKK2@& zl5o4zL)6nKxa=$fhPtn>p;f&`y4Os!3X-8cTnAn+r}mcE{q|RN7RTSdyVmS6Y)MzI zu1Gb~gW`Er`!*2rVmoa0PXsRKwZ1$fMW3_vl}kvVGot%(iA3VzYVOE&(MGR(#d_5( zw@9fo^Zf9;g5MLS9AP$Gx?a9|C6@l=lbMgy=r9WFk}3gz8ar-*ep)OiFJC=4Wd?SH zJ1(_pZ>>)!Y1G@8x6mtBtX(Eua?JhYb-W#E&Y@##c#zw!kDbZIcKQoVaNcOT;7#Fu z)$Q;7^{@3YkZIB*#e5BO=wDhyz?e7V@-)l$f%M(gUAGB4d)`+wDo`c0xEl4lGOi8taw%#?9)Cp7jzmgTV_Y1{XY_C! 
zYz6Im?Z)2XarQ2lJ{N!MCXkeicD=R!Qul_(7gd+OwAt1s^uJ(wPf%b=$P`v1@GFg5 zkbbIpP}};XwPDW&2O;cm)Oz6cY-pkFFqN1JffyML*J0?Z#1Vz9tT|F$shuF=SFm|Q z&H+U~E>{85?DDTWo3A4y<2Q9ip7Q&MbBar8GJYr{BHM?{IlJy!K|kCLjv=ezT4`Zr z%@?Ku8O*$E-dJB}Sur1CzZUfN?bXnyPs1skdHixT8-i0<~I0!sS=aTgk8OTxk57G?FB2K))P)0K0?ZYpm46-`THUs3p zPc}~MAQTvSVEmA!;HTJM^GeeDX~nWh0sP%X?tHy+F6poN3k?6;*5W4!$)944uS3wtmn6BS7wNiL!hDq<*6!Pc`rPn%wz8_s2*DDe#Ih3YhD^en9?( zl9db8_qW45rvHZ`&4eg6LVafDdo!kTw1Hr~T#2 zRN`M%%B38bLh*B2U-kdTkJ3WIaO<~M$F?YhF|(|X_kL>PeH#|6~>#Cx2#^~|HH;}!r%dfOUXNBB`rK< z;_W5hcfNjpd^6k2dJz9YrieOYWl^^zi=G3m7XiE7t6czxxHS+%S+|Pp*`Gsm*Jb$; zS~5p6->}`TEAH{LHJvqvxE*?C_^l+hNZ78tPkKmK89-{7b-MF44=Fb=xCJs-#ec*MR= z7ZVU|ltcO(?0Gqijv*zhQ>x{SdO1c5N5>Q3%HEqda+X3U5u^o>H2<$De(3ZG?}=@F zzI@BZMj`uD8_mc*Y5PPZz$XhE4#_$C?Ped3H|W1a4zp=YXt*c>Z@Og9VCjczP+7(0 z;TfD?U3k4#GsvsFp(^2_rlD={=-hinMHMc>74 zK3z4MP(|D^u#^M-si-`78x!ch*etV2PZ|=2V7^A9n3tNuh{m|hTJo|oaoo|ne+hz z-;L1{OjjnTK=$K!EvI#V#jD4UiRXnKXxH;poIZcG(PTrd(Y`|XlnY)Wr^_PLPs`O# z^J6)9RMi;>I2NAuOO!v_LR1p*#08o+c1gIsh^~oqXq9KsY49T~kfi?A*P)3^_bt>? z-O(WyQk=Hr#}|i9Ra+U~M6SYC_QcBNCh9IZIj zqG*6_?nvTxi<-ODpaJc8YOXQNPWAEQj5;@~;t5OfLU7}etQf>FE3@^=7Mi$lD98NX zFxU|8B}SdC4Di^?J{DV7|0V!f)Nx=Ml;GX1tA3Y{G)kP0Jk076_ExhC8B?8e8ov(Z zgDDp4nrNKj!|sZ1_I&huv?OV0V$l;&7oQ;GHRgQb@B2ssoMG=vsCF-uPg8847-V#N zX~YQQvD0JE-#Yh>{*D{89`R0**OFGEylG{6;=pR@VQaNG#Ldb&&6T&YMQ^@`8mzT- ze2?mI_r4FJ7dGL0&MA%4YD&elJL1SWT6~D4x4|^6+-0D;F+Cp;76;gW3X4;hV}es< z=x>?CX|pa#7~ayEZep{ZnCuGvi^@0&!33o5Ejj|Ai>U^z4ms$C;T}PHoofbPt?|iL z4V7u{t!b!)J`{jvw`8f`+iy158c;5**(50wvL@^m&^(JI>D3kR3 zOL?8XfQQ_6SI-u(^yH3@B>(_OFtbc@p_DJhLC%|jNb-FQoQ?Q3SVrX?ilXYVoLs@H z&$QZ}R_{ecm5Os?tQ*E^xfY@TbTNlU#}?KC5HLHT#V91?&^4u;C@b~h;Ooq<79RfZ zC*jK3Ev4gPo_Ca{jL6Ko=W=6=e&UYo9m)aMzuA@aF2Wnxt*oC4pCa)L?C>YGY4`4@ zNfX2D`EwGQ>McvY;(KXn%?6fiVDj>6D*W{P*Eyt_9aVPZBIgP^f4s?4K?Ce2iqr3n z7VC7Hcb>_;tF4<^SjcH(+uP|y^p=XNl@Kb^t#1^WjJw|wPO5)3TtZfn8$){%+7@+r zuKaiy1%iqK&lIXXP-QFm47{&LHemOYlIDg@lr2=VJ}KWOTiy7Y5opEr$j!5W^CEq! zExBLG3!$I2RVp^P<4-Z{{C?S)=SXijArCI&4S%8e#hw-=8JfDnh2LD6hq~qrpwSgw zq>H7&0*BoZ-i2B0mFYIg|D5O9^Bk~l*AG34KcQsViC>IKT|SqtALM2$QOLKxRN~CoOEMqbD;`39Vk@7;{Qmib8*tCvGr|>#NsF8UYl+yN)ht=V6#m0K8M>vhL zpv>w}M3!D`yuye!5v9yeYVQ$oUQti_Za9w$o_? 
[... base85-encoded GIT binary patch payload elided; the encoded blob carries no human-readable content ...]
z?uuZnHz{*>(a=kr^!DPjde86*v>Qv$@gx{CV_jX`L83kTP-m3HV_qX2 z=%fUUd})IYTvy7t+6y%?E9S%PyzTGZmgM5FZc^T|>f)bMN?%F1z77iEBJ0-_g&S;N zvt!hVyso-2ijrm}e-88?``r3Wm6Rf!mQ4LqS2ALM#N)2wQNh&hHfs+frCzAM5EAuj z)9rm(dsO&TCvpH~jhTN0TbAuD*LVeboJy&rs3=qVydm$Uwu~bJ-Ilx~r|%?`vn!Xhp0s z8$^5k8e2EN%}pb{)I?sr&X}80tNiiQOhFvcqcJ>YhiVz3g_?H~d7nw|U3aRe^ksQh z;k)tO^iXq6>eH&5-^%SPaYbhp>bO`ijd@!`%JPsZ|s;>!&Oa=Hxm_ zsy~sUiL;Zuyzd-aZQX9&Ro!p{e^#uW&*nN^MWT7J@UBOAqBoJsr=%#oPjX58otFMxem^wkO$w_3Zcj3KhNgSr0$PQ# zf~=1AJ0hMl$fZW8IRVCLbk#L7XX*j3ZyrkD71CEoSru(x_Pv?7TV~+sC4N5iKf*-6 z(JY2O$|1ow`Ck^&mH}M|pSOQ_;vjQZXtfGF>5S7&U7gc^jA{M*Ly!@F@9ck#_?MR$ zA8v)La{nV7A3er&vootVX#MH+a}s;ykaW?vM+I$_Ceg|(>RZ)FpZzN z|3Ck+Cfe>1gJ0!XzW0A#0ZxqAXLzNiXVqDA*h?5t+0K(zxiNK~ogDYPFA?Hkh(6zc zVq|()GSMHb)|s}h`#pGY&~kVI)+S^RGQ{?G`nJ1PkZrJ&QB@+q!GHvGA^Tzv%it=U z|D7nyj>i3<&ku6l(2oA+A7S`Aoh^NlBv_f!P^xg-p}t~K;%e%1`TT8HN}!)%<36~o VbJ;o8^#c4+zNN14PR=ag{{qS|ao+#{ literal 0 HcmV?d00001 From 3cee7a391c6d286e881d8c8c248fe7377a52b35c Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Sat, 16 Aug 2025 13:16:34 +0800 Subject: [PATCH 077/231] [XPU]avoid circular import during XPU init (#23017) Signed-off-by: Kunshang Ji Signed-off-by: Duncan Moss --- vllm/platforms/xpu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 66ebc8ad9d22..af24437f649f 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -7,7 +7,6 @@ import torch import vllm.envs as envs -from vllm.config import CUDAGraphMode from vllm.logger import init_logger from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS @@ -105,6 +104,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: and not cls.device_support_bf16(): model_config.dtype = torch.float16 + # lazy import to avoid circular import + from vllm.config import CUDAGraphMode compilation_config = vllm_config.compilation_config if compilation_config.cudagraph_mode is None or \ compilation_config.cudagraph_mode.max_cudagraph_mode() \ From 28860732233c2af04bccb7c0640f4ed5493a0ec4 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Sat, 16 Aug 2025 01:36:27 -0400 Subject: [PATCH 078/231] [Build] Env var to disable sccache (#22968) Signed-off-by: Lucas Wilkinson Signed-off-by: Duncan Moss --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 919300e143c1..cc3037ebb72c 100644 --- a/setup.py +++ b/setup.py @@ -60,7 +60,8 @@ def load_module_from_path(module_name, path): def is_sccache_available() -> bool: - return which("sccache") is not None + return which("sccache") is not None and \ + not bool(int(os.getenv("VLLM_DISABLE_SCCACHE", "0"))) def is_ccache_available() -> bool: From 1935c3447aefc867f1101992d7e43e16917bfc41 Mon Sep 17 00:00:00 2001 From: Andrew Sansom Date: Sat, 16 Aug 2025 01:25:10 -0500 Subject: [PATCH 079/231] [BugFix] Add support for loading prompt embeds tensors serialized on unavailable devices and sparse tensors (#22962) Signed-off-by: Andrew Sansom Signed-off-by: Duncan Moss --- .../openai/test_prompt_validation.py | 49 +++++++++++++++++++ vllm/entrypoints/openai/serving_engine.py | 6 ++- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index e31a1d077608..4197583074df 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -1,10 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project +import io + # imports for guided decoding tests import openai +import pybase64 import pytest import regex as re +import torch + +from vllm.entrypoints.openai.serving_engine import OpenAIServing from ...utils import RemoteOpenAIServer @@ -42,3 +48,46 @@ async def test_out_of_vocab_token_ids(): prompt=[999999], max_tokens=5, temperature=0.0) + + +@pytest.mark.parametrize("dtype", + [torch.float32, torch.bfloat16, torch.float16]) +@pytest.mark.parametrize( + "layout", + [torch.strided, torch.sparse_coo, torch.sparse_csc, torch.sparse_csr]) +@pytest.mark.parametrize("seq_len", [2, 10]) +@pytest.mark.parametrize("hidden_size", [2, 10]) +def test_load_prompt_embeds(dtype: torch.dtype, layout: torch.layout, + seq_len: int, hidden_size: int): + # construct arbitrary tensors of various dtypes, layouts, and sizes. + # We need to check against different layouts to make sure that if a user + # uses sparse tensors to reduce the transmission size of prompt embeddings, + # we must cast them to dense/strided before passing them into the engine. + # We don't use non-CPU tensors in this test to avoid preemptively + # initializing cuda and break other tests in the suite that fork processes. + # We also need to make sure that we only use devices that are actually + # available in the environment the test is running on. For simplicity, + # we just test against CPU. + tensor = torch.randn((seq_len, hidden_size), dtype=dtype) + if layout == torch.strided: + tensor = tensor.contiguous() + elif layout == torch.sparse_coo: + tensor = tensor.to_sparse_coo() + elif layout == torch.sparse_csc: + tensor = tensor.to_sparse_csc() + elif layout == torch.sparse_csr: + tensor = tensor.to_sparse_csr() + + buffer = io.BytesIO() + torch.save(tensor, buffer) + buffer.seek(0) + encoded_tensor = pybase64.b64encode(buffer.getvalue()) + + loaded_prompt_embeds = OpenAIServing._load_prompt_embeds(encoded_tensor) + assert len(loaded_prompt_embeds) == 1 + loaded_tensor = loaded_prompt_embeds[0]["prompt_embeds"] + assert loaded_tensor.device.type == "cpu" + assert loaded_tensor.layout == torch.strided + torch.testing.assert_close(loaded_tensor, + tensor.to("cpu").to_dense(), + equal_nan=True) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index d6f92a63301e..0f4a7c0186b6 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1006,8 +1006,8 @@ async def _generate_with_builtin_tools( # OPTIMIZATION priority = orig_priority - 1 + @staticmethod def _load_prompt_embeds( - self, prompt_embeds: Optional[Union[bytes, list[bytes]]], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None ) -> list[EmbedsPrompt]: @@ -1015,12 +1015,14 @@ def _load_prompt_embeds( def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt: tensor = torch.load(io.BytesIO( pybase64.b64decode(embed, validate=True)), - weights_only=True) + weights_only=True, + map_location=torch.device("cpu")) assert isinstance(tensor, torch.Tensor) and tensor.dtype in ( torch.float32, torch.bfloat16, torch.float16, ) + tensor = tensor.to_dense() if tensor.dim() > 2: tensor = tensor.squeeze(0) assert tensor.dim() == 2 From 8c8aaf1efc975586a1b261849571dac58443ef97 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 16 Aug 2025 15:26:10 +0800 Subject: [PATCH 080/231] [Misc] Add --save-dir option to benchmark_moe (#23020) Signed-off-by: Jee Jee Li Signed-off-by: Duncan Moss --- benchmarks/kernels/benchmark_moe.py | 9 ++++++++- 1 file 
changed, 8 insertions(+), 1 deletion(-) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 13bf1be836f6..b4a03665ef10 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -3,6 +3,7 @@ import argparse import json +import os import time from contextlib import nullcontext from datetime import datetime @@ -542,6 +543,7 @@ def save_configs( use_fp8_w8a8: bool, use_int8_w8a16: bool, block_quant_shape: list[int], + save_dir: str, ) -> None: dtype_str = get_config_dtype_str( dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 @@ -552,7 +554,8 @@ def save_configs( filename = get_config_file_name( num_experts, shard_intermediate_size // 2, dtype_str, block_quant_shape ) - + os.makedirs(save_dir, exist_ok=True) + filename = os.path.join(save_dir, filename) print(f"Writing best config to {filename}...") with open(filename, "w") as f: json.dump(configs, f, indent=4) @@ -707,6 +710,7 @@ def _distribute(method: str, inputs: list[Any]) -> list[Any]: use_fp8_w8a8, use_int8_w8a16, block_quant_shape, + args.save_dir, ) end = time.time() print(f"Tuning took {end - start:.2f} seconds") @@ -748,6 +752,9 @@ def _distribute(method: str, inputs: list[Any]) -> list[Any]: "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto" ) parser.add_argument("--use-deep-gemm", action="store_true") + parser.add_argument( + "--save-dir", type=str, default="./", help="Directory to save tuned results" + ) parser.add_argument("--seed", type=int, default=0) parser.add_argument("--batch-size", type=int, nargs="+", required=False) parser.add_argument("--tune", action="store_true") From dc8809123b792e5bba2ea941d3b6417aaf48d6f2 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 16 Aug 2025 15:44:50 +0800 Subject: [PATCH 081/231] [Multimodal] Update Tensor schema test to cover arbitrary shape mm inputs (#22867) Signed-off-by: Isotr0py Signed-off-by: Duncan Moss --- tests/models/multimodal/test_tensor_schema.py | 143 +++++++++++++++--- vllm/model_executor/models/keye.py | 22 ++- 2 files changed, 138 insertions(+), 27 deletions(-) diff --git a/tests/models/multimodal/test_tensor_schema.py b/tests/models/multimodal/test_tensor_schema.py index 92390d8c2f7e..036624431c20 100644 --- a/tests/models/multimodal/test_tensor_schema.py +++ b/tests/models/multimodal/test_tensor_schema.py @@ -1,17 +1,26 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterable from functools import partial +from typing import Any, Union from unittest.mock import patch +import numpy as np import pytest +from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk, + UserMessage) +from mistral_common.protocol.instruct.request import ChatCompletionRequest +from PIL import Image from vllm.config import ModelConfig from vllm.engine.llm_engine import LLMEngine as V0LLMEngine from vllm.inputs import InputProcessingContext -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, + MultiModalKwargs) from vllm.multimodal.processing import BaseMultiModalProcessor +from vllm.multimodal.utils import group_mm_kwargs_by_modality from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config -from vllm.utils import GiB_bytes, set_default_torch_num_threads +from vllm.utils import GiB_bytes, is_list_of, set_default_torch_num_threads from vllm.v1.core.kv_cache_utils import 
get_kv_cache_config from vllm.v1.engine.core import EngineCore as V1EngineCore @@ -23,12 +32,64 @@ "MolmoForCausalLM": "incompatible requirements", "MiniMaxVL01ForConditionalGeneration": "broken model", } +ARCH_NEEDS_EXTRAS = [ + "InternVLChatModel", + "Idefics3ForConditionalGeneration", + "LlavaForConditionalGeneration", + "MiniCPMV", + "PaliGemmaForConditionalGeneration", +] +REPO_ID_TO_SKIP = {"nm-testing/pixtral-12b-FP8-dynamic": "duplicated test"} + +ImageInput = list[Image.Image] +VideoInput = Union[list[Image.Image], list[np.ndarray], + list[tuple[np.ndarray, dict[str, Any]]]] +AudioInput = list[tuple[np.ndarray, int]] + + +def _resize_data(_data: Union[Image.Image, np.ndarray], + size_factor: float) -> Union[Image.Image, np.ndarray]: + assert size_factor <= 1, "Size factor must be less than 1" + # Image input + if isinstance(_data, Image.Image): + W, H = _data.width, _data.height + W, H = map(lambda x: int(x * size_factor), (W, H)) + return _data.resize((W, H)) + # Video input with PIL Images + elif is_list_of(_data, Image.Image): + W, H = next(iter(_data)).width, next(iter(_data)).height + T = len(_data) + T, W, H = map(lambda x: max(int(x * size_factor), 1), (T, W, H)) + return [d.resize((W, H)) for d in _data[:T]] + # Video input with numpy arrays + elif isinstance(_data, np.ndarray) and _data.ndim >= 4: + T, H, W, C = _data.shape[-4:] + T, H, W = map(lambda x: max(int(x * size_factor), 1), (T, H, W)) + return _data[..., :T, :H, :W, :C] + # Audio input + elif isinstance(_data, np.ndarray) and _data.ndim == 1: + return _data[:int(len(_data) * size_factor)] + raise AssertionError("This line should be unreachable.") + + +def resize_mm_data( + data: Union[ImageInput, VideoInput, AudioInput], + size_factors: tuple[float, + ...]) -> Union[ImageInput, VideoInput, AudioInput]: + size_factors = size_factors[:len(data)] + if is_list_of(data, (Image.Image, np.ndarray, list)): + return [_resize_data(d, s) for d, s in zip(data, size_factors)] + elif is_list_of(data, tuple): + return [(_resize_data(d, s), meta) + for (d, meta), s in zip(data, size_factors)] + raise ValueError("Unsupported multimodal data type.") def create_batched_mm_kwargs( model_config: ModelConfig, processor: BaseMultiModalProcessor, -) -> MultiModalKwargs: + size_factors: tuple[float, ...] 
= (1.0, 0.5, 0.25), +) -> Iterable[tuple[str, int, BatchedTensorInputs]]: processing_info = processor.info dummy_inputs = processor.dummy_inputs supported_mm_limits = processing_info.get_supported_mm_limits() @@ -40,30 +101,69 @@ def create_batched_mm_kwargs( seq_len=model_config.max_model_len, mm_counts=mm_counts, ) + mm_data = processor_inputs.mm_data + resized_mm_data = { + modality: resize_mm_data(data, size_factors) + for modality, data in mm_data.items() + } + # Mistral chat outputs tokens directly, rather than text prompts + if model_config.tokenizer_mode == "mistral": + images = resized_mm_data.get("image", []) + request = ChatCompletionRequest(messages=[ + UserMessage(content=[ + TextChunk(text=""), + *(ImageChunk(image=image) for image in images), + ]), + ]) + tokenizer = processing_info.get_tokenizer() + res = tokenizer.mistral.encode_chat_completion(request) + prompt = res.tokens + else: + prompt = processor_inputs.prompt mm_kwargs = processor.apply( - prompt=processor_inputs.prompt, - mm_data=processor_inputs.mm_data, + prompt=prompt, + mm_data=resized_mm_data, hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, tokenization_kwargs=processor_inputs.tokenization_kwargs, )["mm_kwargs"] - mm_kwargs = MultiModalKwargs.batch([mm_kwargs]) - return mm_kwargs + items = [ + item for modality in supported_mm_limits + for item in mm_kwargs.get_items(modality) + ] + return group_mm_kwargs_by_modality(items) + + +def get_model_id_to_test( + model_arch_list: Iterable[str]) -> list[tuple[str, str]]: + filtered_results = [] + for model_arch in model_arch_list: + model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) + if model_info.extras and model_arch in ARCH_NEEDS_EXTRAS: + available_repos = list( + map(lambda model_id: (model_arch, model_id), + [model_info.default, *model_info.extras.values()])) + filtered_results.extend(available_repos) + else: + filtered_results.append((model_arch, model_info.default)) + return filtered_results @pytest.mark.core_model -@pytest.mark.parametrize("model_arch", list(_MULTIMODAL_EXAMPLE_MODELS.keys())) -def test_model_tensor_schema(model_arch: str, vllm_runner: type[VllmRunner], - monkeypatch): +@pytest.mark.parametrize( + "model_arch, model_id", + get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys())) +def test_model_tensor_schema(model_arch: str, model_id: str, + vllm_runner: type[VllmRunner], monkeypatch): if model_arch in ARCH_TO_SKIP: pytest.skip(f"Skipping {model_arch} due to {ARCH_TO_SKIP[model_arch]}") + if model_id in REPO_ID_TO_SKIP: + pytest.skip(f"Skipping {model_id} due to {REPO_ID_TO_SKIP[model_id]}") model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip", check_max_version=False) - model_id = model_info.default - hf_overrides_fn = partial(dummy_hf_overrides, model_arch=model_arch, exist_overrides=model_info.hf_overrides) @@ -119,6 +219,7 @@ def _initialize_kv_caches_v1(self, vllm_config): if model_info.v0_only: m.setenv("VLLM_USE_V1", "0") + # TODO(Isotr0py): Can we avoid initializing engine? 
with ( set_default_torch_num_threads(1), vllm_runner( @@ -145,12 +246,16 @@ def _initialize_kv_caches_v1(self, vllm_config): mm_registry = llm_engine.input_preprocessor.mm_registry processor = mm_registry.create_processor(model_config) - mm_kwargs = create_batched_mm_kwargs(model_config, processor) - def validate_model_input(model): - for modality in ("audio", "image", "video"): - method_name = f"_parse_and_validate_{modality}_input" - if hasattr(model, method_name): - getattr(model, method_name)(**mm_kwargs) + def validate_model_input(model, modality: str, + mm_kwargs: MultiModalKwargs): + method_name = f"_parse_and_validate_{modality}_input" + if hasattr(model, method_name): + getattr(model, method_name)(**mm_kwargs) - vllm_model.apply_model(validate_model_input) \ No newline at end of file + for modality, _, mm_kwargs in create_batched_mm_kwargs( + model_config, processor): + valid_func = partial(validate_model_input, + modality=modality, + mm_kwargs=mm_kwargs) + vllm_model.apply_model(valid_func) diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 40c66c226850..db9ed5910d78 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -30,7 +30,7 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors from vllm.multimodal.inputs import (ImageItem, ModalityData, MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargs, VideoItem) @@ -44,6 +44,7 @@ from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope +from vllm.utils import is_list_of from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, @@ -112,8 +113,9 @@ class KeyeImagePixelInputs(TensorSchema): - g: Grid dimensions (3 for t, h, w) """ type: Literal["pixel_values"] - pixel_values: Annotated[torch.Tensor, - TensorShape("b", "np", 3, "ps", "ps")] + pixel_values: Annotated[ + torch.Tensor, + TensorShape("b", "np", 3, "ps", "ps", dynamic_dims={"np"})] image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)] @@ -145,8 +147,9 @@ class KeyeVideoPixelInputs(TensorSchema): - g: Grid dimensions (3 for t, h, w) """ type: Literal["pixel_values_videos"] - pixel_values_videos: Annotated[torch.Tensor, - TensorShape("b", "np", 3, "ps", "ps")] + pixel_values_videos: Annotated[ + torch.Tensor, + TensorShape("b", "np", 3, "ps", "ps", dynamic_dims={"np"})] video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", 3)] @@ -1295,7 +1298,7 @@ def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): return None return quant_config - def _validate_and_reshape_mm_tensor(self, mm_input: object, + def _validate_and_reshape_mm_tensor(self, mm_input: NestedTensors, name: str) -> torch.Tensor: if not isinstance(mm_input, (torch.Tensor, list)): raise ValueError(f"Incorrect type of {name}. 
" @@ -1310,8 +1313,11 @@ def _validate_and_reshape_mm_tensor(self, mm_input: object, f"Got ndim: {mm_input.ndim} " f"(shape={mm_input.shape})") return torch.concat(list(mm_input)) - else: - return torch.concat(mm_input) + elif is_list_of(mm_input, torch.Tensor): + if all(p.dim() == 4 for p in mm_input) or all(p.dim() == 2 + for p in mm_input): + return mm_input + return torch.concat(list(mm_input)) def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[KeyeImageInputs]: From 28234fe70b176381b3d9d1de6ca44e303ec20168 Mon Sep 17 00:00:00 2001 From: Chengji Yao Date: Sat, 16 Aug 2025 00:46:00 -0700 Subject: [PATCH 082/231] [Core] Make cudagraph check cuda platform only (#23005) Signed-off-by: Chengji Yao Signed-off-by: Chengji Yao Co-authored-by: Chengji Yao Co-authored-by: Li, Jiang Signed-off-by: Duncan Moss --- vllm/config/__init__.py | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 280ae60c91ff..72fec5e205e3 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3535,15 +3535,6 @@ def __post_init__(self): # in V0 means the compilation level wins out. self.compilation_config.level = CompilationLevel.NO_COMPILATION - # if cudagraph_mode is not explicitly set by users, set default value - if self.compilation_config.cudagraph_mode is None: - if envs.VLLM_USE_V1 and self.compilation_config.level \ - == CompilationLevel.PIECEWISE: - self.compilation_config.cudagraph_mode = \ - CUDAGraphMode.PIECEWISE - else: - self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE - # async tp is built on top of sequence parallelism # and requires it to be enabled. if self.compilation_config.pass_config.enable_async_tp: @@ -3552,14 +3543,28 @@ def __post_init__(self): if self.compilation_config.pass_config.enable_sequence_parallelism: self.compilation_config.custom_ops.append("+rms_norm") - # disable cudagraph when enforce eager execution - if self.model_config is not None and self.model_config.enforce_eager: - logger.info("Cudagraph is disabled under eager mode") - self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE - elif envs.VLLM_USE_V1: - self.compilation_config.cudagraph_num_of_warmups = 1 + if current_platform.is_cuda_alike(): + # if cudagraph_mode is not explicitly set by users, set default + # value + if self.compilation_config.cudagraph_mode is None: + if envs.VLLM_USE_V1 and self.compilation_config.level \ + == CompilationLevel.PIECEWISE: + self.compilation_config.cudagraph_mode = \ + CUDAGraphMode.PIECEWISE + else: + self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE - self._set_cudagraph_sizes() + # disable cudagraph when enforce eager execution + if self.model_config is not None and \ + self.model_config.enforce_eager: + logger.info("Cudagraph is disabled under eager mode") + self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE + elif envs.VLLM_USE_V1: + self.compilation_config.cudagraph_num_of_warmups = 1 + + self._set_cudagraph_sizes() + else: + self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE if self.cache_config.cpu_offload_gb > 0 and \ self.compilation_config.level != CompilationLevel.NO_COMPILATION \ @@ -3618,7 +3623,7 @@ def __post_init__(self): current_platform.check_and_update_config(self) # final check of cudagraph mode after platform-specific update - if envs.VLLM_USE_V1: + if envs.VLLM_USE_V1 and current_platform.is_cuda_alike(): if self.compilation_config.cudagraph_mode == CUDAGraphMode.FULL \ and 
self.model_config is not None and \ not self.model_config.disable_cascade_attn: From cbc33c1b28505595cd043f584f0e9fc2d4708372 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 16 Aug 2025 17:44:19 +0800 Subject: [PATCH 083/231] [CI][Bugfix] Skip Ovis2 generation test because of broken remote code (#22954) Signed-off-by: Isotr0py Signed-off-by: Duncan Moss --- tests/models/registry.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 3efc9a99ea41..10e29e01e8a1 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -196,7 +196,8 @@ def check_available_online( {"alias": "gpt2"}), "GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder", extras={"tiny": "bigcode/tiny_starcoder_py"}, # noqa: E501 - min_transformers_version="4.55.1"), + min_transformers_version="4.55.1", + transformers_version_reason="HF model broken in 4.55.0"), # noqa: E501 "GPTJForCausalLM": _HfExamplesInfo("Milos/slovak-gpt-j-405M", {"6b": "EleutherAI/gpt-j-6b"}), "GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-70m", @@ -408,14 +409,16 @@ def check_available_online( extras={"2b": "h2oai/h2ovl-mississippi-2b"}, # noqa: E501 max_transformers_version="4.48", # noqa: E501 transformers_version_reason="HF model is not compatible."), # noqa: E501 + "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501 + {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}, # noqa: E501 + min_transformers_version="4.55.1", + transformers_version_reason="HF model broken in 4.55.0"), # noqa: E501 "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", extras={"2B": "OpenGVLab/InternVL2-2B", "3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501 trust_remote_code=True), "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1", trust_remote_code=True), - "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501 - {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501 "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501 trust_remote_code=True), "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501 @@ -455,6 +458,8 @@ def check_available_online( "Llama_Nemotron_Nano_VL" : _HfExamplesInfo("nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1", # noqa: E501 trust_remote_code=True), "Ovis": _HfExamplesInfo("AIDC-AI/Ovis2-1B", trust_remote_code=True, + max_transformers_version="4.53", + transformers_version_reason="HF model is not compatible", # noqa: E501 extras={"1.6-llama": "AIDC-AI/Ovis1.6-Llama3.2-3B", "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}), # noqa: E501 "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224", # noqa: E501 @@ -482,7 +487,9 @@ def check_available_online( "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"), # noqa: E501 "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B", trust_remote_code=True), - "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"), # noqa: E501 + "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct", # noqa: E501 + min_transformers_version="4.55.1", + transformers_version_reason="HF model broken in 4.55.0"), # noqa: E501 "Step3VLForConditionalGeneration": _HfExamplesInfo("stepfun-ai/step3", trust_remote_code=True, is_available_online=False), From 
efd10c3f8a993c0c3da49a3231d75b29087eb654 Mon Sep 17 00:00:00 2001 From: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com> Date: Sat, 16 Aug 2025 03:21:20 -0700 Subject: [PATCH 084/231] Add docs for PrefixRepetitionDataset + enable usage with `vllm bench throughput` (#23012) Signed-off-by: Seiji Eicher Co-authored-by: Roger Wang Signed-off-by: Duncan Moss --- benchmarks/README.md | 22 +++++++++++++- vllm/benchmarks/throughput.py | 57 ++++++++++++++++++++++++++++++++--- 2 files changed, 73 insertions(+), 6 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index caff8f034214..1d715a193ea1 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -40,7 +40,7 @@ become available. wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv - Sonnet + Sonnet (deprecated) ✅ ✅ Local file: benchmarks/sonnet.txt @@ -51,6 +51,12 @@ become available. ✅ synthetic + + Prefix Repetition + ✅ + ✅ + synthetic + HuggingFace-VisionArena ✅ @@ -592,6 +598,20 @@ python3 benchmarks/benchmark_prefix_caching.py \ --input-length-range 128:256 ``` +### Prefix Repetition Dataset + +```bash +vllm bench serve \ + --backend openai \ + --model meta-llama/Llama-2-7b-chat-hf \ + --dataset-name prefix_repetition \ + --num-prompts 100 \ + --prefix-repetition-prefix-len 512 \ + --prefix-repetition-suffix-len 128 \ + --prefix-repetition-num-prefixes 5 \ + --prefix-repetition-output-len 128 +``` + ## ⚡ Example - Request Prioritization Benchmark diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index fdf6548ada5b..0c19fa6dcfdd 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -18,9 +18,11 @@ from vllm.benchmarks.datasets import (AIMODataset, BurstGPTDataset, ConversationDataset, - InstructCoderDataset, RandomDataset, - SampleRequest, ShareGPTDataset, - SonnetDataset, VisionArenaDataset) + InstructCoderDataset, + PrefixRepetitionRandomDataset, + RandomDataset, SampleRequest, + ShareGPTDataset, SonnetDataset, + VisionArenaDataset) from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, write_to_json) from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs @@ -327,6 +329,12 @@ def get_requests(args, tokenizer): dataset_cls = AIMODataset common_kwargs['dataset_subset'] = None common_kwargs['dataset_split'] = "train" + elif args.dataset_name == "prefix_repetition": + dataset_cls = PrefixRepetitionRandomDataset + sample_kwargs["prefix_len"] = args.prefix_repetition_prefix_len + sample_kwargs["suffix_len"] = args.prefix_repetition_suffix_len + sample_kwargs["num_prefixes"] = args.prefix_repetition_num_prefixes + sample_kwargs["output_len"] = args.prefix_repetition_output_len else: raise ValueError(f"Unknown dataset name: {args.dataset_name}") # Remove None values @@ -356,7 +364,11 @@ def validate_args(args): raise ValueError(f"Unsupported backend: {args.backend}") # === Dataset Configuration === - if not args.dataset and not args.dataset_path: + if ( + not args.dataset + and not args.dataset_path + and args.dataset_name not in {"prefix_repetition"} + ): print( "When dataset path is not set, it will default to random dataset") args.dataset_name = 'random' @@ -432,7 +444,10 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--dataset-name", type=str, - choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"], + choices=[ + "sharegpt", "random", "sonnet", "burstgpt", "hf", + "prefix_repetition" + ], help="Name of the dataset to benchmark on.", default="sharegpt") 
parser.add_argument( @@ -521,6 +536,38 @@ def add_cli_args(parser: argparse.ArgumentParser): default=None, help="Split of the HF dataset.") + # prefix repetition dataset + prefix_repetition_group = parser.add_argument_group( + "prefix repetition dataset options") + prefix_repetition_group.add_argument( + "--prefix-repetition-prefix-len", + type=int, + default=None, + help="Number of prefix tokens per request, used only for prefix " + "repetition dataset.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-suffix-len", + type=int, + default=None, + help="Number of suffix tokens per request, used only for prefix " + "repetition dataset. Total input length is prefix_len + suffix_len.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-num-prefixes", + type=int, + default=None, + help="Number of prefixes to generate, used only for prefix repetition " + "dataset. Prompts per prefix is num_requests // num_prefixes.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-output-len", + type=int, + default=None, + help="Number of output tokens per request, used only for prefix " + "repetition dataset.", + ) + parser = AsyncEngineArgs.add_cli_args(parser) From a611c4b5a8317292c3ebd51ac2ff1d291209aab2 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 16 Aug 2025 19:30:49 +0800 Subject: [PATCH 085/231] [Refactor] Allow optional MultiModalKwargsItem in IPC (#23022) Signed-off-by: DarkLight1337 Signed-off-by: Duncan Moss --- tests/v1/core/test_kv_cache_utils.py | 12 +----- tests/v1/core/test_prefix_caching.py | 12 +----- tests/v1/core/test_scheduler.py | 12 +----- tests/v1/core/utils.py | 12 +----- vllm/multimodal/inputs.py | 62 ++++++++-------------------- vllm/v1/engine/__init__.py | 3 +- vllm/v1/engine/mm_input_cache.py | 33 ++++++++------- vllm/v1/engine/processor.py | 10 +++-- vllm/v1/request.py | 7 +++- vllm/v1/worker/gpu_model_runner.py | 4 +- 10 files changed, 59 insertions(+), 108 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index e0b91e6dd7ee..47c74aff1e75 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -7,9 +7,7 @@ import torch from vllm.config import ModelConfig, SchedulerConfig, VllmConfig -from vllm.multimodal.inputs import (MultiModalBatchedField, - MultiModalFieldElem, MultiModalKwargsItem, - PlaceholderRange) +from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange from vllm.sampling_params import SamplingParams from vllm.utils import GiB_bytes, sha256, sha256_cbor_64bit from vllm.v1.core.kv_cache_manager import KVCacheManager @@ -42,13 +40,7 @@ def make_request( if mm_positions is None: mm_kwargs = None else: - mm_elem = MultiModalFieldElem( - modality="dummy_m", - key="dummy_k", - data=None, - field=MultiModalBatchedField(), - ) - mm_item = MultiModalKwargsItem.from_elems([mm_elem]) + mm_item = MultiModalKwargsItem.dummy("dummy_m") mm_kwargs = [mm_item] * len(mm_positions) return Request(request_id=request_id, diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 28cfca6767b1..89824768ed90 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -9,9 +9,7 @@ import torch from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved -from vllm.multimodal.inputs import (MultiModalBatchedField, - MultiModalFieldElem, MultiModalKwargsItem, - PlaceholderRange) +from vllm.multimodal.inputs import MultiModalKwargsItem, 
PlaceholderRange from vllm.sampling_params import SamplingParams from vllm.utils import sha256, sha256_cbor_64bit from vllm.v1.core.block_pool import BlockPool @@ -37,13 +35,7 @@ def make_request( if mm_positions is None: mm_kwargs = None else: - mm_elem = MultiModalFieldElem( - modality="dummy_m", - key="dummy_k", - data=None, - field=MultiModalBatchedField(), - ) - mm_item = MultiModalKwargsItem.from_elems([mm_elem]) + mm_item = MultiModalKwargsItem.dummy("dummy_m") mm_kwargs = [mm_item] * len(mm_positions) return Request(request_id=request_id, diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index ac70c90d92ad..23762a0fb622 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -8,9 +8,7 @@ from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig, SchedulerConfig, SpeculativeConfig, VllmConfig) -from vllm.multimodal.inputs import (MultiModalBatchedField, - MultiModalFieldElem, MultiModalKwargsItem, - PlaceholderRange) +from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange from vllm.sampling_params import GuidedDecodingParams, SamplingParams from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput from vllm.v1.core.sched.scheduler import Scheduler @@ -1328,13 +1326,7 @@ def create_requests_with_priority( for i in range(num_requests): if mm_positions is not None: mm_position = mm_positions[i] - mm_elem = MultiModalFieldElem( - modality="dummy_m", - key="dummy_k", - data=None, - field=MultiModalBatchedField(), - ) - mm_item = MultiModalKwargsItem.from_elems([mm_elem]) + mm_item = MultiModalKwargsItem.dummy("dummy_m") mm_kwargs = [mm_item] * len(mm_position) else: mm_position = None diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index 52093d3d381a..849c3f59ae52 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -6,9 +6,7 @@ from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig, SchedulerConfig, SpeculativeConfig, VllmConfig) -from vllm.multimodal.inputs import (MultiModalBatchedField, - MultiModalFieldElem, MultiModalKwargsItem, - PlaceholderRange) +from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange from vllm.sampling_params import SamplingParams from vllm.v1.core.kv_cache_utils import (get_request_block_hasher, init_none_hash) @@ -143,13 +141,7 @@ def create_requests( for i in range(num_requests): if mm_positions is not None: mm_position = mm_positions[i] - mm_elem = MultiModalFieldElem( - modality="dummy_m", - key="dummy_k", - data=None, - field=MultiModalBatchedField(), - ) - mm_item = MultiModalKwargsItem.from_elems([mm_elem]) + mm_item = MultiModalKwargsItem.dummy("dummy_m") mm_kwargs = [mm_item] * len(mm_position) mm_hashes = ["hash"] * len(mm_position) else: diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 0bbac45c121b..a33ce146995d 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -4,7 +4,7 @@ from abc import ABC, abstractmethod from collections import UserDict, defaultdict from collections.abc import Mapping, Sequence -from dataclasses import dataclass, replace +from dataclasses import dataclass from functools import partial from itertools import accumulate from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, TypeVar, @@ -218,7 +218,7 @@ class MultiModalFieldElem: i.e. the name of the keyword argument to be passed to the model. 
""" - data: Optional[NestedTensors] + data: NestedTensors """ The tensor data of this field in [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs], @@ -315,13 +315,8 @@ def reduce_data( if len(set(field_types)) > 1: raise ValueError(f"Cannot merge different {field_types=}") - validated_data = list[NestedTensors]() - for i, elem in enumerate(elems): - assert elem.data is not None, ( - f"Cannot merge with empty `elems[{i}]`") - validated_data.append(elem.data) - - return self._reduce_data(validated_data, pin_memory=pin_memory) + batch = [elem.data for elem in elems] + return self._reduce_data(batch, pin_memory=pin_memory) @dataclass(frozen=True) @@ -643,6 +638,17 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems]. """ + @staticmethod + def dummy(modality: str): + """Convenience class for testing.""" + mm_elem = MultiModalFieldElem( + modality=modality, + key="dummy", + data=torch.empty(1), + field=MultiModalSharedField(1), + ) + return MultiModalKwargsItem.from_elems([mm_elem]) + @staticmethod def from_elems(elems: Sequence[MultiModalFieldElem]): return MultiModalKwargsItem({elem.key: elem for elem in elems}) @@ -654,46 +660,12 @@ def __init__(self, data: Mapping[str, MultiModalFieldElem]) -> None: assert len(modalities) == 1, f"Found different modalities={modalities}" self._modality = next(iter(modalities)) - self._is_empty = any(elem.data is None for elem in self.values()) - @property def modality(self) -> str: return self._modality - @property - def is_empty(self) -> bool: - return self._is_empty - - def get_data(self) -> Optional[Mapping[str, NestedTensors]]: - if self._is_empty: - return None - - out_data = dict[str, NestedTensors]() - for key, elem in self.items(): - assert elem.data is not None, ( - f"Cannot get data of empty `elem[{key!r}]`") - out_data[key] = elem.data - - return out_data - - def require_data(self) -> Mapping[str, NestedTensors]: - if (data := self.get_data()) is None: - raise RuntimeError("Cannot get data of empty item") - - return data - - # These methods create a new item to avoid mutating cached items in place - def with_data(self, data: Mapping[str, NestedTensors]): - return MultiModalKwargsItem({ - key: replace(elem, data=data[key]) - for key, elem in self.items() - }) - - def without_data(self): - return MultiModalKwargsItem({ - key: replace(elem, data=None) - for key, elem in self.items() - }) + def get_data(self) -> Mapping[str, NestedTensors]: + return {key: elem.data for key, elem in self.items()} # NOTE: UserDict is for V0 compatibility. 
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index b29394f3e676..f7ec982db41b 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -3,6 +3,7 @@ import enum import time +from collections.abc import Sequence from typing import Any, Optional, Union import msgspec @@ -47,7 +48,7 @@ class EngineCoreRequest( request_id: str prompt_token_ids: list[int] - mm_kwargs: Optional[list[MultiModalKwargsItem]] + mm_kwargs: Optional[Sequence[Optional[MultiModalKwargsItem]]] mm_hashes: Optional[list[str]] mm_placeholders: Optional[list[PlaceholderRange]] sampling_params: Optional[SamplingParams] diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index 1fed74330f0e..aa7dc62fd4ac 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -1,11 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Mapping -from typing import TYPE_CHECKING +from collections.abc import Sequence +from typing import TYPE_CHECKING, Optional from vllm.multimodal import MultiModalRegistry from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata -from vllm.multimodal.inputs import MultiModalKwargsItem, NestedTensors +from vllm.multimodal.inputs import MultiModalKwargsItem +from vllm.utils import is_list_of if TYPE_CHECKING: from vllm.config import ModelConfig @@ -58,21 +59,21 @@ def __init__(self, model_config: "ModelConfig", def get_and_update( self, - mm_kwargs: list[MultiModalKwargsItem], + mm_kwargs: Sequence[MultiModalKwargsItem], mm_hashes: list[str], - ) -> list[MultiModalKwargsItem]: + ) -> list[Optional[MultiModalKwargsItem]]: if not self.enabled: - return mm_kwargs + return list(mm_kwargs) assert len(mm_kwargs) == len(mm_hashes) - out_mm_items = list[MultiModalKwargsItem]() + out_mm_items = list[Optional[MultiModalKwargsItem]]() for mm_item, mm_hash in zip(mm_kwargs, mm_hashes): if self.mm_cache.get(mm_hash) is not None: - out_mm_items.append(mm_item.without_data()) + out_mm_items.append(None) else: self.mm_cache[mm_hash] = \ - MultiModalCacheItemMetadata.wraps(mm_item.require_data()) + MultiModalCacheItemMetadata.wraps(mm_item) out_mm_items.append(mm_item) return out_mm_items @@ -91,25 +92,27 @@ def __init__(self, model_config: "ModelConfig", self.enabled = mm_registry.enable_mm_input_cache(model_config) self.mm_cache = MultiModalCache.get_lru_cache( model_config.get_mm_input_cache_gb(), - Mapping[str, NestedTensors], + MultiModalKwargsItem, ) def get_and_update( self, - mm_kwargs: list[MultiModalKwargsItem], + mm_kwargs: Sequence[Optional[MultiModalKwargsItem]], mm_hashes: list[str], ) -> list[MultiModalKwargsItem]: if not self.enabled: - return mm_kwargs + mm_kwargs_lst = list(mm_kwargs) + assert is_list_of(mm_kwargs_lst, MultiModalKwargsItem) + return mm_kwargs_lst assert len(mm_kwargs) == len(mm_hashes) out_mm_items = list[MultiModalKwargsItem]() for mm_item, mm_hash in zip(mm_kwargs, mm_hashes): - if (mm_data := mm_item.get_data()) is None: - out_mm_items.append(mm_item.with_data(self.mm_cache[mm_hash])) + if mm_item is None: + out_mm_items.append(self.mm_cache[mm_hash]) else: - self.mm_cache[mm_hash] = mm_data + self.mm_cache[mm_hash] = mm_item out_mm_items.append(mm_item) return out_mm_items diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 376c76a7e728..c6a23cdbf65a 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -17,6 +17,7 @@ from 
vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import TokenizerGroup +from vllm.utils import is_list_of from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.mm_input_cache import MultiModalInputCacheClient from vllm.v1.structured_output.backend_guidance import ( @@ -295,7 +296,7 @@ def process_inputs( pooling_params = params.clone() # Multimodal related. - sorted_mm_inputs: Optional[list[MultiModalKwargsItem]] = None + sorted_mm_inputs: Optional[list[Optional[MultiModalKwargsItem]]] = None sorted_mm_positions: Optional[list[PlaceholderRange]] = None sorted_mm_hashes: Optional[list[str]] = None if decoder_inputs["type"] == "multimodal": @@ -308,7 +309,7 @@ def process_inputs( # in the input sequence. sorted_mm_idxs = argsort_mm_positions(decoder_mm_positions) - sorted_mm_inputs = [ + orig_sorted_mm_inputs = [ decoder_mm_inputs.get_item(modality, idx) for modality, idx in sorted_mm_idxs ] @@ -323,9 +324,12 @@ def process_inputs( if sorted_mm_hashes is not None: sorted_mm_inputs = self.mm_input_cache_client.get_and_update( - sorted_mm_inputs, + orig_sorted_mm_inputs, sorted_mm_hashes, ) + else: + assert is_list_of(orig_sorted_mm_inputs, MultiModalKwargsItem) + sorted_mm_inputs = orig_sorted_mm_inputs return decoder_inputs.get("prompt"), EngineCoreRequest( request_id=request_id, diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 562925bde669..8b703b6191fe 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -125,14 +125,17 @@ def from_engine_core_request( block_hasher: Optional[Callable[["Request"], list["BlockHash"]]] ) -> "Request": if request.mm_kwargs is not None: - assert is_list_of(request.mm_kwargs, MultiModalKwargsItem), ( + mm_kwargs_lst = list(request.mm_kwargs) + assert is_list_of(mm_kwargs_lst, MultiModalKwargsItem), ( "mm_kwargs was not updated in EngineCore.add_request") + else: + mm_kwargs_lst = None return cls( request_id=request.request_id, client_index=request.client_index, prompt_token_ids=request.prompt_token_ids, - multi_modal_kwargs=request.mm_kwargs, + multi_modal_kwargs=mm_kwargs_lst, multi_modal_hashes=request.mm_hashes, multi_modal_placeholders=request.mm_placeholders, sampling_params=request.sampling_params, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4c919b392fbd..5ee44a82574c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -500,8 +500,8 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: second_per_grid_ts = [] audio_feature_lengths = [] use_audio_in_video = False - for item in self.requests[req_id].mm_kwargs: - mm_input = item.require_data() + for mm_item in self.requests[req_id].mm_kwargs: + mm_input = mm_item.get_data() if mm_input.get("image_grid_thw") is not None: image_grid_thw.append( mm_input["image_grid_thw"].tolist()) From 0025ac69c3c54fffd2406f9b2a79a43b336e183e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Sat, 16 Aug 2025 20:16:58 +0800 Subject: [PATCH 086/231] [New Model]mBART model (#22883) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 汪志鹏 Signed-off-by: Duncan Moss --- docs/models/supported_models.md | 4 + examples/offline_inference/encoder_decoder.py | 233 +++++---- .../models/language/generation/test_mbart.py | 123 +++++ tests/models/registry.py | 2 + vllm/model_executor/models/bart.py | 444 +++++++++++++++++- 
vllm/model_executor/models/registry.py | 1 + 6 files changed, 716 insertions(+), 91 deletions(-) create mode 100644 tests/models/language/generation/test_mbart.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index a24fa4bcce33..a514572945c3 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -330,6 +330,7 @@ th { | `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | ✅︎ | | `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | | | `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | | +| `MBartForConditionalGeneration` | mBART | `facebook/mbart-large-en-ro`, `facebook/mbart-large-50`, etc. | | | | | `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ | | `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ | | `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ | @@ -418,6 +419,9 @@ Some models are supported only via the [Transformers backend](#transformers). Th !!! note Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. +!!! note + Some mBART models' config files do not have an `architecture` defined. Therefore, you need to use `--hf-overrides '{"architectures": ["MBartForConditionalGeneration"]}'` to explicitly specify the use of the `MBartForConditionalGeneration` architecture. + ### Pooling Models See [this page](./pooling_models.md) for more information on how to use pooling models. diff --git a/examples/offline_inference/encoder_decoder.py b/examples/offline_inference/encoder_decoder.py index 0da6fa5c4af5..df6c1eaf4a21 100644 --- a/examples/offline_inference/encoder_decoder.py +++ b/examples/offline_inference/encoder_decoder.py @@ -2,9 +2,14 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Demonstrate prompting of text-to-text -encoder/decoder models, specifically BART +encoder/decoder models, specifically BART and mBART. + +This script is refactored to allow model selection via command-line arguments. """ +import argparse +from typing import NamedTuple, Optional + from vllm import LLM, SamplingParams from vllm.inputs import ( ExplicitEncoderDecoderPrompt, @@ -14,119 +19,175 @@ ) -def create_prompts(tokenizer): - # Test prompts - # - # This section shows all of the valid ways to prompt an - # encoder/decoder model. - # - # - Helpers for building prompts - text_prompt_raw = "Hello, my name is" - text_prompt = TextPrompt(prompt="The president of the United States is") - tokens_prompt = TokensPrompt( - prompt_token_ids=tokenizer.encode(prompt="The capital of France is") +class ModelRequestData(NamedTuple): + """ + Holds the configuration for a specific model, including its + HuggingFace ID and the prompts to use for the demo. + """ + + model_id: str + encoder_prompts: list + decoder_prompts: list + hf_overrides: Optional[dict] = None + + +def get_bart_config() -> ModelRequestData: + """ + Returns the configuration for facebook/bart-large-cnn. + This uses the exact test cases from the original script. 
+ """ + encoder_prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "An encoder prompt", + ] + decoder_prompts = [ + "A decoder prompt", + "Another decoder prompt", + ] + return ModelRequestData( + model_id="facebook/bart-large-cnn", + encoder_prompts=encoder_prompts, + decoder_prompts=decoder_prompts, ) - # - Pass a single prompt to encoder/decoder model - # (implicitly encoder input prompt); - # decoder input prompt is assumed to be None - - single_text_prompt_raw = text_prompt_raw # Pass a string directly - single_text_prompt = text_prompt # Pass a TextPrompt - single_tokens_prompt = tokens_prompt # Pass a TokensPrompt - - # ruff: noqa: E501 - # - Pass explicit encoder and decoder input prompts within one data structure. - # Encoder and decoder prompts can both independently be text or tokens, with - # no requirement that they be the same prompt type. Some example prompt-type - # combinations are shown below, note that these are not exhaustive. - - enc_dec_prompt1 = ExplicitEncoderDecoderPrompt( - # Pass encoder prompt string directly, & - # pass decoder prompt tokens - encoder_prompt=single_text_prompt_raw, - decoder_prompt=single_tokens_prompt, - ) - enc_dec_prompt2 = ExplicitEncoderDecoderPrompt( - # Pass TextPrompt to encoder, and - # pass decoder prompt string directly - encoder_prompt=single_text_prompt, - decoder_prompt=single_text_prompt_raw, + + +def get_mbart_config() -> ModelRequestData: + """ + Returns the configuration for facebook/mbart-large-en-ro. + This uses prompts suitable for an English-to-Romanian translation task. + """ + encoder_prompts = [ + "The quick brown fox jumps over the lazy dog.", + "How are you today?", + ] + decoder_prompts = ["", ""] + hf_overrides = {"architectures": ["MBartForConditionalGeneration"]} + return ModelRequestData( + model_id="facebook/mbart-large-en-ro", + encoder_prompts=encoder_prompts, + decoder_prompts=decoder_prompts, + hf_overrides=hf_overrides, ) - enc_dec_prompt3 = ExplicitEncoderDecoderPrompt( - # Pass encoder prompt tokens directly, and - # pass TextPrompt to decoder - encoder_prompt=single_tokens_prompt, - decoder_prompt=single_text_prompt, + + +MODEL_GETTERS = { + "bart": get_bart_config, + "mbart": get_mbart_config, +} + + +def create_all_prompt_types( + encoder_prompts_raw: list, + decoder_prompts_raw: list, + tokenizer, +) -> list: + """ + Generates a list of diverse prompt types for demonstration. + This function is generic and uses the provided raw prompts + to create various vLLM input objects. 
+ """ + text_prompt_raw = encoder_prompts_raw[0] + text_prompt = TextPrompt(prompt=encoder_prompts_raw[1 % len(encoder_prompts_raw)]) + tokens_prompt = TokensPrompt( + prompt_token_ids=tokenizer.encode( + encoder_prompts_raw[2 % len(encoder_prompts_raw)] + ) ) - # - Finally, here's a useful helper function for zipping encoder and - # decoder prompts together into a list of ExplicitEncoderDecoderPrompt - # instances + decoder_tokens_prompt = TokensPrompt( + prompt_token_ids=tokenizer.encode(decoder_prompts_raw[0]) + ) + single_prompt_examples = [ + text_prompt_raw, + text_prompt, + tokens_prompt, + ] + explicit_pair_examples = [ + ExplicitEncoderDecoderPrompt( + encoder_prompt=text_prompt_raw, + decoder_prompt=decoder_tokens_prompt, + ), + ExplicitEncoderDecoderPrompt( + encoder_prompt=text_prompt, + decoder_prompt=decoder_prompts_raw[1 % len(decoder_prompts_raw)], + ), + ExplicitEncoderDecoderPrompt( + encoder_prompt=tokens_prompt, + decoder_prompt=text_prompt, + ), + ] zipped_prompt_list = zip_enc_dec_prompts( - ["An encoder prompt", "Another encoder prompt"], - ["A decoder prompt", "Another decoder prompt"], + encoder_prompts_raw, + decoder_prompts_raw, ) + return single_prompt_examples + explicit_pair_examples + zipped_prompt_list - # - Let's put all of the above example prompts together into one list - # which we will pass to the encoder/decoder LLM. - return [ - single_text_prompt_raw, - single_text_prompt, - single_tokens_prompt, - enc_dec_prompt1, - enc_dec_prompt2, - enc_dec_prompt3, - ] + zipped_prompt_list - -# Create a sampling params object. -def create_sampling_params(): +def create_sampling_params() -> SamplingParams: + """Create a sampling params object.""" return SamplingParams( temperature=0, top_p=1.0, min_tokens=0, - max_tokens=20, + max_tokens=30, ) -# Print the outputs. -def print_outputs(outputs): - print("-" * 50) +def print_outputs(outputs: list): + """Formats and prints the generation outputs.""" + print("-" * 80) for i, output in enumerate(outputs): prompt = output.prompt encoder_prompt = output.encoder_prompt generated_text = output.outputs[0].text print(f"Output {i + 1}:") - print( - f"Encoder prompt: {encoder_prompt!r}\n" - f"Decoder prompt: {prompt!r}\n" - f"Generated text: {generated_text!r}" + print(f"Encoder Prompt: {encoder_prompt!r}") + print(f"Decoder Prompt: {prompt!r}") + print(f"Generated Text: {generated_text!r}") + print("-" * 80) + + +def main(args): + """Main execution function.""" + model_key = args.model + if model_key not in MODEL_GETTERS: + raise ValueError( + f"Unknown model: {model_key}. " + f"Available models: {list(MODEL_GETTERS.keys())}" ) - print("-" * 50) - - -def main(): - dtype = "float" + config_getter = MODEL_GETTERS[model_key] + model_config = config_getter() - # Create a BART encoder/decoder model instance + print(f"🚀 Running demo for model: {model_config.model_id}") llm = LLM( - model="facebook/bart-large-cnn", - dtype=dtype, + model=model_config.model_id, + dtype="float", + hf_overrides=model_config.hf_overrides, ) - - # Get BART tokenizer tokenizer = llm.llm_engine.get_tokenizer_group() - - prompts = create_prompts(tokenizer) + prompts = create_all_prompt_types( + encoder_prompts_raw=model_config.encoder_prompts, + decoder_prompts_raw=model_config.decoder_prompts, + tokenizer=tokenizer, + ) sampling_params = create_sampling_params() - - # Generate output tokens from the prompts. The output is a list of - # RequestOutput objects that contain the prompt, generated - # text, and other information. 
outputs = llm.generate(prompts, sampling_params) - print_outputs(outputs) if __name__ == "__main__": - main() + parser = argparse.ArgumentParser( + description="A flexible demo for vLLM encoder-decoder models." + ) + parser.add_argument( + "--model", + "-m", + type=str, + default="bart", + choices=MODEL_GETTERS.keys(), + help="The short name of the model to run.", + ) + args = parser.parse_args() + main(args) diff --git a/tests/models/language/generation/test_mbart.py b/tests/models/language/generation/test_mbart.py new file mode 100644 index 000000000000..854a72713943 --- /dev/null +++ b/tests/models/language/generation/test_mbart.py @@ -0,0 +1,123 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional + +import pytest +from transformers import AutoModelForSeq2SeqLM + +from vllm.sequence import SampleLogprobs + +from ....conftest import DecoderPromptType, HfRunner, VllmRunner +from ...utils import check_logprobs_close + + +def vllm_to_hf_output( + vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], + decoder_prompt_type: DecoderPromptType, +): + """Sanitize vllm output to be comparable with hf output.""" + output_ids, output_str, out_logprobs = vllm_output + hf_output_str = output_str + "" + return output_ids, hf_output_str, out_logprobs + + +def run_test( + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + prompts: list[dict[str, str]], + decoder_prompt_type: DecoderPromptType, + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +) -> None: + ''' + Test the vLLM mBART model by validating it against HuggingFace (HF). + (Docstring content is omitted for brevity) + ''' + + vllm_prompts = prompts + if decoder_prompt_type == DecoderPromptType.NONE: + vllm_prompts = [{ + "encoder_prompt": p['encoder_prompt'], + "decoder_prompt": "" + } for p in prompts] + + vllm_kwargs = { + "hf_overrides": { + "architectures": ["MBartForConditionalGeneration"] + } + } + + with vllm_runner(model, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True, + **vllm_kwargs) as vllm_model: # type: ignore + vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( + vllm_prompts, max_tokens, num_logprobs) + + hf_kwargs = { + "top_k": None, + "num_beams": 1, + "repetition_penalty": 1.0, + "top_p": 1.0, + "length_penalty": 1.0, + "early_stopping": False, + "no_repeat_ngram_size": None, + "min_length": 0 + } + + with hf_runner(model, dtype=dtype, + auto_cls=AutoModelForSeq2SeqLM) as hf_model: + hf_kwargs["decoder_start_token_id"] = ( + hf_model.tokenizer.lang_code_to_id["ro_RO"]) + + hf_outputs = ( + hf_model.generate_encoder_decoder_greedy_logprobs_limit( + prompts, # HF runner still uses the original prompts + max_tokens, + num_logprobs, + **hf_kwargs, + )) + + hf_skip_tokens = 0 + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, decoder_prompt_type) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + num_outputs_0_skip_tokens=hf_skip_tokens, + ) + + +@pytest.mark.parametrize( + "model", + [pytest.param("facebook/mbart-large-en-ro")], +) +@pytest.mark.parametrize("dtype", ["float", "bfloat16"]) +@pytest.mark.parametrize("max_tokens", [64]) +@pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("decoder_prompt_type", 
list(DecoderPromptType)) +def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model, + dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None: + + run_test( + hf_runner, + vllm_runner, + example_encoder_decoder_prompts[decoder_prompt_type], + decoder_prompt_type, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) diff --git a/tests/models/registry.py b/tests/models/registry.py index 10e29e01e8a1..99cf997790fe 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -316,6 +316,8 @@ def check_available_online( # [Encoder-decoder] "BartModel": _HfExamplesInfo("facebook/bart-base"), "BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"), + "MBartForConditionalGeneration": _HfExamplesInfo("facebook/mbart-large-en-ro", # noqa: E501 + hf_overrides={"architectures": ["MBartForConditionalGeneration"]}), # noqa: E501 } _EMBEDDING_EXAMPLE_MODELS = { diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index 3d328c88ff6e..32551d8102f3 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -46,7 +46,8 @@ from vllm.sequence import IntermediateTensors from .interfaces import SupportsQuant, SupportsV0Only -from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix +from .utils import (AutoWeightsLoader, WeightsMapper, cast_overflow_tensors, + maybe_prefix) logger = logging.get_logger(__name__) @@ -422,10 +423,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: if hidden_states.dtype == torch.float16 and ( torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, - min=-clamp_value, - max=clamp_value) + hidden_states = cast_overflow_tensors(hidden_states) return hidden_states @@ -906,3 +904,439 @@ def load_weights(self, weights: Iterable[tuple[str, }) return loaded_params + + +class MBartEncoderLayer(BartEncoderLayer): + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + r""" + Args: + hidden_states + torch.Tensor of *encoder* input embeddings. 
+ Returns: + Encoder layer output torch.Tensor + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states = self.self_attn(hidden_states=hidden_states) + + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + fc1_out, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(fc1_out) + + hidden_states, _ = self.fc2(hidden_states) + + hidden_states = residual + hidden_states + + if hidden_states.dtype == torch.float16 and ( + torch.isinf(hidden_states).any() + or torch.isnan(hidden_states).any()): + hidden_states = cast_overflow_tensors(hidden_states) + + return hidden_states + + +class MBartDecoderLayer(BartDecoderLayer): + + def forward( + self, + decoder_hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + residual = decoder_hidden_states + hidden_states = self.self_attn_layer_norm(decoder_hidden_states) + + # Self Attention + hidden_states = self.self_attn(hidden_states=hidden_states) + + hidden_states = residual + hidden_states + + # Cross-Attention Block + + residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + hidden_states = self.encoder_attn( + decoder_hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + ) + + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + fc1_out, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(fc1_out) + + hidden_states, _ = self.fc2(hidden_states) + + hidden_states = residual + hidden_states + + return hidden_states + + +class MBartEncoder(nn.Module): + """ + Transformer encoder consisting of *config.encoder_layers* + self attention layers. Each layer is a [`BartEncoderLayer`]. + Args: + config: BartConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, + config: BartConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + embed_tokens: Optional[nn.Embedding] = None, + prefix: str = ""): + super().__init__() + + self.cache_config = cache_config + self.quant_config = quant_config + self.lora_config = lora_config + embed_dim = config.d_model + self.max_source_positions = config.max_position_embeddings + embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 + + self.embed_tokens = BartScaledWordEmbedding(config.vocab_size, + embed_dim, + embed_scale=embed_scale) + + if embed_tokens is not None: + self.embed_tokens.weight = embed_tokens.weight + + self.embed_positions = BartLearnedPositionalEmbedding( + config.max_position_embeddings, + embed_dim, + ) + self.layers = nn.ModuleList([ + MBartEncoderLayer(config, + cache_config, + quant_config, + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(config.encoder_layers) + ]) + + self.layernorm_embedding = nn.LayerNorm(embed_dim) + self.layer_norm = nn.LayerNorm(config.d_model) # 改动 + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + r""" + Args: + input_ids + Indices of *encoder* input sequence tokens in the vocabulary. + Padding will be ignored by default should you + provide it. + positions + Positions of *encoder* input sequence tokens. 
+ Returns: + Decoder output torch.Tensor + """ + # retrieve input_ids and inputs_embeds + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + embed_pos = self.embed_positions(positions) + embed_pos = embed_pos.to(inputs_embeds.device) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + + for encoder_layer in self.layers: + hidden_states = encoder_layer(hidden_states=hidden_states) + + hidden_states = self.layer_norm(hidden_states) + return hidden_states + + +class MBartDecoder(nn.Module): + """ + Transformer decoder consisting of *config.decoder_layers* layers. + Each layer is a [`BartDecoderLayer`] + Args: + config: BartConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__( + self, + config: BartConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + embed_tokens: Optional[nn.Embedding] = None, + prefix: str = "", + ): + super().__init__() + self.cache_config = cache_config + self.quant_config = quant_config + self.lora_config = lora_config + self.max_target_positions = config.max_position_embeddings + embed_scale = math.sqrt( + config.d_model) if config.scale_embedding else 1.0 + + self.embed_tokens = BartScaledWordEmbedding(config.vocab_size, + config.d_model, + embed_scale=embed_scale) + + if embed_tokens is not None: + self.embed_tokens.weight = embed_tokens.weight + + self.embed_positions = BartLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + ) + + self.layers = nn.ModuleList( + [MBartDecoderLayer(config, cache_config, quant_config, + prefix=f"{prefix}.layers.{layer_idx}") \ + for layer_idx in range(config.decoder_layers)]) + + self.layernorm_embedding = nn.LayerNorm(config.d_model) + self.layer_norm = nn.LayerNorm(config.d_model) + + def forward( + self, + decoder_input_ids: torch.Tensor, + decoder_positions: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor], + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + r""" + Args: + decoder_input_ids + Indices of *decoder* input sequence tokens in the vocabulary. + Padding will be ignored by default should you + provide it. + decoder_positions + Positions of *decoder* input sequence tokens. 
+ encoder_hidden_states: + Tensor of encoder output embeddings + Returns: + Decoder output torch.Tensor + """ + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(decoder_input_ids) + else: + decoder_positions = inputs_embeds[:, -1] + + # embed positions + embed_pos = self.embed_positions(decoder_positions) + embed_pos = embed_pos.to(inputs_embeds.device) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + + # decoder layers + + for decoder_layer in self.layers: + hidden_states = decoder_layer( + decoder_hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + ) + + hidden_states = self.layer_norm(hidden_states) + return hidden_states + + +class MBartModel(nn.Module, SupportsQuant): + _tied_weights_keys = [ + "encoder.embed_tokens.weight", "decoder.embed_tokens.weight" + ] + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + + self.config = config + + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + + self.encoder = MBartEncoder(config, + cache_config, + quant_config=quant_config, + prefix=f"{prefix}.encoder") + self.decoder = MBartDecoder(config, + cache_config, + quant_config=quant_config, + prefix=f"{prefix}.decoder") + + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, + encoder_input_ids: torch.Tensor, + encoder_positions: torch.Tensor) -> torch.Tensor: + r""" + Args: + input_ids + Indices of *decoder* input sequence tokens in the vocabulary. + Padding will be ignored by default should you + provide it. + positions + Positions of *decoder* input sequence tokens. + encoder_input_ids + Indices of *encoder* input sequence tokens in the vocabulary. + encoder_positions: + Positions of *encoder* input sequence tokens. + Returns: + Model output torch.Tensor + """ + + encoder_hidden_states = None + + if encoder_input_ids.numel() > 0: + # Run encoder attention if a non-zero number of encoder tokens + # are provided as input + encoder_hidden_states = self.encoder(input_ids=encoder_input_ids, + positions=encoder_positions) + + # decoder outputs consists of + # (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + decoder_input_ids=input_ids, + decoder_positions=positions, + encoder_hidden_states=encoder_hidden_states) + + return decoder_outputs + + +class MBartForConditionalGeneration(nn.Module, SupportsV0Only, SupportsQuant): + base_model_prefix = "model" + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "decoder.": "model.decoder.", + "encoder.": "model.encoder.", + "shared.": "model.shared." 
+ }, + orig_to_new_substr={ + "beta": "bias", + "gamma": "weight", + "LayerNorm": "layernorm", + }, + ) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + lora_config = vllm_config.lora_config + assert config.tie_word_embeddings + self.config = config + self.model = MBartModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + + embed_scale = math.sqrt( + config.d_model) if config.scale_embedding else 1.0 + + self.lm_head = BartParallelLMHead(config.vocab_size, + config.d_model, + embed_scale=embed_scale) + + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + *, + encoder_input_ids: torch.Tensor, + encoder_positions: torch.Tensor, + **kwargs, + ) -> torch.Tensor: + return self.model(input_ids, positions, encoder_input_ids, + encoder_positions) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + model_params_dict = dict(self.named_parameters()) + loaded_params = set() + remaining_weights = [] + shared_embedding_weight = None + + for name, loaded_weight in weights: + if any(skip in name + for skip in ["cls.", "pooler.", "final_logits_bias"]): + continue + if any(embed_name in name for embed_name in [ + 'shared.weight', 'encoder.embed_tokens.weight', + 'decoder.embed_tokens.weight' + ]): + if shared_embedding_weight is None: + shared_embedding_weight = loaded_weight + continue + is_stacked = False + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + vllm_name = name + for src, dst in self.hf_to_vllm_mapper.orig_to_new_substr.items( + ): + vllm_name = vllm_name.replace(src, dst) + for src, dst in self.hf_to_vllm_mapper.orig_to_new_prefix.items( + ): + if vllm_name.startswith(src): + vllm_name = dst + vllm_name[len(src):] + break + vllm_name = vllm_name.replace(weight_name, param_name) + if vllm_name in model_params_dict: + param = model_params_dict[vllm_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight, shard_id) + loaded_params.add(vllm_name) + is_stacked = True + break + if not is_stacked: + remaining_weights.append((name, loaded_weight)) + loader = AutoWeightsLoader(self, skip_prefixes=["cls.", "pooler."]) + auto_loaded_params = loader.load_weights(remaining_weights, + mapper=self.hf_to_vllm_mapper) + loaded_params.update(auto_loaded_params) + if shared_embedding_weight is not None: + lm_head_param = self.lm_head.weight + weight_loader = getattr(lm_head_param, "weight_loader", + default_weight_loader) + weight_loader(lm_head_param, shared_embedding_weight) + self.model.encoder.embed_tokens.weight = self.lm_head.weight + self.model.decoder.embed_tokens.weight = self.lm_head.weight + loaded_params.update({ + 'model.encoder.embed_tokens.weight', 'lm_head.weight', + 
'model.decoder.embed_tokens.weight' + }) + return loaded_params diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index b817615b4356..109bc1fe5c77 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -141,6 +141,7 @@ # [Encoder-decoder] "BartModel": ("bart", "BartForConditionalGeneration"), "BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"), + "MBartForConditionalGeneration": ("bart", "MBartForConditionalGeneration"), } _EMBEDDING_MODELS = { From 10bd3f23118fe45cf396bebabbd6a0ffd7a63a91 Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Sat, 16 Aug 2025 14:36:30 -0300 Subject: [PATCH 087/231] Fix handling of `max_num_batched_tokens` for pooling tasks (#23004) Signed-off-by: Max de Bayser Signed-off-by: Duncan Moss --- vllm/config/__init__.py | 3 --- vllm/engine/arg_utils.py | 10 +++++----- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 72fec5e205e3..14fc5589a89a 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3600,9 +3600,6 @@ def __post_init__(self): logger.info(reason) self.scheduler_config.chunked_prefill_enabled = False self.scheduler_config.long_prefill_token_threshold = 0 - self.scheduler_config.max_num_batched_tokens = max( - self.scheduler_config.max_model_len, - DEFAULT_MAX_NUM_BATCHED_TOKENS) if self.cache_config is not None: self.cache_config.enable_prefix_caching = False diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f8af6d36e0c0..630fbec4539e 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1602,9 +1602,6 @@ def _set_default_args_v1(self, usage_context: UsageContext, self.enable_prefix_caching = incremental_prefill_supported logger.info("(%s) prefix caching by default", action) - if not self.enable_chunked_prefill: - self.max_num_batched_tokens = model_config.max_model_len - # V1 should use the new scheduler by default. 
# Swap it only if this arg is set to the original V0 default if self.scheduler_cls == EngineArgs.scheduler_cls: @@ -1692,8 +1689,11 @@ def _set_default_args_v1(self, usage_context: UsageContext, self.max_num_batched_tokens = \ default_max_num_batched_tokens[usage_context] else: - self.max_num_batched_tokens = default_max_num_batched_tokens[ - usage_context] + if not self.enable_chunked_prefill: + self.max_num_batched_tokens = model_config.max_model_len + else: + self.max_num_batched_tokens = \ + default_max_num_batched_tokens[usage_context] logger.debug( "Setting max_num_batched_tokens to %d for %s usage context.", self.max_num_batched_tokens, use_context_value) From 0691dba66f35504679ea940e3b5c24dd73705023 Mon Sep 17 00:00:00 2001 From: Woonggi Min Date: Sun, 17 Aug 2025 02:38:42 +0900 Subject: [PATCH 088/231] [Frontend] Added support for HermesToolParser for models without special tokens (#16890) Signed-off-by: minpeter Signed-off-by: Duncan Moss --- .../tool_parsers/test_hermes_tool_parser.py | 127 ++++++++++++++++++ .../openai/tool_parsers/hermes_tool_parser.py | 81 ++++++++--- 2 files changed, 191 insertions(+), 17 deletions(-) create mode 100644 tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py diff --git a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py new file mode 100644 index 000000000000..28b1f8358d80 --- /dev/null +++ b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py @@ -0,0 +1,127 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json + +import pytest + +from ....utils import RemoteOpenAIServer + +MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" +LORA_MODEL = "minpeter/LoRA-Llama-3.2-1B-tool-vllm-ci" + +SERVER_ARGS = [ + "--enforce-eager", + "--enable-auto-tool-choice", + "--tool-call-parser", + "hermes", + "--enable-lora", + "--lora-modules", + f"{LORA_MODEL}={LORA_MODEL}", +] + +TOOLS = [{ + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": + "The city and state, e.g. 
San Francisco, CA", + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"] + }, + }, + "required": ["location"], + }, + }, +}] + +MESSAGES = [{"role": "user", "content": "What's the weather like in Boston?"}] + + +@pytest.mark.asyncio +async def test_non_streaming_tool_call(): + """Test tool call in non-streaming mode.""" + with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server: + client = server.get_async_client() + + response = await client.chat.completions.create( + model=LORA_MODEL, + messages=MESSAGES, + tools=TOOLS, + tool_choice="auto", + temperature=0.0, + ) + + assert response.choices + choice = response.choices[0] + message = choice.message + + assert choice.finish_reason == "tool_calls" + assert message.tool_calls is not None + + tool_call = message.tool_calls[0] + assert tool_call.type == "function" + assert tool_call.function.name == "get_current_weather" + + arguments = json.loads(tool_call.function.arguments) + assert "location" in arguments + assert "Boston" in arguments["location"] + print("\n[Non-Streaming Test Passed]") + print(f"Tool Call: {tool_call.function.name}") + print(f"Arguments: {arguments}") + + +@pytest.mark.asyncio +async def test_streaming_tool_call(): + """Test tool call in streaming mode.""" + with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server: + client = server.get_async_client() + + stream = await client.chat.completions.create( + model=LORA_MODEL, + messages=MESSAGES, + tools=TOOLS, + tool_choice="auto", + temperature=0.0, + stream=True, + ) + + tool_call_chunks = {} + async for chunk in stream: + if not chunk.choices: + continue + + delta = chunk.choices[0].delta + if not delta or not delta.tool_calls: + continue + + for tool_chunk in delta.tool_calls: + index = tool_chunk.index + if index not in tool_call_chunks: + tool_call_chunks[index] = {"name": "", "arguments": ""} + + if tool_chunk.function.name: + tool_call_chunks[index]["name"] += tool_chunk.function.name + if tool_chunk.function.arguments: + tool_call_chunks[index][ + "arguments"] += tool_chunk.function.arguments + + assert len(tool_call_chunks) == 1 + reconstructed_tool_call = tool_call_chunks[0] + + assert reconstructed_tool_call["name"] == "get_current_weather" + + arguments = json.loads(reconstructed_tool_call["arguments"]) + assert "location" in arguments + assert "Boston" in arguments["location"] + print("\n[Streaming Test Passed]") + print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}") + print(f"Reconstructed Arguments: {arguments}") diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index c7030d34d453..d126130ab9bc 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -52,14 +52,51 @@ def __init__(self, tokenizer: AnyTokenizer): raise ValueError( "The model tokenizer must be passed to the ToolParser " "constructor during construction.") - self.tool_call_start_token_id = self.vocab.get( - self.tool_call_start_token) - self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token) - if (self.tool_call_start_token_id is None - or self.tool_call_end_token_id is None): - raise RuntimeError( - "Hermes 2 Pro Tool parser could not locate tool call start/end " - "tokens in the tokenizer!") + self.tool_call_start_token_ids = self.model_tokenizer.encode( + self.tool_call_start_token, add_special_tokens=False) + self.tool_call_end_token_ids = self.model_tokenizer.encode( + 
self.tool_call_end_token, add_special_tokens=False) + + self.tool_call_start_token_array = [ + self.model_tokenizer.decode([token_id]) + for token_id in self.tool_call_start_token_ids + ] + + self.tool_call_end_token_array = [ + self.model_tokenizer.decode([token_id]) + for token_id in self.tool_call_end_token_ids + ] + + self.buffered_delta_text = "" + + # Very simple idea: when encountering tokens like <, tool, _call, >, + # <, /, tool, _call, >, store them in a buffer. + # When the last token is encountered, empty the buffer and return it. + # If a token appears in an incorrect sequence while storing in the buffer, + # return the preceding buffer along with the token. + def tool_call_delta_buffer(self, delta_text: str): + # If the sequence of tool_call_start or tool_call_end tokens is not yet + # complete, fill the buffer with the token and return "". + if (delta_text in self.tool_call_start_token_array + or delta_text in self.tool_call_end_token_array): + # If delta_text is the last token of tool_call_start_token or + # tool_call_end_token, empty the buffer and return + # the buffered text + delta_text. + if (delta_text == self.tool_call_start_token_array[-1] + or delta_text == self.tool_call_end_token_array[-1]): + buffered_text = self.buffered_delta_text + self.buffered_delta_text = "" + return buffered_text + delta_text + else: + self.buffered_delta_text = self.buffered_delta_text + delta_text + return "" + else: + if self.buffered_delta_text: + buffered_text = self.buffered_delta_text + self.buffered_delta_text = "" + return buffered_text + delta_text + else: + return delta_text def extract_tool_calls( self, @@ -124,11 +161,23 @@ def extract_tool_calls_streaming( delta_token_ids: Sequence[int], request: ChatCompletionRequest, ) -> Union[DeltaMessage, None]: + # 1. All tokens are parsed based on _text, not token_ids. + # 2. All incoming text data is processed by the tool_call_delta_buffer + # function for buffering before being used for parsing. + + delta_text = self.tool_call_delta_buffer(delta_text) + # If the last characters of previous_text + # match self.buffered_delta_text, remove only the matching part. 
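
To make the buffering behaviour described above concrete, here is a minimal standalone sketch of the same idea (the marker pieces are illustrative; the real pieces come from decoding the tokenizer's encoding of `<tool_call>` and `</tool_call>`):

class MarkerBuffer:
    """Toy stand-in for tool_call_delta_buffer: hold back pieces of a
    multi-token marker and emit the whole marker once its last piece
    arrives."""

    def __init__(self, start_pieces: list[str], end_pieces: list[str]):
        self.start_pieces = start_pieces
        self.end_pieces = end_pieces
        self.buffer = ""

    def feed(self, piece: str) -> str:
        if piece in self.start_pieces or piece in self.end_pieces:
            if piece in (self.start_pieces[-1], self.end_pieces[-1]):
                # Marker complete: flush everything buffered plus this piece.
                out, self.buffer = self.buffer + piece, ""
                return out
            # Marker still in progress: hold this piece back.
            self.buffer += piece
            return ""
        # Ordinary text: flush any stale buffer along with it.
        out, self.buffer = self.buffer + piece, ""
        return out


buf = MarkerBuffer(["<tool", "_call", ">"], ["</tool", "_call", ">"])
assert [buf.feed(p) for p in ["Hi", "<tool", "_call", ">"]] == \
    ["Hi", "", "", "<tool_call>"]
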
+ if (len(previous_text) >= len(self.buffered_delta_text) + and previous_text[-len(self.buffered_delta_text):] + == self.buffered_delta_text): + previous_text = previous_text[:-len(self.buffered_delta_text)] + current_text = previous_text + delta_text logger.debug("delta_text: %s", delta_text) logger.debug("delta_token_ids: %s", delta_token_ids) # check to see if we should be streaming a tool call - is there a - if self.tool_call_start_token_id not in current_token_ids: + if self.tool_call_start_token not in current_text: logger.debug("No tool call tokens found!") return DeltaMessage(content=delta_text) @@ -136,14 +185,12 @@ def extract_tool_calls_streaming( # figure out where we are in the parsing by counting tool call # start & end tags - prev_tool_start_count = previous_token_ids.count( - self.tool_call_start_token_id) - prev_tool_end_count = previous_token_ids.count( - self.tool_call_end_token_id) - cur_tool_start_count = current_token_ids.count( - self.tool_call_start_token_id) - cur_tool_end_count = current_token_ids.count( - self.tool_call_end_token_id) + prev_tool_start_count = previous_text.count( + self.tool_call_start_token) + prev_tool_end_count = previous_text.count(self.tool_call_end_token) + cur_tool_start_count = current_text.count( + self.tool_call_start_token) + cur_tool_end_count = current_text.count(self.tool_call_end_token) tool_call_portion = None text_portion = None From e45076eba6b2195313e9efa291bd4ed07d899016 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 16 Aug 2025 14:16:00 -0400 Subject: [PATCH 089/231] [Bugfix gpt-oss] Fix float32 convert for flashinfer sink support (#23016) Signed-off-by: mgoin Signed-off-by: Duncan Moss --- vllm/attention/layer.py | 9 +++++++++ vllm/v1/attention/backends/flashinfer.py | 3 --- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 1a9c0e26b53c..0e87fa3f23e3 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -308,6 +308,15 @@ def process_weights_after_loading(self, act_dtype: torch.dtype): if hasattr(self.impl, "process_weights_after_loading"): self.impl.process_weights_after_loading(act_dtype) + # FlashInfer requires attention sinks to be float32 + if (self.backend == _Backend.FLASHINFER_VLLM_V1 + and hasattr(self.impl, 'sinks')): + from vllm.v1.attention.backends.flashinfer import FlashInferImpl + assert isinstance(self.impl, FlashInferImpl) + if (self.impl.sinks is not None + and self.impl.sinks.dtype != torch.float32): + self.impl.sinks = self.impl.sinks.to(torch.float32) + def get_attn_backend(self) -> type[AttentionBackend]: return self.attn_backend diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index eac3f33e1509..991904229fd7 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -642,9 +642,6 @@ def __init__( f"heads in the layer. Expected {num_heads}, but got " f"{sinks.shape[0]}." 
) - # Cast sinks to float32 if needed (FlashInfer requirement) - if sinks.dtype != torch.float32: - sinks = sinks.to(torch.float32) self.sinks = sinks def forward( From 40a0d5141cc22444524cacde1c9881a5cf7fdf39 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 16 Aug 2025 14:33:08 -0400 Subject: [PATCH 090/231] [Flaky CI] Increase timeout tolerance for test_mp_crash_detection+test_default_mm_lora_chat_completions (#23028) Signed-off-by: mgoin Signed-off-by: Duncan Moss --- tests/entrypoints/openai/test_default_mm_loras.py | 3 ++- tests/mq_llm_engine/test_error_handling.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/openai/test_default_mm_loras.py b/tests/entrypoints/openai/test_default_mm_loras.py index 372e9b1fecd4..b9c466a6fbeb 100644 --- a/tests/entrypoints/openai/test_default_mm_loras.py +++ b/tests/entrypoints/openai/test_default_mm_loras.py @@ -48,7 +48,8 @@ def multimodal_server(): # noqa: F811 f"{{\"audio\": \"{AUDIO_LORA_PATH}\"}}", ] - with RemoteOpenAIServer(MULTIMODAL_MODEL_NAME, args) as remote_server: + with RemoteOpenAIServer(MULTIMODAL_MODEL_NAME, args, + max_wait_seconds=480) as remote_server: yield remote_server diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py index 3feee01dadf7..77e3732cd06c 100644 --- a/tests/mq_llm_engine/test_error_handling.py +++ b/tests/mq_llm_engine/test_error_handling.py @@ -255,8 +255,8 @@ def mock_init(): pass end = time.perf_counter() - assert end - start < 60, ( - "Expected vLLM to gracefully shutdown in <60s " + assert end - start < 100, ( + "Expected vLLM to gracefully shutdown in <100s " "if there is an error in the startup.") From ed52e5340ec8cdc31514b52c1d9a6efc2f068879 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 16 Aug 2025 15:38:21 -0400 Subject: [PATCH 091/231] [Kernel/Quant] Remove AQLM (#22943) Signed-off-by: mgoin Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Signed-off-by: Duncan Moss --- .../scripts/hardware_ci/run-amd-test.sh | 1 - CMakeLists.txt | 1 - benchmarks/kernels/benchmark_aqlm.py | 345 ---------- csrc/ops.h | 9 - csrc/quantization/aqlm/gemm_kernels.cu | 597 ------------------ csrc/torch_bindings.cpp | 15 - .../quantization/supported_hardware.md | 1 - docs/mkdocs/hooks/generate_examples.py | 1 - examples/offline_inference/basic/README.md | 14 - tests/compile/test_full_graph.py | 4 - tests/kernels/quantization/test_aqlm.py | 40 -- tests/models/quantization/test_aqlm.py | 68 -- vllm/_custom_ops.py | 41 -- vllm/model_executor/layers/linear.py | 18 - .../layers/quantization/__init__.py | 3 - .../layers/quantization/aqlm.py | 376 ----------- 16 files changed, 1534 deletions(-) delete mode 100644 benchmarks/kernels/benchmark_aqlm.py delete mode 100644 csrc/quantization/aqlm/gemm_kernels.cu delete mode 100644 tests/kernels/quantization/test_aqlm.py delete mode 100644 tests/models/quantization/test_aqlm.py delete mode 100644 vllm/model_executor/layers/quantization/aqlm.py diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 5e5a532cb57d..df0bae0c9cbf 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -121,7 +121,6 @@ fi if [[ $commands == *" kernels/quantization"* ]]; then commands="${commands} \ --ignore=kernels/quantization/test_int8_quant.py \ - --ignore=kernels/quantization/test_aqlm.py \ --ignore=kernels/quantization/test_machete_mm.py \ 
--ignore=kernels/quantization/test_block_fp8.py \ --ignore=kernels/quantization/test_block_int8.py \ diff --git a/CMakeLists.txt b/CMakeLists.txt index cda1ffc795d1..34386d670ac7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -286,7 +286,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") FetchContent_MakeAvailable(cutlass) list(APPEND VLLM_EXT_SRC - "csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu" "csrc/permute_cols.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py deleted file mode 100644 index 42de062b08e4..000000000000 --- a/benchmarks/kernels/benchmark_aqlm.py +++ /dev/null @@ -1,345 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import os -import sys -from typing import Optional - -import torch -import torch.nn.functional as F - -from vllm import _custom_ops as ops -from vllm.model_executor.layers.quantization.aqlm import ( - dequantize_weight, - generic_dequantize_gemm, - get_int_dtype, - optimized_dequantize_gemm, -) -from vllm.utils import FlexibleArgumentParser - -os.environ["CUDA_VISIBLE_DEVICES"] = "0" - - -def torch_mult( - # [..., in_features] - input: torch.Tensor, - weights: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, -) -> torch.Tensor: - output = F.linear(input, weights) - return output - - -def dequant_out_scale( - # [..., in_features] - input: torch.Tensor, - # [num_out_groups, num_in_groups, num_codebooks] - codes: torch.IntTensor, - # [num_codebooks, codebook_size, out_group_size, in_group_size] - codebooks: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, - output_partition_sizes: torch.IntTensor, - bias: Optional[torch.Tensor], -) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - - if bias is None: - output = F.linear(input, weights, bias) - orig_shape = output.shape - flattened_output = output.view(-1, output.size(-1)) - f_scales = scales.view(-1, scales.shape[0]) - b_scales = f_scales.expand(flattened_output.shape[0], -1) - flattened_output *= b_scales - return flattened_output.view(orig_shape) - else: - b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1]) - weights *= b_scales - return F.linear(input, weights, bias) - - -def dequant_weight_scale( - # [..., in_features] - input: torch.Tensor, - # [num_out_groups, num_in_groups, num_codebooks] - codes: torch.IntTensor, - # [num_codebooks, codebook_size, out_group_size, in_group_size] - codebooks: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, - output_partition_sizes: torch.IntTensor, - bias: Optional[torch.Tensor], -) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - - b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1]) - weights *= b_scales - return F.linear(input, weights, bias) - - -def dequant_no_scale( - # [..., in_features] - input: torch.Tensor, - # [num_out_groups, num_in_groups, num_codebooks] - codes: torch.IntTensor, - # [num_codebooks, codebook_size, out_group_size, in_group_size] - codebooks: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, - output_partition_sizes: torch.IntTensor, - bias: Optional[torch.Tensor], -) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - - return F.linear(input, weights, bias) - - -# Compare the optimized 1x16 and 2x8 cuda 
decompression/dequant kernels against -# the generic pytorch version. -# Just visual comparison. -def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None: - n = int(parts.sum().item()) - - device = torch.device("cuda:0") - - code_range = (1 << bits) // 2 - ingroups = 8 - - codes = torch.randint( - -code_range, - code_range, - size=(n, k // ingroups, nbooks), - dtype=get_int_dtype(bits), - device=device, - ) - - codebooks = torch.randn( - size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), - dtype=torch.float16, - device=device, - ) - - count = 0 - for index in range(16): - for i in range(8): - for book in range(nbooks): - codebooks[book, index, 0, i] = count * (10**book) - count += 1 - - print("codes shape", codes.shape) - - for i in range(16): - for book in range(nbooks): - codes[0, i, book] = i - codes[0, -i, book] = i - - weights = dequantize_weight(codes, codebooks, None) - weights2 = ops.aqlm_dequant(codes, codebooks, parts) - - print("weights shape:", weights.shape) - print("weights2 shape:", weights2.shape) - - print("weights are:", weights) - print("weights2 are:", weights2) - - print("first 128 weights are", weights[0, 0:128].to(torch.int32)) - print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32)) - - print("last 128 weights are", weights[0, -128:]) - print("last 128 weights2 are:", weights2[0, -128:]) - - -def main(): - parser = FlexibleArgumentParser(description="Benchmark aqlm performance.") - - # Add arguments - parser.add_argument( - "--nbooks", type=int, default=1, help="Number of codebooks (default: 1)" - ) - parser.add_argument( - "--bits", - type=int, - default=16, - help="Number of bits per code element (default: 16)", - ) - parser.add_argument( - "--test", - type=bool, - default=False, - help="Run the decompression/dequant tester rather than benchmarking " - "(default: False)", - ) - - # Parse the arguments - args = parser.parse_args() - - # Extract values - nbooks = args.nbooks - bits = args.bits - - if args.test: - dequant_test(4096, torch.tensor((4096,)), nbooks, bits) - return - - # Otherwise, benchmark. - methods = [ - ops.aqlm_gemm, - dequant_out_scale, - generic_dequantize_gemm, - optimized_dequantize_gemm, - dequant_weight_scale, - torch_mult, - dequant_no_scale, - ] - - filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv" - print(f"writing benchmarks to file {filename}") - with open(filename, "w") as f: - sys.stdout = f - - print("m | k | n | n parts", end="") - for method in methods: - print(f" | {method.__name__.replace('_', ' ')} (µs)", end="") - print("") - - # These are reasonable prefill sizes. - ksandpartions = ( - (4096, (4096, 4096, 4096)), - (4096, (4096,)), - (4096, (11008, 11008)), - (11008, (4096,)), - ) - - # reasonable ranges for m. - for m in [ - 1, - 2, - 4, - 8, - 10, - 12, - 14, - 16, - 24, - 32, - 48, - 52, - 56, - 64, - 96, - 112, - 128, - 256, - 512, - 1024, - 1536, - 2048, - 3072, - 4096, - ]: - print(f"{m}", file=sys.__stdout__) - for ksp in ksandpartions: - run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits, methods) - - sys.stdout = sys.__stdout__ - - -def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, methods): - # I didn't see visible improvements from increasing these, but feel free :) - num_warmup_trials = 1 - num_trials = 1 - - num_calls = 100 - - # warmup. 
- for method in methods: - for _ in range(num_warmup_trials): - run_timing( - num_calls=num_calls, - m=m, - k=k, - parts=parts, - nbooks=nbooks, - bits=bits, - method=method, - ) - - n = parts.sum().item() - print(f"{m} | {k} | {n} | {parts.tolist()}", end="") - - for method in methods: - best_time_us = 1e20 - for _ in range(num_trials): - kernel_dur_ms = run_timing( - num_calls=num_calls, - m=m, - k=k, - parts=parts, - nbooks=nbooks, - bits=bits, - method=method, - ) - - kernel_dur_us = 1000 * kernel_dur_ms - - if kernel_dur_us < best_time_us: - best_time_us = kernel_dur_us - - print(f" | {kernel_dur_us:.0f}", end="") - - print("") - - -def run_timing( - num_calls: int, m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, method -) -> float: - n = int(parts.sum().item()) - - device = torch.device("cuda:0") - - input = torch.randn((1, m, k), dtype=torch.float16, device=device) - - code_range = (1 << bits) // 2 - ingroups = 8 - - codes = torch.randint( - -code_range, - code_range, - size=(n, k // ingroups, nbooks), - dtype=get_int_dtype(bits), - device=device, - ) - - codebooks = torch.randn( - size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), - dtype=torch.float16, - device=device, - ) - - scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device) - - # for comparison to just a pytorch mult. - weights = torch.randn((n, k), dtype=torch.float16, device=device) - - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - - start_event.record() - - if method is torch_mult: - for i in range(num_calls): - torch_mult(input, weights, scales) - else: - for i in range(num_calls): - method(input, codes, codebooks, scales, parts, None) - - end_event.record() - end_event.synchronize() - - dur_ms = start_event.elapsed_time(end_event) / num_calls - return dur_ms - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/csrc/ops.h b/csrc/ops.h index 3e29f0a973dd..6e39758f16a1 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -154,15 +154,6 @@ void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope, torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor); #ifndef USE_ROCM -torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const std::vector& codebook_partition_sizes, - const std::optional& bias); - -torch::Tensor aqlm_dequant( - const torch::Tensor& codes, const torch::Tensor& codebooks, - const std::vector& codebook_partition_sizes); torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel, torch::Tensor _scaling_factors, torch::Tensor _zeros, diff --git a/csrc/quantization/aqlm/gemm_kernels.cu b/csrc/quantization/aqlm/gemm_kernels.cu deleted file mode 100644 index 79cd2c610b3c..000000000000 --- a/csrc/quantization/aqlm/gemm_kernels.cu +++ /dev/null @@ -1,597 +0,0 @@ -/* - * Modified by Neural Magic - * Adapted from https://github.com/Vahe1994/AQLM - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace vllm { -namespace aqlm { - -__global__ void Code1x16MatVec( - const int4* __restrict__ A, const int4* __restrict__ B, - int4* __restrict__ C, const int4* __restrict__ codebook, const int prob_m, - const int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each - // codebook, at most 3 long. - const int codebook_stride // as int4. -) { - int a_gl_stride = prob_k / 8 / 8; - int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - bool pred = a_gl_rd < prob_m; - - if (pred) { - // advance to the correct codebook, this easy because we only multiply one - // column of the codebook. - auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) { - codebook += codebook_stride; - ++codebook_size; - } - } - - int b_gl_rd = 0; - int c_gl_wr = a_gl_rd; - a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; - int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; - - __shared__ int4 sh_b[32 * 9]; - float res = 0; - - int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32); - while (iters--) { - // We pad shared memory to avoid bank conflicts during reads - __syncthreads(); - for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) { - if (b_gl_rd + i < prob_k / 8) sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; - } - __syncthreads(); - b_gl_rd += 32 * 8; - - int b_sh_rd = 9 * (threadIdx.x % 32); - if (pred && a_gl_rd < a_gl_end) { - const uint16_t* enc = reinterpret_cast(&A[a_gl_rd]); -#pragma unroll - for (int i = 0; i < 8; i++) { - uint32_t dec[4]; - // We bypass the L1 cache to avoid massive amounts of memory streaming - // that doesn't actually help us; this brings > 2x speedup. - asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) - : "l"((void*)&codebook[enc[i]])); - half2* a = reinterpret_cast(&dec); - half2* b = reinterpret_cast(&sh_b[b_sh_rd]); - half2 res2 = {}; -#pragma unroll - for (int j = 0; j < 4; j++) res2 = __hfma2(a[j], b[j], res2); - res += __half2float(res2.x) + __half2float(res2.y); - b_sh_rd++; - } - a_gl_rd += 32; - } - } - - if (pred) { -#pragma unroll - for (int i = 16; i > 0; i /= 2) res += __shfl_down_sync(0xffffffff, res, i); - if (threadIdx.x % 32 == 0) - reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res); - } -} - -__global__ void Code2x8MatVec( - const int4* __restrict__ A, const int4* __restrict__ B, - int4* __restrict__ C, const int4* __restrict__ codebook, int prob_m, - int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each - // codebook, at most 3 long. - const int codebook_stride // as int4. - -) { - int a_gl_stride = prob_k / 8 / 8; - int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - bool pred = a_gl_rd < prob_m; - - if (pred) { - // advance to the correct codebook, this easy because we only multiply one - // column of the codebook. 
- auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) { - codebook += codebook_stride; - ++codebook_size; - } - } - - int b_gl_rd = 0; - int c_gl_wr = a_gl_rd; - a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; - int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; - int lane = threadIdx.x % 8; - - extern __shared__ int4 sh[]; - int4* sh_b = sh; - int4* sh_code = sh_b + 32 * 9; - int4* sh_code0 = sh_code; - int4* sh_code1 = sh_code + 256 * 8; - - for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) { - int4 dec = codebook[i]; -#pragma unroll - for (int j = 0; j < 8; j++) sh_code[8 * i + (j + lane) % 8] = dec; - } - __syncthreads(); - - float res = 0; - - int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32); - while (iters--) { - // We pad shared memory to avoid bank conflicts during reads - __syncthreads(); - for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) { - if (b_gl_rd + i < prob_k / 8) sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; - } - __syncthreads(); - b_gl_rd += 32 * 8; - - int b_sh_rd = 9 * (threadIdx.x % 32); - if (pred && a_gl_rd < a_gl_end) { - const uint8_t* enc = reinterpret_cast(&A[a_gl_rd]); -#pragma unroll - for (int i = 0; i < 8; i++) { - half2* a0 = - reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); - half2* a1 = - reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); - half2* b = reinterpret_cast(&sh_b[b_sh_rd]); - half2 res2 = {}; -#pragma unroll - for (int j = 0; j < 4; j++) - res2 = __hfma2(__hadd2(a0[j], a1[j]), b[j], res2); - res += __half2float(res2.x) + __half2float(res2.y); - b_sh_rd++; - } - a_gl_rd += 32; - } - } - - if (pred) { -#pragma unroll - for (int i = 16; i > 0; i /= 2) res += __shfl_down_sync(0xffffffff, res, i); - if (threadIdx.x % 32 == 0) - reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res); - } -} - -__global__ void Code1x16Dequant( - const int4* __restrict__ A, int4* __restrict__ C, - const int4* __restrict__ codebook, int prob_m, int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each - // codebook, at most 3 long, sums to m. - const int codebook_stride // as int4 -) { - int a_gl_stride = prob_k / 8 / 8; - int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - bool pred = a_gl_rd < prob_m; - - if (pred) { - // advance to the correct codebook, this easy because we only multiply one - // column of the codebook. - auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) { - codebook += codebook_stride; - ++codebook_size; - } - } - - a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; - int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; - - int c_gl_stride = prob_k / 8; - int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8; - - int iters = (prob_k / 8 - 1) / (8 * 32) + 1; - while (iters--) { - if (pred && a_gl_rd < a_gl_end) { - const uint16_t* enc = reinterpret_cast(&A[a_gl_rd]); -#pragma unroll - for (int i = 0; i < 8; i++) { - int4 chunk; - auto dec = reinterpret_cast(&chunk); - // We bypass the L1 cache to avoid massive amounts of memory streaming - // that doesn't actually help us; this brings > 2x speedup. 
- asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) - : "l"((void*)&codebook[enc[i]])); - - C[a_gl_rd * 8 + i] = chunk; - } - } - a_gl_rd += 32; - } -} - -__global__ void Code2x8Dequant( - const int4* __restrict__ A, int4* __restrict__ C, - const int4* __restrict__ codebook, int prob_m, int prob_k, - const int4 - codebook_a_sizes, // cumulative sizes of A spanning each codebook, at - // most 3 long, corresponds to cols. - const int codebook_stride // as int4 -) { - int a_gl_stride = prob_k / 8 / 8; - int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - bool pred = a_gl_rd < prob_m; - - if (pred) { - // advance to the correct codebook, this easy because we only multiply one - // column of the codebook. - auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) { - codebook += codebook_stride; - ++codebook_size; - } - } - - a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; - int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; - int lane = threadIdx.x % 8; - - int c_gl_stride = prob_k / 8; - int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8; - - extern __shared__ int4 sh[]; - int4* sh_code = sh; - int4* sh_code0 = sh_code; - int4* sh_code1 = sh_code + 256 * 8; - - for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) { - int4 dec = codebook[i]; -#pragma unroll - for (int j = 0; j < 8; j++) sh_code[8 * i + (j + lane) % 8] = dec; - } - __syncthreads(); - - int iters = (prob_k / 8 - 1) / (8 * 32) + 1; - while (iters--) { - if (pred && a_gl_rd < a_gl_end) { - const uint8_t* enc = reinterpret_cast(&A[a_gl_rd]); -#pragma unroll - for (int i = 0; i < 8; i++) { - int4 chunk; - half2* a0 = - reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); - half2* a1 = - reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); -#pragma unroll - for (int j = 0; j < 4; j++) - reinterpret_cast(&chunk)[j] = __hadd2(a0[j], a1[j]); - C[a_gl_rd * 8 + i] = chunk; - } - } - a_gl_rd += 32; - } -} - -inline int ceildiv(int a, int b) { return (a + b - 1) / b; } - -const int THREAD_M = 16; - -void code1x16_matvec_cuda(const void* __restrict__ A, - const void* __restrict__ B, void* __restrict__ C, - const void* __restrict__ codebook, int prob_m, - int prob_k, const int4 codebook_a_sizes, - const int codebook_stride) { - int sms; - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); - int waves = 0; - int thread_m; - do { - waves++; - thread_m = ceildiv(prob_m, waves * sms); - } while (thread_m > THREAD_M); - - int blocks = ceildiv(prob_m, thread_m); - int threads = 32 * thread_m; - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - Code1x16MatVec<<>>( - (const int4*)A, (const int4*)B, (int4*)C, (const int4*)codebook, prob_m, - prob_k, codebook_a_sizes, codebook_stride); -} - -void code2x8_matvec_cuda(const void* __restrict__ A, const void* __restrict__ B, - void* __restrict__ C, - const void* __restrict__ codebook, int prob_m, - int prob_k, const int4 codebook_a_sizes, - const int codebook_stride) { - int sms; - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); - int waves = 0; - int thread_m; - do { - waves++; - thread_m = ceildiv(prob_m, waves * sms); - } while (thread_m > THREAD_M); - - int blocks = ceildiv(prob_m, thread_m); - int threads = 32 * thread_m; - int shared = 16 * (2 * 256 * 8 + 32 * 9); - cudaFuncSetAttribute(Code2x8MatVec, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared); 
- cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - Code2x8MatVec<<>>( - (const int4*)A, (const int4*)B, (int4*)C, (const int4*)codebook, prob_m, - prob_k, codebook_a_sizes, codebook_stride); -} - -void code1x16_dequant_cuda( - const void* __restrict__ A, void* __restrict__ C, - const void* __restrict__ codebook, int prob_m, int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each - // codebook, at most 3 long. - const int codebook_stride // as int4. -) { - int sms; - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); - int waves = 0; - int thread_m; - do { - waves++; - thread_m = ceildiv(prob_m, waves * sms); - } while (thread_m > THREAD_M); - - int blocks = ceildiv(prob_m, thread_m); - int threads = 32 * thread_m; - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - Code1x16Dequant<<>>( - (const int4*)A, (int4*)C, (const int4*)codebook, prob_m, prob_k, - codebook_a_sizes, // cumulative sizes of A spanning each codebook, at - // most 3 long. - codebook_stride // as int4. - ); -} - -// Dequantizes the code and codebook into weights. -void code2x8_dequant_cuda( - const void* __restrict__ A, void* __restrict__ C, - const void* __restrict__ codebook, int prob_m, int prob_k, - const int4 - codebook_a_sizes, // cumulative sizes of A spanning each codebook, at - // most 3 long, corresponds to cols. - const int codebook_stride // as int4 -) { - int sms; - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); - int waves = 0; - int thread_m; - do { - waves++; - thread_m = ceildiv(prob_m, waves * sms); - } while (thread_m > THREAD_M); - - int blocks = ceildiv(prob_m, thread_m); - int threads = 32 * thread_m; - int shared = 16 * (2 * 256 * 8 + 32 * 9); - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - - cudaFuncSetAttribute(Code2x8Dequant, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared); - Code2x8Dequant<<>>( - (const int4*)A, (int4*)C, (const int4*)codebook, prob_m, prob_k, - codebook_a_sizes, codebook_stride); -} - -int codebook_stride(const torch::Tensor& codebooks) { - return codebooks.stride(0) * codebooks.element_size() / sizeof(int4); -} - -void code1x16_matvec( - const torch::Tensor& A, const torch::Tensor& B, torch::Tensor& C, - const torch::Tensor& codebook, - const int4 codebook_a_sizes // cumulative sizes of A spanning each - // codebook, at most 3 long. 
-) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); - int prob_m = C.size(0); - int prob_k = B.size(0); - - code1x16_matvec_cuda(A.data_ptr(), B.data_ptr(), C.data_ptr(), - codebook.data_ptr(), prob_m, prob_k, codebook_a_sizes, - codebook_stride(codebook)); -} - -torch::Tensor code1x16_matmat(const torch::Tensor& input, - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const int4 codebook_a_sizes, - const std::optional& bias) { - auto input_sizes = input.sizes(); - auto out_features = codes.size(0) * codebooks.size(2); - auto flat_input = input.reshape({-1, input.size(-1)}); - auto flat_output = torch::empty( - {flat_input.size(0), out_features}, - torch::TensorOptions().dtype(input.dtype()).device(input.device())); - - for (int i = 0; i < flat_input.size(0); ++i) { - auto input_vec = flat_input.index({i}); - auto output_vec = flat_output.index({i}); - code1x16_matvec(codes.squeeze(2), input_vec, output_vec, codebooks, - codebook_a_sizes); - } - flat_output *= scales.flatten().unsqueeze(0); - - if (bias.has_value()) { - flat_output += bias->unsqueeze(0); - } - - auto output_sizes = input_sizes.vec(); - output_sizes.pop_back(); - output_sizes.push_back(-1); - auto output = flat_output.reshape(output_sizes); - return output; -} - -void code2x8_matvec(const torch::Tensor& A, const torch::Tensor& B, - torch::Tensor& C, const torch::Tensor& codebook, - const int4 codebook_a_sizes) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); - int prob_m = C.size(0); - int prob_k = B.size(0); - code2x8_matvec_cuda(A.data_ptr(), B.data_ptr(), C.data_ptr(), - codebook.data_ptr(), prob_m, prob_k, codebook_a_sizes, - 2 * codebook_stride(codebook)); -} - -torch::Tensor code2x8_matmat(const torch::Tensor& input, - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const int4 codebook_a_sizes, - const std::optional& bias) { - auto input_sizes = input.sizes(); - auto out_features = codes.size(0) * codebooks.size(2); - auto flat_input = input.reshape({-1, input.size(-1)}); - auto flat_output = torch::empty( - {flat_input.size(0), out_features}, - torch::TensorOptions().dtype(input.dtype()).device(input.device())); - - for (int i = 0; i < flat_input.size(0); ++i) { - auto input_vec = flat_input.index({i}); - auto output_vec = flat_output.index({i}); - code2x8_matvec(codes.squeeze(2), input_vec, output_vec, codebooks, - codebook_a_sizes); - } - flat_output *= scales.flatten().unsqueeze(0); - if (bias.has_value()) { - flat_output += bias->unsqueeze(0); - } - - auto output_sizes = input_sizes.vec(); - output_sizes.pop_back(); - output_sizes.push_back(-1); - auto output = flat_output.reshape(output_sizes); - return output; -} - -// Accumulate the partition sizes. -int4 accumulate_sizes(const std::vector& codebook_partition_sizes) { - int4 cumulative_sizes; - auto cumulative_size = &cumulative_sizes.x; - size_t i = 0; - int last = 0; - assert(codebook_partition_sizes.size() <= 4); - for (; i < codebook_partition_sizes.size(); ++i, ++cumulative_size) { - *cumulative_size = codebook_partition_sizes[i] + last; - last = *cumulative_size; - } - // fill in the rest with unreachable. 
- for (; i < 4; ++i, ++cumulative_size) { - *cumulative_size = last * 10; - } - return cumulative_sizes; -} - -} // namespace aqlm -} // namespace vllm - -torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const std::vector& codebook_partition_sizes, - const std::optional& bias) { - int4 cumulative_sizes = - vllm::aqlm::accumulate_sizes(codebook_partition_sizes); - - int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(); - int const entries = codebooks.size(1); - - if (nbooks == 1 && entries == (1 << 16)) { - return vllm::aqlm::code1x16_matmat(input, codes, codebooks, scales, - cumulative_sizes, bias); - } - if (nbooks == 2 && entries == (1 << 8)) { - return vllm::aqlm::code2x8_matmat(input, codes, codebooks, scales, - cumulative_sizes, bias); - } - - TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, - " entries is not currently supported.") - return {}; -} - -torch::Tensor aqlm_dequant( - const torch::Tensor& codes, const torch::Tensor& codebooks, - const std::vector& codebook_partition_sizes) { - int4 cumulative_sizes = - vllm::aqlm::accumulate_sizes(codebook_partition_sizes); - - int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(); - int const entries = codebooks.size(1); - - const at::cuda::OptionalCUDAGuard device_guard(device_of(codes)); - int rows = codes.size(1); - int cols = codes.size(0); - - auto in_features = codes.size(1) * 8; - auto out_features = codes.size(0); - - assert(out_features == std::accumulate(codebook_partition_sizes.begin(), - codebook_partition_sizes.end(), 0)); - - auto weights = torch::empty({out_features, in_features}, - torch::TensorOptions() - .dtype(codebooks.dtype()) - .device(codebooks.device())); - - if (nbooks == 1 && entries == (1 << 16)) { - vllm::aqlm::code1x16_dequant_cuda(codes.data_ptr(), weights.data_ptr(), - codebooks.data_ptr(), out_features, - in_features, cumulative_sizes, - vllm::aqlm::codebook_stride(codebooks)); - - // if you wanted to flip to scaling the weights, (though it's 30%-ish slower - // and not consistent with gemv implementation.) weights *= - // scales.index({"...", 0, 0}); - - return weights; - } - - if (nbooks == 2 && entries == (1 << 8)) { - vllm::aqlm::code2x8_dequant_cuda(codes.data_ptr(), weights.data_ptr(), - codebooks.data_ptr(), out_features, - in_features, cumulative_sizes, - vllm::aqlm::codebook_stride(codebooks)); - - // if you wanted to flip to scaling the weights, (though it's 30%-ish slower - // and not consistent with gemv implementation) weights *= - // scales.index({"...", 0, 0}); - - return weights; - } - - TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, - " entries is not currently supported.") - return {}; -} diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index a547baec50d6..5fee106335d3 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -207,21 +207,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Quantization ops #ifndef USE_ROCM - // Quantized GEMM for AQLM. - ops.def( - "aqlm_gemm(Tensor input, Tensor codes, Tensor codebooks, " - "Tensor scales, int[] codebook_partition_sizes, Tensor? bias) " - "-> Tensor", - {stride_tag}); - ops.impl("aqlm_gemm", torch::kCUDA, &aqlm_gemm); - - // Decompression method for AQLM. 
- ops.def( - "aqlm_dequant(Tensor codes, Tensor codebooks, " - "int[] codebook_partition_sizes) -> Tensor", - {stride_tag}); - ops.impl("aqlm_dequant", torch::kCUDA, &aqlm_dequant); - // Quantized GEMM for AWQ. ops.def( "awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, " diff --git a/docs/features/quantization/supported_hardware.md b/docs/features/quantization/supported_hardware.md index f53e69ecc611..06264d08b56a 100644 --- a/docs/features/quantization/supported_hardware.md +++ b/docs/features/quantization/supported_hardware.md @@ -17,7 +17,6 @@ th { | INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | | FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ❌ | | BitBLAS (GPTQ) | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| AQLM | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py index 6b4c5b31075f..1e8b848db46d 100644 --- a/docs/mkdocs/hooks/generate_examples.py +++ b/docs/mkdocs/hooks/generate_examples.py @@ -24,7 +24,6 @@ def fix_case(text: str) -> str: "llm": "LLM", "mae": "MAE", "tpu": "TPU", - "aqlm": "AQLM", "gguf": "GGUF", "lora": "LoRA", "rlhf": "RLHF", diff --git a/examples/offline_inference/basic/README.md b/examples/offline_inference/basic/README.md index 0a2bd6e2b70b..cbb3116e9741 100644 --- a/examples/offline_inference/basic/README.md +++ b/examples/offline_inference/basic/README.md @@ -52,20 +52,6 @@ Try it yourself with the following argument: ### Quantization -#### AQLM - -vLLM supports models that are quantized using AQLM. - -Try one yourself by passing one of the following models to the `--model` argument: - -- `ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf` -- `ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf` -- `ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf` -- `ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf` -- `BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf` - -> Some of these models are likely to be too large for a single GPU. You can split them across multiple GPUs by setting `--tensor-parallel-size` to the number of required GPUs. - #### GGUF vLLM supports models that are quantized using GGUF. diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 72f962ed7484..a2fc6ffeb8b2 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -31,10 +31,6 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None): ] if all: - if is_quant_method_supported("aqlm"): - TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", { - "quantization": "aqlm" - })) # TODO: figure out why this fails. 
if False and is_quant_method_supported("gguf"): # noqa: SIM223 diff --git a/tests/kernels/quantization/test_aqlm.py b/tests/kernels/quantization/test_aqlm.py deleted file mode 100644 index 427db3e60292..000000000000 --- a/tests/kernels/quantization/test_aqlm.py +++ /dev/null @@ -1,40 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -from tests.kernels.utils import opcheck -from vllm import _custom_ops as ops # noqa: F401 - - -def test_aqlm_dequant_opcheck(): - codes = torch.randint(-32768, - 32767, (22016, 512, 1), - device='cuda', - dtype=torch.int16) - codebooks = torch.rand((2, 65536, 1, 8), - device='cuda', - dtype=torch.float16) - codebook_partition_sizes = [11008, 11008] - - opcheck(torch.ops._C.aqlm_dequant, - (codes, codebooks, codebook_partition_sizes)) - - -def test_aqlm_gemm_opcheck(): - input = torch.rand((4, 4096), device='cuda', dtype=torch.float16) - codes = torch.randint(-32768, - 32767, (12288, 512, 1), - device='cuda', - dtype=torch.int16) - codebooks = torch.rand((3, 65536, 1, 8), - device='cuda', - dtype=torch.float16) - scales = torch.rand((12288, 1, 1, 1), device='cuda', dtype=torch.float16) - codebook_partition_sizes = [4096, 4096, 4096] - bias = None - - opcheck(torch.ops._C.aqlm_gemm, - (input, codes, codebooks, scales, codebook_partition_sizes, None)) - opcheck(torch.ops._C.aqlm_gemm, - (input, codes, codebooks, scales, codebook_partition_sizes, bias)) diff --git a/tests/models/quantization/test_aqlm.py b/tests/models/quantization/test_aqlm.py deleted file mode 100644 index de6851e2fc28..000000000000 --- a/tests/models/quantization/test_aqlm.py +++ /dev/null @@ -1,68 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - -from tests.quantization.utils import is_quant_method_supported -from vllm.platforms import current_platform - -# These ground truth generations were generated using `transformers==4.38.1 -# aqlm==1.1.0 torch==2.2.0` -# and the below code: -# ```python -# from transformers import AutoTokenizer, AutoModelForCausalLM -# model_id = "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf" -# quantized_model = AutoModelForCausalLM.from_pretrained(model_id, -# torch_dtype="auto", device_map="cuda").cuda() -# tokenizer = AutoTokenizer.from_pretrained(model_id) -# outputs = [] -# for prompt in example_prompts: -# input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda") -# hf_outputs = quantized_model.generate(input_ids, max_new_tokens=32) -# outputs.append(tokenizer.decode(hf_outputs[0][input_ids.shape[1]:])) -# print(outputs) -# ``` -ground_truth_generations = [ - '\n### Features\n\n- **High-throughput**: v', - 'The major milestones in the development of artificial intelligence from ' - '195', - 'Compare and contrast artificial intelligence with human intelligence in ' - 'terms of processing information. The', - 'Explain the difference between supervised and unsupervised learning.' - '\nExplain', - 'Write a short story about a robot that dreams for the first time. 
The', - 'Analyze the impact of the COVID-19 pandemic on global economic', - 'The Mona Lisa is a painting by Leonardo da Vinci, and it', - 'The early bird catches the worm.\nThe early bird catches the' -] - - -@pytest.mark.skipif(not is_quant_method_supported("aqlm") - or current_platform.is_rocm() - or not current_platform.is_cuda(), - reason="AQLM is not supported on this GPU type.") -@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"]) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [16]) -@pytest.mark.parametrize("num_logprobs", [1]) -def test_models( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: - - with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - - # loop through the prompts to compare against the ground truth generations - for prompt_idx in range(len(example_prompts)): - vllm_output_ids, vllm_output_str, vllm_logprobs = vllm_outputs[ - prompt_idx] - - print("Prompt: ", repr(example_prompts[prompt_idx])) - print("Reference output:", repr(ground_truth_generations[prompt_idx])) - print("Output output: ", repr(vllm_output_str)) - assert vllm_output_str == ground_truth_generations[prompt_idx] diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index a318637c5aeb..0d556053f898 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -476,32 +476,6 @@ def _awq_gemm_fake(input: torch.Tensor, qweight: torch.Tensor, dtype=input.dtype, device=input.device).sum(0) - @register_fake("_C::aqlm_gemm") - def _aqlm_gemm_fake(input: torch.Tensor, codes: torch.Tensor, - codebooks: torch.Tensor, scales: torch.Tensor, - codebook_partition_sizes: list[int], - bias: Optional[torch.Tensor]) -> torch.Tensor: - out_features = codes.size(0) * codebooks.size(2) - flat_input = input.reshape((-1, input.size(-1))) - flat_output = torch.empty((flat_input.size(0), out_features), - dtype=input.dtype, - device=input.device) - - output_sizes = list(input.shape) - output_sizes.pop() - output_sizes.append(-1) - return flat_output.reshape(tuple(output_sizes)) - - @register_fake("_C::aqlm_dequant") - def _aqlm_dequant_fake( - codes: torch.Tensor, codebooks: torch.Tensor, - codebook_partition_sizes: list[int]) -> torch.Tensor: - in_features = codes.size(1) * 8 - out_features = codes.size(0) - return torch.empty((out_features, in_features), - dtype=codebooks.dtype, - device=codebooks.device) - @register_fake("_C::machete_mm") def machete_mm_fake( a: torch.Tensor, @@ -957,21 +931,6 @@ def cutlass_fp4_moe_mm(out_tensors: torch.Tensor, a_tensors: torch.Tensor, sf_offsets) -# aqlm -def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor, - codebooks: torch.Tensor, scales: torch.Tensor, - codebook_partition_sizes: list[int], - bias: Optional[torch.Tensor]) -> torch.Tensor: - return torch.ops._C.aqlm_gemm(input, codes, codebooks, scales, - codebook_partition_sizes, bias) - - -def aqlm_dequant(codes: torch.Tensor, codebooks: torch.Tensor, - codebook_partition_sizes: list[int]) -> torch.Tensor: - return torch.ops._C.aqlm_dequant(codes, codebooks, - codebook_partition_sizes) - - # gptq_marlin def gptq_marlin_repack(b_q_weight: torch.Tensor, perm: torch.Tensor, size_k: int, size_n: int, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 75391c51f775..671ad9eed234 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py 
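
For context on the `register_fake` wrappers deleted from `_custom_ops.py` above: a compiled custom op generally needs a paired fake (meta) implementation that only computes output shapes so `torch.compile` can trace through it. A minimal sketch of that pattern with a toy op (illustrative names, requires PyTorch >= 2.4, not the vLLM `_C` extension):

import torch


@torch.library.custom_op("toylib::row_scale", mutates_args=())
def row_scale(x: torch.Tensor, s: torch.Tensor) -> torch.Tensor:
    # Eager implementation; in vLLM this role is played by a CUDA kernel.
    return x * s.unsqueeze(-1)


@row_scale.register_fake
def _(x: torch.Tensor, s: torch.Tensor) -> torch.Tensor:
    # Shape/dtype propagation only -- no real computation happens here.
    return torch.empty_like(x)


out = row_scale(torch.randn(4, 8), torch.randn(4))
assert out.shape == (4, 8)
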
@@ -692,8 +692,6 @@ def weight_loader(self, param_data = param.data output_dim = getattr(param, "output_dim", None) - # Special case for AQLM codebooks. - is_metadata = getattr(param, "is_metadata", False) # Special case for per-tensor scale to load scalar into fused array. needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False) @@ -781,13 +779,6 @@ def weight_loader(self, if not is_sharded_weight: loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - # Special case for AQLM codebooks. - elif is_metadata: - # metadata indicates fixed size concatenated along dim 0 - shard_size = loaded_weight.shape[0] - shard_offset = loaded_shard_id * shard_size - param_data = param_data.narrow(0, shard_offset, shard_size) - # Special case for per-tensor scales in fused case. elif needs_scalar_to_array: param_data, loaded_weight = adjust_scalar_to_fused_array( @@ -1081,8 +1072,6 @@ def weight_loader(self, param_data = param.data output_dim = getattr(param, "output_dim", None) - # Special case for AQLM codebooks. - is_metadata = getattr(param, "is_metadata", False) # Special case for per-tensor scales in fused case. needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False) @@ -1204,13 +1193,6 @@ def weight_loader(self, loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - # Special case for for AQLM codebooks. - elif is_metadata: - # metadata indicates fixed size concatenated along dim 0 - shard_size = loaded_weight.shape[0] - shard_index = ["q", "k", "v"].index(loaded_shard_id) - param_data = param_data.narrow(0, shard_index * shard_size, - shard_size) # Special case for per-tensor scales in fused case. elif needs_scalar_to_array: param_data, loaded_weight = adjust_scalar_to_fused_array( diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 8d63027e1863..a4c2671225f5 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -7,7 +7,6 @@ QuantizationConfig) QuantizationMethods = Literal[ - "aqlm", "awq", "deepspeedfp", "tpu_int8", @@ -88,7 +87,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: # lazy import to avoid triggering `torch.compile` too early from vllm.model_executor.layers.quantization.quark.quark import QuarkConfig - from .aqlm import AQLMConfig from .auto_round import AutoRoundConfig from .awq import AWQConfig from .awq_marlin import AWQMarlinConfig @@ -120,7 +118,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: from .tpu_int8 import Int8TpuConfig method_to_config: dict[str, type[QuantizationConfig]] = { - "aqlm": AQLMConfig, "awq": AWQConfig, "deepspeedfp": DeepSpeedFPConfig, "tpu_int8": Int8TpuConfig, diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py deleted file mode 100644 index 2ea8c5dc5113..000000000000 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ /dev/null @@ -1,376 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Supports AQLM compression, see https://github.com/Vahe1994/AQLM -# and https://arxiv.org/pdf/2401.06118.pdf - -import math -from typing import Any, Optional - -import torch -import torch.nn.functional as F -from torch.nn.parameter import Parameter - -from vllm import _custom_ops as ops -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase -from 
vllm.model_executor.layers.quantization import QuantizationMethods -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) -from vllm.model_executor.utils import set_weight_attrs - - -def get_int_dtype(nbits: int) -> torch.dtype: - if nbits <= 8: - return torch.int8 - if nbits <= 16: - return torch.int16 - if nbits <= 32: - return torch.int32 - if nbits <= 64: - return torch.int64 - raise ValueError(f"No dtype available for {nbits}-bit codebooks") - - -@torch.inference_mode() -def unpack_int_data(data: torch.IntTensor, nbits: int) -> torch.IntTensor: - return data.to(torch.int64) % (2**nbits) - - -def dequantize_weight(codes: torch.Tensor, - codebooks: torch.Tensor, - scales: Optional[torch.Tensor] = None) -> torch.Tensor: - """ - Decode float weights from quantization codes. Differentiable. - :param codes: tensor of integer quantization codes, shape - [*dims, num_out_groups, num_in_groups, num_codebooks] - :param codebooks: tensor of vectors for each quantization code, - [num_codebooks, codebook_size, out_group_size, in_group_size] - :param scales: weight will be multiplied by this factor, must be - broadcastble with - [*dims, out_groups, num_in_groups, out_group_size, in_group_size] - :return: reconstructed weight tensor of shape - [*dims, num_in_groups*group_size] - """ - num_out_groups, num_in_groups, num_codebooks = codes.shape[-3:] - num_codebooks, codebook_size, out_group_size, in_group_size = \ - codebooks.shape - out_features = num_out_groups * out_group_size - in_features = num_in_groups * in_group_size - codebook_offsets = torch.arange( - 0, num_codebooks * codebook_size, codebook_size, - device=codes.device) # shape: [num_codebooks] - reconstructed_weight_flat = F.embedding_bag( - codes.flatten(0, -2) + codebook_offsets, - codebooks.flatten(0, 1).flatten(-2, -1), - mode="sum" - ) # [prod(dims) * num_out_groups * num_in_groups, out_group_size - # * in_group_size] - - reconstructed_weight_groupwise = reconstructed_weight_flat.view( - list(codes.shape[:-3]) + - [num_out_groups, num_in_groups, out_group_size, in_group_size]) - if scales is not None: - reconstructed_weight_groupwise = reconstructed_weight_groupwise.mul( - scales) - return reconstructed_weight_groupwise.swapaxes( - -3, -2).reshape(list(codes.shape[:-3]) + [out_features, in_features]) - - -def dequantize_gemm( - input: torch.Tensor, # [..., in_features] - codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] - codebooks: torch. - Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] - scales: torch.Tensor, # [num_out_groups, 1, 1, 1] - bias: Optional[torch.Tensor], -) -> torch.Tensor: - dequantized_weight = dequantize_weight( - unpack_int_data(codes, codebooks.shape[1].bit_length() - 1), - codebooks, - scales, - ) - return F.linear(input, dequantized_weight, bias) - - -# Generic dequantization, slow but flexible. -def generic_dequantize_gemm( - input: torch.Tensor, # [..., in_features] - codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] - codebooks: torch. - Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] - scales: torch.Tensor, # [num_out_groups, 1, 1, 1] - output_partition_sizes: list[int], - bias: Optional[torch.Tensor], -) -> torch.Tensor: - output_shape = input.shape[:-1] + (scales.shape[0], ) - output = torch.empty(output_shape, dtype=input.dtype, device=input.device) - num_outputs = len(output_partition_sizes) - - # break the inputs and codebooks apart then combine the outputs. 
- # Surprisingly (to me) this is faster than doing 3 de-quants and 1 big - # multiply at the end. - num_codebooks = codebooks.shape[0] // num_outputs - assert (scales.shape[0] == codes.shape[0]) - assert (sum(output_partition_sizes) == scales.shape[0]) - output_offset = 0 - codebooks_offset = 0 - for output_size in output_partition_sizes: - shard_output = dequantize_gemm( - input, codes.narrow(0, output_offset, output_size), - codebooks.narrow(0, codebooks_offset, num_codebooks), - scales.narrow(0, output_offset, output_size), None - if bias is None else bias.narrow(0, output_offset, output_size)) - - output_slice = output.narrow(-1, output_offset, output_size) - assert (output_slice.shape == shard_output.shape) - output_slice.copy_(shard_output) - output_offset += output_size - codebooks_offset += num_codebooks - return output - - -# Optimized dequnantize/decompression kernels, supports 1x16 and 2x8 -# at 6 and 9 times faster than the generic version above, respectively. -def optimized_dequantize_gemm( - input: torch.Tensor, # [..., in_features] - codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] - codebooks: torch. - Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] - scales: torch.Tensor, # [num_out_groups, 1, 1, 1] - output_partition_sizes: list[int], - bias: Optional[torch.Tensor], -) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - - if bias is None: - # scaling the output is fastest, so we do that when possible. - output = F.linear(input, weights, bias) - orig_shape = output.shape - flattened_output = output.view(-1, output.size(-1)) - f_scales = scales.view(-1, scales.shape[0]) - b_scales = f_scales.expand(flattened_output.shape[0], -1) - flattened_output *= b_scales - return output.view(orig_shape) - else: - b_scales = scales.view(scales.shape[:-3] + (-1, )).expand( - -1, weights.shape[1]) - weights *= b_scales - return F.linear(input, weights, bias) - - -class AQLMConfig(QuantizationConfig): - """Config class for AQLM. - - Reference: https://github.com/Vahe1994/AQLM - """ - - def __init__( - self, - in_group_size: int, - nbits_per_codebook: int, - num_codebooks: int, - out_group_size: int, - ) -> None: - super().__init__() - self.in_group_size = in_group_size - self.nbits_per_codebook = nbits_per_codebook - self.num_codebooks = num_codebooks - self.out_group_size = out_group_size - - # out_group_size > 1 is untested, and probably won't work as-is. - assert (self.out_group_size == 1) - self.pack_factor = (self.in_group_size * self.out_group_size) - - def __repr__(self) -> str: - return (f"AQLMConfig(in_group_size={self.in_group_size}, " - f"nbits_per_codebook={self.nbits_per_codebook}, " - f"num_codebooks={self.num_codebooks}, " - f"out_group_size={self.out_group_size})") - - @classmethod - def get_name(cls) -> QuantizationMethods: - return "aqlm" - - @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: - return [torch.half] - - @classmethod - def get_min_capability(cls) -> int: - return 60 - - @classmethod - def get_config_filenames(cls) -> list[str]: - return [] # no extra configs. 
- - @classmethod - def from_config(cls, config: dict[str, Any]) -> "AQLMConfig": - in_group_size = cls.get_from_keys(config, ["in_group_size"]) - nbits_per_codebook = cls.get_from_keys(config, ["nbits_per_codebook"]) - num_code_books = cls.get_from_keys(config, ["num_codebooks"]) - out_group_size = cls.get_from_keys(config, ["out_group_size"]) - return cls(in_group_size, nbits_per_codebook, num_code_books, - out_group_size) - - def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["AQLMLinearMethod"]: - if isinstance(layer, LinearBase): - return AQLMLinearMethod(self) - return None - - -class AQLMLinearMethod(LinearMethodBase): - """Linear method for AQLM. - - Args: - quant_config: The AQLM quantization config. - """ - - def __init__(self, quant_config: AQLMConfig): - self.quant_config = quant_config - - def create_weights(self, layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: list[int], input_size: int, - output_size: int, params_dtype: torch.dtype, - **extra_weight_attrs): - del output_size # Unused. - del input_size # Unused. - - if params_dtype != torch.half: - raise ValueError("Only half is currently supported by aqlm") - if input_size_per_partition % self.quant_config.in_group_size != 0: - raise ValueError( - "The input size is not aligned with the quantized " - "weight shape. This can be caused by too large " - "tensor parallel size.") - - output_size_per_partition = sum(output_partition_sizes) - if output_size_per_partition % self.quant_config.out_group_size != 0: - raise ValueError( - "The output size is not aligned with the quantized " - "weight shape. This can be caused by too large " - "tensor parallel size.") - - codes = Parameter( - torch.empty( - # There could actually be two pack factors, one along input and - # one along output, but we don't currently support - # out_group_size, and only the one along output needs to be - # marked with "packed_dim" in order for QKVLinear to work. 
- output_size_per_partition, - input_size_per_partition // self.quant_config.pack_factor, - self.quant_config.num_codebooks, - dtype=get_int_dtype(self.quant_config.nbits_per_codebook), - ), - requires_grad=False, - ) - - set_weight_attrs( - codes, - { - "input_dim": 1, - "output_dim": 0, - "packed_dim": 1, - "pack_factor": self.quant_config.pack_factor, - }, - ) - - codebooks = Parameter( - torch.empty( - self.quant_config.num_codebooks * len(output_partition_sizes), - 2**self.quant_config.nbits_per_codebook, - self.quant_config.out_group_size, - self.quant_config.in_group_size, - dtype=params_dtype, - ), - requires_grad=False, - ) - set_weight_attrs( - codebooks, - { - # metadata indicates fixed size concatenated along dim 0 - "is_metadata": True, - "output_partition_sizes": output_partition_sizes - }, - ) - - scales = Parameter( - torch.empty( - ( - output_size_per_partition // - self.quant_config.out_group_size, - 1, - 1, - 1, - ), - dtype=params_dtype, - ), - requires_grad=False, - ) - set_weight_attrs( - scales, - { - "output_dim": 0, - "packed_dim": 0, - "pack_factor": self.quant_config.out_group_size - }, - ) - - layer.register_parameter("codes", codes) - set_weight_attrs(codes, extra_weight_attrs) - layer.register_parameter("codebooks", codebooks) - set_weight_attrs(codebooks, extra_weight_attrs) - layer.register_parameter("scales", scales) - set_weight_attrs(scales, extra_weight_attrs) - - def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - codebooks = layer.codebooks - codes = layer.codes - scales = layer.scales - output_partition_sizes = getattr(codebooks, "output_partition_sizes", - []) - - nbooks = codes.shape[2] - ingroups = codebooks.shape[3] - outgroups = codebooks.shape[2] - bits = codebooks.shape[1] - - # We support these formats with dedicated gemm and decompression - # kernels. 
- if ingroups == 8 and outgroups == 1 and ( - (bits == 256 and nbooks == 2) or (bits == 65536 and nbooks == 1)): - - # thresholds determined by timings on an A6000, one GPU - use_gemv = math.prod(x.shape[:-1]) <= 6 - - return ops.aqlm_gemm( - x, - codes, - codebooks, - scales, - output_partition_sizes, - bias, - ) if use_gemv else optimized_dequantize_gemm( - x, - codes, - codebooks, - scales, - output_partition_sizes, - bias, - ) - - # fall back all unoptimized formats - return generic_dequantize_gemm( - x, - codes, - codebooks, - scales, - output_partition_sizes, - bias, - ) From 9931ad7afc4e074c0d8c8a415ed8f9503262e3f2 Mon Sep 17 00:00:00 2001 From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Date: Sat, 16 Aug 2025 15:59:17 -0400 Subject: [PATCH 092/231] [V1] Logits processors extensibility (#19912) Signed-off-by: Andrew Feldman Signed-off-by: Andrew Feldman Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Nick Hill Co-authored-by: Nick Hill Co-authored-by: Andrew Feldman Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Duncan Moss --- .buildkite/test-pipeline.yaml | 1 + .../offline_inference/logits_processor.py | 147 +++++++++ tests/utils.py | 79 ++++- tests/v1/logits_processors/__init__.py | 0 .../test_correctness.py} | 24 +- .../logits_processors/test_custom_offline.py | 237 ++++++++++++++ .../logits_processors/test_custom_online.py | 180 +++++++++++ tests/v1/logits_processors/utils.py | 127 ++++++++ tests/v1/sample/test_rejection_sampler.py | 4 +- tests/v1/sample/test_sampler.py | 4 +- tests/v1/worker/test_gpu_input_batch.py | 4 +- vllm/config/__init__.py | 5 + vllm/engine/arg_utils.py | 8 + vllm/entrypoints/llm.py | 4 + vllm/utils/__init__.py | 2 +- vllm/v1/sample/logits_processor/__init__.py | 185 +++++++++++ .../builtin.py} | 296 ++---------------- vllm/v1/sample/logits_processor/interface.py | 86 +++++ vllm/v1/sample/logits_processor/state.py | 149 +++++++++ vllm/v1/sample/metadata.py | 4 +- vllm/v1/worker/gpu_input_batch.py | 91 ++++-- vllm/v1/worker/gpu_model_runner.py | 11 +- 22 files changed, 1313 insertions(+), 335 deletions(-) create mode 100644 examples/offline_inference/logits_processor.py create mode 100644 tests/v1/logits_processors/__init__.py rename tests/v1/{sample/test_logits_processors.py => logits_processors/test_correctness.py} (97%) create mode 100644 tests/v1/logits_processors/test_custom_offline.py create mode 100644 tests/v1/logits_processors/test_custom_online.py create mode 100644 tests/v1/logits_processors/utils.py create mode 100644 vllm/v1/sample/logits_processor/__init__.py rename vllm/v1/sample/{logits_processor.py => logits_processor/builtin.py} (54%) create mode 100644 vllm/v1/sample/logits_processor/interface.py create mode 100644 vllm/v1/sample/logits_processor/state.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 87296a08e207..4fc885785492 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -253,6 +253,7 @@ steps: - pytest -v -s v1/engine - pytest -v -s v1/entrypoints - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors - pytest -v -s v1/worker - pytest -v -s v1/structured_output - pytest -v -s v1/spec_decode diff --git a/examples/offline_inference/logits_processor.py b/examples/offline_inference/logits_processor.py new file mode 100644 index 000000000000..7ef20efa7d28 --- /dev/null +++ b/examples/offline_inference/logits_processor.py @@ -0,0 +1,147 @@ +# SPDX-License-Identifier: 
Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""This example demonstrates instantiating vLLM with a custom logits processor +class object. + +For a basic example of implementing a custom logits processor, see +the `DummyLogitsProcessor` implementation in `vllm/test_utils.py`. + +For testing purposes, a dummy logits processor is employed which, if +`target_token` is passed as a keyword argument to `SamplingParams.extra_args`, +will mask out all tokens except `target_token`. + +A batch is constructed with `temperature=0.0` and 50% of requests specifying +`target_token`, and for these requests - and *only* these requests - we +expect the `target_token` to be decoded in each step, yielding an output +similar to that shown below: + +Generated Outputs: +------------------------------------------------------------ +Prompt: 'Hello, my name is' +Output: " ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '" +------------------------------------------------------------ +Prompt: 'The president of the United States is' +Output: " not a racist. He is a racist.\nHe's a racist because he" +------------------------------------------------------------ +Prompt: 'The capital of France is' +Output: ' also also also also also also also also also also also also also + also also also' +------------------------------------------------------------ +Prompt: 'The future of AI is' +Output: ' in the hands of the people.\n\nThe future of AI is in the' +------------------------------------------------------------ +""" + +from typing import Optional + +import torch + +from vllm import LLM, SamplingParams +from vllm.config import VllmConfig +from vllm.v1.sample.logits_processor import ( + BatchUpdate, + LogitsProcessor, + MoveDirectionality, +) + + +# Hypothetical custom logits processor +class DummyLogitsProcessor(LogitsProcessor): + """Fake logit processor to support unit testing and examples""" + + def __init__( + self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool + ): + self.req_info: dict[int, SamplingParams] = {} + + def is_argmax_invariant(self) -> bool: + """Never impacts greedy sampling""" + return False + + def update_state(self, batch_update: Optional[BatchUpdate]): + if not batch_update: + return + + # Process added requests. + for index, params, _, _ in batch_update.added: + assert params is not None + if params.extra_args and ( + target_token := params.extra_args.get("target_token") + ): + self.req_info[index] = target_token + + if self.req_info: + # Process removed requests. + for index in batch_update.removed: + self.req_info.pop(index, None) + + # Process moved requests, unidirectional move (a->b) and swap + # (a<->b) + for adx, bdx, direct in batch_update.moved: + a_val = self.req_info.pop(adx, None) + b_val = self.req_info.pop(bdx, None) + if a_val is not None: + self.req_info[bdx] = a_val + if direct == MoveDirectionality.SWAP and b_val is not None: + self.req_info[adx] = b_val + + def apply(self, logits: torch.Tensor) -> torch.Tensor: + if not self.req_info: + return logits + + # Save target values before modification + rows_list = list(self.req_info.keys()) + cols = torch.tensor( + [self.req_info[i] for i in rows_list], + dtype=torch.long, + device=logits.device, + ) + rows = torch.tensor(rows_list, dtype=torch.long, device=logits.device) + values_to_keep = logits[rows, cols].clone() + + # Mask all but target tokens + logits[rows] = float("-inf") + logits[rows, cols] = values_to_keep + + return logits + + +# Sample prompts. 
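+# (The prompt text below is arbitrary: for requests that pass "target_token"
+# through SamplingParams.extra_args, the dummy processor above masks every
+# other token, so their decoded output does not depend on the prompt, as the
+# sample outputs in the module docstring illustrate.)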
+prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a mixture of requests which do and don't utilize the dummy logitproc +sampling_params_list = [ + SamplingParams(temperature=0.0, extra_args={"target_token": 128}), + SamplingParams(temperature=0.0), + SamplingParams(temperature=0.0, extra_args={"target_token": 67}), + SamplingParams(temperature=0.0), +] + + +def main(): + # Create an LLM. + llm = LLM( + model="facebook/opt-125m", + logits_processors=[DummyLogitsProcessor], + ) + # Generate texts from the prompts. + # The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params_list) + # Print the outputs. + print("\nGenerated Outputs:\n" + "-" * 60) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}") + print(f"Output: {generated_text!r}") + print("-" * 60) + + +if __name__ == "__main__": + main() diff --git a/tests/utils.py b/tests/utils.py index 18fcde949160..e98707fb4447 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -13,6 +13,7 @@ import time import warnings from contextlib import contextmanager, suppress +from multiprocessing import Process from pathlib import Path from typing import Any, Callable, Literal, Optional, Union @@ -76,6 +77,23 @@ def _nvml(): class RemoteOpenAIServer: DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key + def _start_server(self, model: str, vllm_serve_args: list[str], + env_dict: Optional[dict[str, str]]) -> None: + """Subclasses override this method to customize server process launch + """ + env = os.environ.copy() + # the current process might initialize cuda, + # to be safe, we should use spawn method + env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' + if env_dict is not None: + env.update(env_dict) + self.proc: subprocess.Popen = subprocess.Popen( + ["vllm", "serve", model, *vllm_serve_args], + env=env, + stdout=sys.stdout, + stderr=sys.stderr, + ) + def __init__(self, model: str, vllm_serve_args: list[str], @@ -128,18 +146,7 @@ def __init__(self, model_loader = get_model_loader(load_config) model_loader.download_model(model_config) - env = os.environ.copy() - # the current process might initialize cuda, - # to be safe, we should use spawn method - env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' - if env_dict is not None: - env.update(env_dict) - self.proc = subprocess.Popen( - ["vllm", "serve", model, *vllm_serve_args], - env=env, - stdout=sys.stdout, - stderr=sys.stderr, - ) + self._start_server(model, vllm_serve_args, env_dict) max_wait_seconds = max_wait_seconds or 240 self._wait_for_server(url=self.url_for("health"), timeout=max_wait_seconds) @@ -155,6 +162,10 @@ def __exit__(self, exc_type, exc_value, traceback): # force kill if needed self.proc.kill() + def _poll(self) -> Optional[int]: + """Subclasses override this method to customize process polling""" + return self.proc.poll() + def _wait_for_server(self, *, url: str, timeout: float): # run health check start = time.time() @@ -169,7 +180,7 @@ def _wait_for_server(self, *, url: str, timeout: float): # which means the server is not ready yet. # the stack trace is not useful, so we suppress it # by using `raise from None`. 
- result = self.proc.poll() + result = self._poll() if result is not None and result != 0: raise RuntimeError("Server exited unexpectedly.") from None @@ -205,6 +216,48 @@ def get_async_client(self, **kwargs): **kwargs) +class RemoteOpenAIServerCustom(RemoteOpenAIServer): + """Launch test server with custom child process""" + + def _start_server(self, model: str, vllm_serve_args: list[str], + env_dict: Optional[dict[str, str]]) -> None: + self.proc: Process = Process( + target=self.child_process_fxn, + args=(env_dict, model, + vllm_serve_args)) # type: ignore[assignment] + self.proc.start() + + def __init__(self, + model: str, + vllm_serve_args: list[str], + child_process_fxn: Callable[ + [Optional[dict[str, str]], str, list[str]], None], + *, + env_dict: Optional[dict[str, str]] = None, + seed: Optional[int] = 0, + auto_port: bool = True, + max_wait_seconds: Optional[float] = None) -> None: + """Store custom child process function then invoke superclass + constructor which will indirectly launch it.""" + self.child_process_fxn = child_process_fxn + super().__init__(model=model, + vllm_serve_args=vllm_serve_args, + env_dict=env_dict, + seed=seed, + auto_port=auto_port, + max_wait_seconds=max_wait_seconds) + + def _poll(self) -> Optional[int]: + return self.proc.exitcode + + def __exit__(self, exc_type, exc_value, traceback): + self.proc.terminate() + self.proc.join(8) + if self.proc.is_alive(): + # force kill if needed + self.proc.kill() + + def _test_completion( client: openai.OpenAI, model: str, diff --git a/tests/v1/logits_processors/__init__.py b/tests/v1/logits_processors/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/v1/sample/test_logits_processors.py b/tests/v1/logits_processors/test_correctness.py similarity index 97% rename from tests/v1/sample/test_logits_processors.py rename to tests/v1/logits_processors/test_correctness.py index 84ee3b0392b4..43caef79b02f 100644 --- a/tests/v1/sample/test_logits_processors.py +++ b/tests/v1/logits_processors/test_correctness.py @@ -9,11 +9,13 @@ import pytest import torch +from tests.utils import create_new_process_for_each_test from tests.v1.sample.utils import (LogitsprocsTestFakes, create_fake_logits, create_penalty_tensor, create_prompt_tokens_tensor, fake_apply_logitsprocs, fake_update_logitsprocs_state) +from vllm.config import VllmConfig from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams from vllm.utils import is_pin_memory_available @@ -24,7 +26,7 @@ MinPLogitsProcessor, MinTokensLogitsProcessor, MoveDirectionality, - init_builtin_logitsprocs) + build_logitsprocs) # yapf: enable from vllm.v1.sample.metadata import SamplingMetadata @@ -53,6 +55,7 @@ class LogitsProcsRequestParams: workload_index: int logitproc_type: LogitprocType # Logitproc enabled, specified by str id out_tokens: list[int] # Output tokens required for min tokens test + prompt_tokens: list[int] # Dummy prompt tokens placeholder params: SamplingParams # Settings customized for logitproc def __init__(self, workload_index: int, logitproc_type: LogitprocType): @@ -63,6 +66,7 @@ def __init__(self, workload_index: int, logitproc_type: LogitprocType): # don't matter *for these tests* so use 0 as a dummy value self.out_tokens = ([0] * (MIN_TOKENS_LEN_THRESHOLD * random.randint(0, 2))) + self.prompt_tokens = [] self.params = _sampling_params_from_logitproc(logitproc_type) def __str__(self): @@ -88,11 +92,12 @@ def _generate_fake_sampling_metadata( vocab_size, size=np.random.randint( 1, 
MAX_NUM_PROMPT_TOKENS)).tolist()) - logitsprocs = init_builtin_logitsprocs( - pin_memory_available=PIN_MEMORY_AVAILABLE, - max_num_reqs=MAX_NUM_REQS + 1, - device=device) - + logitsprocs = build_logitsprocs( + vllm_config=VllmConfig(), + device=device, + is_pin_memory=PIN_MEMORY_AVAILABLE, + is_pooling_model=False, + ) fake_sampling_metadata = SamplingMetadata( temperature=torch.full((batch_size, ), 0.0), all_greedy=True, @@ -462,7 +467,8 @@ def _generate_fake_step_update( # Replace as many removed requests as possible with added requests add_remove_idx = batch_update_builder.pop_removed() batch_update_builder.added.append( - (add_remove_idx, add_req_params.params, add_req_params.out_tokens)) + (add_remove_idx, add_req_params.params, + add_req_params.prompt_tokens, add_req_params.out_tokens)) persistent_batch[add_remove_idx] = add_req_params # Append remaining added requests to end of batch @@ -470,7 +476,8 @@ def _generate_fake_step_update( num_step_add_replace):(wdx + num_step_add)] batch_update_builder.added.extend([ - (adx + batch_size, add_req_params.params, add_req_params.out_tokens) + (adx + batch_size, add_req_params.params, add_req_params.prompt_tokens, + add_req_params.out_tokens) for adx, add_req_params in enumerate(add_reqs_append) ]) persistent_batch.extend(add_reqs_append) @@ -561,6 +568,7 @@ def _assert_valid( step_idx=step_idx) +@create_new_process_for_each_test() @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("reqs_per_logitproc", [REQS_PER_LOGITPROC]) @pytest.mark.parametrize("logitsprocs_under_test", _get_test_cases()) diff --git a/tests/v1/logits_processors/test_custom_offline.py b/tests/v1/logits_processors/test_custom_offline.py new file mode 100644 index 000000000000..a7fde1990f7e --- /dev/null +++ b/tests/v1/logits_processors/test_custom_offline.py @@ -0,0 +1,237 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import random +import sys +from typing import Union + +import pytest + +from tests.utils import create_new_process_for_each_test +# yapf: disable +from tests.v1.logits_processors.utils import (DUMMY_LOGITPROC_ARG, + DUMMY_LOGITPROC_FQCN, + DUMMY_LOGITPROC_MODULE, + MAX_TOKENS, MODEL_NAME, + POOLING_MODEL_NAME, TEMP_GREEDY, + CustomLogitprocSource, + DummyLogitsProcessor, + dummy_module) +from tests.v1.logits_processors.utils import entry_points as fake_entry_points +from tests.v1.logits_processors.utils import prompts +# yapf: enable +from vllm import LLM, SamplingParams +from vllm.v1.sample.logits_processor import (STR_POOLING_REJECTS_LOGITSPROCS, + LogitsProcessor) + +# Create a mixture of requests which do and don't utilize the dummy logitproc +sampling_params_list = [ + SamplingParams(temperature=TEMP_GREEDY, + max_tokens=MAX_TOKENS, + extra_args={DUMMY_LOGITPROC_ARG: 128}), + SamplingParams(temperature=TEMP_GREEDY, max_tokens=MAX_TOKENS), + SamplingParams(temperature=TEMP_GREEDY, + max_tokens=MAX_TOKENS, + extra_args={DUMMY_LOGITPROC_ARG: 67}), + SamplingParams(temperature=TEMP_GREEDY, max_tokens=MAX_TOKENS), +] + + +def _run_test(kwargs: dict, logitproc_loaded: bool) -> None: + """Compare `LLM` instance initialized with specified `kwargs` against + reference `LLM` instance. + + Two scenarios: + 1. Server has loaded dummy logitproc; test that requests which specify + dummy logitproc arg value behave as if logitproc is operating (output + token value should repeat), while requests that don't specify dummy + logitproc arg value should match reference `LLM` output. 
+ 2. Server has *not* loaded dummy logitproc; test that all requests + behave as if logitproc is *not* operating (output matches reference + `LLM` output.) + + Args: + kwargs: `LLM` constructor kwargs + logitproc_loaded: server has loaded dummy logitproc if True + """ + + # Create a vLLM instance and load custom logitproc + llm_logitproc = LLM( + model=MODEL_NAME, + gpu_memory_utilization=0.1, + **kwargs, + ) + + # Create a reference vLLM instance without custom logitproc + llm_ref = LLM(model=MODEL_NAME, gpu_memory_utilization=0.1) + + # Run inference with logitproc loaded + outputs_logitproc = llm_logitproc.generate(prompts, sampling_params_list) + + # Reference run + outputs_ref = llm_ref.generate(prompts, sampling_params_list) + + # Validate outputs + for bdx, (out_lp, out_ref, params) in enumerate( + zip(outputs_logitproc, outputs_ref, sampling_params_list)): + lp_toks = out_lp.outputs[0].token_ids + if logitproc_loaded and params.extra_args: + # This request exercises custom logitproc; validate that logitproc + # forces `target_token` to be decoded in each step + target_token = params.extra_args[DUMMY_LOGITPROC_ARG] + if not all(x == target_token for x in lp_toks): + raise AssertionError( + f"Request {bdx} generated {lp_toks}, shoud all be " + f"{target_token}") + else: + # This request does not exercise custom logitproc (or custom + # logitproc is not enabled on this server); validate against + # reference result + ref_toks = out_ref.outputs[0].token_ids + if lp_toks != ref_toks: + raise AssertionError( + f"Request {bdx} generated {lp_toks}, should match " + f"{ref_toks}") + + +@create_new_process_for_each_test() +@pytest.mark.parametrize("logitproc_source", list(CustomLogitprocSource)) +def test_custom_logitsprocs(monkeypatch, + logitproc_source: CustomLogitprocSource): + """Test offline Python interface for passing custom logitsprocs + + Construct an `LLM` instance which loads a custom logitproc that has a + well-defined behavior (mask out all tokens except one `target_token`) + + Construct a reference `LLM` instance with no custom logitproc + + Pass in a batch of requests, 50% of which pass a `target_token` value + in through `SamplingParams.extra_args`, 50% of which do not. 
+ + Validate that + * Requests which do not activate the custom logitproc, yield the same + results for both `LLM` instances + * Requests which activate the custom logitproc, only output `target_token` + + Test four scenarios, corresponding to `logitproc_source` value + * No logitsprocs loaded - test that generated tokens match reference `LLM` + instance output + * Logitproc passed in via {entrypoint, class object, fully-qualified class + name (FQCN)} - test that dummy logitproc is utilized correctly when + provided via any of these three possible sources + + Args: + monkeypatch: for setting env vars + logitproc_source: what source (entrypoint, fully-qualified class name + (FQCN), class object, or None) the user pulls the + logitproc from + """ + + # Test that logitproc info is passed to workers + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1") + random.seed(40) + + # Choose LLM args based on logitproc source + if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_NONE: + # Scenario: the server does not load any custom logitproc + # Every other scenario is a different way of loading a custom logitproc + _run_test({}, logitproc_loaded=False) + return + + if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT: + # Scenario: vLLM loads a logitproc from a preconfigured entrypoint + # To that end, mock a dummy logitproc entrypoint + import importlib.metadata + importlib.metadata.entry_points = fake_entry_points # type: ignore + + # fork is required for workers to see entrypoint patch + monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "fork") + _run_test({}, logitproc_loaded=True) + return + + kwargs: dict[str, list[Union[str, type[LogitsProcessor]]]] = {} + if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_FQCN: + # Scenario: load logitproc based on fully-qualified class name (FQCN) + # Inject dummy module which defines logitproc + sys.modules[DUMMY_LOGITPROC_MODULE] = dummy_module + kwargs["logits_processors"] = [DUMMY_LOGITPROC_FQCN] + elif logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_CLASS: + # Scenario: load logitproc from provided class object + kwargs["logits_processors"] = [DummyLogitsProcessor] + + _run_test(kwargs, logitproc_loaded=True) + + +@create_new_process_for_each_test() +@pytest.mark.parametrize("logitproc_source", [ + CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT, + CustomLogitprocSource.LOGITPROC_SOURCE_FQCN, + CustomLogitprocSource.LOGITPROC_SOURCE_CLASS, +]) +def test_pooling_rejects_custom_logitsprocs( + monkeypatch, logitproc_source: CustomLogitprocSource): + """Validate that vLLM engine initialization properly rejects custom + logitsprocs when the model is a pooling model. + + Use `LLM` entrypoint. We expect `LLM` initialization to fail before the + logitproc is actually loaded. 
+ + Scenario 1: + * Mock a logitproc entrypoint + * Validate that `LLM` does not load the logitproc + + Scenario 2: + * Pass custom logitproc to `LLM` constructor + * Scenario 2a: via FQCN + * Scenario 2b: via class object + * Validate that initialization fails with appropriate exception + + Args: + monkeypatch: used to set environment variables + logitproc_source: what source (entrypoint, fully-qualified class name + (FQCN), or class object) the user pulls the + logitproc from + """ + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") + random.seed(40) + + if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT: + # Scenario: vLLM loads a pooling model and ignores a logitproc that is + # available at a preconfigured entrypoint + + # Patch in dummy logitproc entrypoint + import importlib.metadata + importlib.metadata.entry_points = fake_entry_points # type: ignore + + # fork is required for entrypoint patch to be visible to workers, + # although they should ignore the entrypoint patch anyway + monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "fork") + + llm = LLM( + runner="pooling", + model=POOLING_MODEL_NAME, + gpu_memory_utilization=0.1, + ) + # Require that no logitsprocs have been loaded + assert sum([ + 1 for _ in llm.llm_engine.model_executor.driver_worker.worker. + model_runner.input_batch.logitsprocs.all + ]) == 0 + return + + kwargs: dict[str, list[Union[str, type[LogitsProcessor]]]] = {} + if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_FQCN: + # Scenario: load logitproc based on fully-qualified class name (FQCN) + kwargs["logits_processors"] = [DUMMY_LOGITPROC_FQCN] + elif logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_CLASS: + # Scenario: load logitproc from provided class object + kwargs["logits_processors"] = [DummyLogitsProcessor] + + with pytest.raises(ValueError, match=STR_POOLING_REJECTS_LOGITSPROCS): + # Require that loading a pooling model alongside the logitproc raises + # the appropriate exception. 
+ LLM( + runner="pooling", + model=POOLING_MODEL_NAME, + gpu_memory_utilization=0.1, + **kwargs, + ) diff --git a/tests/v1/logits_processors/test_custom_online.py b/tests/v1/logits_processors/test_custom_online.py new file mode 100644 index 000000000000..a01a479e5b24 --- /dev/null +++ b/tests/v1/logits_processors/test_custom_online.py @@ -0,0 +1,180 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os +import random +import sys +from typing import Any, Optional + +import openai +import pytest +import pytest_asyncio + +from tests.utils import (RemoteOpenAIServerCustom, + create_new_process_for_each_test) +# yapf: disable +from tests.v1.logits_processors.utils import (DUMMY_LOGITPROC_ARG, + DUMMY_LOGITPROC_FQCN, + DUMMY_LOGITPROC_MODULE, + MAX_TOKENS, MODEL_NAME, + TEMP_GREEDY, dummy_module) +from tests.v1.logits_processors.utils import entry_points as fake_entry_points +from tests.v1.logits_processors.utils import prompts + +# yapf: enable + + +def _server_with_logitproc_entrypoint( + env_dict: Optional[dict[str, str]], + model: str, + vllm_serve_args: list[str], +) -> None: + """Start vLLM server, inject dummy logitproc entrypoint""" + + # Patch `entry_points` to inject logitproc entrypoint + import importlib.metadata + importlib.metadata.entry_points = fake_entry_points # type: ignore + from vllm.entrypoints.cli import main + + # fork is required for workers to see entrypoint patch + os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = "fork" + if env_dict is not None: + os.environ.update(env_dict) + + # Emulate `vllm serve ` + sys.argv = ["vllm", "serve", model] + vllm_serve_args + main.main() + + +def _server_with_logitproc_module( + env_dict: Optional[dict[str, str]], + model: str, + vllm_serve_args: list[str], +) -> None: + """Start vLLM server, inject module with dummy logitproc""" + + # Patch `modules` to inject dummy logitproc module + from vllm.entrypoints.cli import main + sys.modules[DUMMY_LOGITPROC_MODULE] = dummy_module + + # fork is required for workers to see entrypoint patch + os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = "fork" + if env_dict is not None: + os.environ.update(env_dict) + + # Emulate `vllm serve ` + sys.argv = ["vllm", "serve", model] + vllm_serve_args + main.main() + + +@pytest.fixture(scope="module") +def default_server_args(): + return [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "2048", + "--max-num-seqs", + "128", + ] + + +@pytest.fixture(scope="function", + params=[[], ["--logits-processors", DUMMY_LOGITPROC_FQCN]]) +def server(default_server_args, request, monkeypatch): + """Consider two server configurations: + (1) --logits-processors cli arg specifies dummy logits processor via fully- + qualified class name (FQCN); patch in a dummy logits processor module + (2) No --logits-processors cli arg; patch in a dummy logits processor + entrypoint + """ + + # Test that logitproc info is passed to workers + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1") + + if request.param: + # Launch server, append FQCN argument, inject dummy logitproc module + args = default_server_args + request.param + _server_fxn = _server_with_logitproc_module + else: + # Launch server, inject dummy logitproc entrypoint + args = default_server_args + _server_fxn = _server_with_logitproc_entrypoint + + with RemoteOpenAIServerCustom(MODEL_NAME, args, + _server_fxn) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async 
def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +# General request argument values for these tests +api_keyword_args = { + # Greedy sampling ensures that requests which receive the `target_token` + # arg will decode it in every step + "temperature": TEMP_GREEDY, + # Since EOS will never be decoded (unless `target_token` is EOS) + "max_tokens": MAX_TOKENS, + # Return decoded token logprobs (as a way of getting token id) + "logprobs": 0, +} + + +@create_new_process_for_each_test() +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_custom_logitsprocs(client: openai.AsyncOpenAI, model_name: str): + """Test custom logitsprocs when starting OpenAI server from CLI + + Launch vLLM OpenAI-compatible server, configured to load a custom logitproc + that has a well-defined behavior (mask out all tokens except one + `target_token`). + + Pass in requests, 50% of which pass a `target_token` value + in through `extra_body["vllm_xargs"]`, 50% of which do not. + + Validate that requests which activate the custom logitproc, repeat the same + token + """ + + use_dummy_logitproc = True + for prompt in prompts: + # Build request arguments + request_keyword_args: dict[str, Any] = { + **api_keyword_args, + } + if use_dummy_logitproc: + # 50% of requests pass target_token custom arg + target_token = random.choice([128, 67]) + # For requests which activate the dummy logitproc, choose one of + # two `target_token` values which are known not to be EOS tokens + request_keyword_args["extra_body"] = { + "vllm_xargs": { + DUMMY_LOGITPROC_ARG: target_token + } + } + batch = await client.completions.create( + model=model_name, + prompt=prompt, + **request_keyword_args, + ) + + if use_dummy_logitproc: + # Only for requests which activate dummy logitproc - validate that + # output token is repeated + choices: openai.types.CompletionChoice = batch.choices + toks = choices[0].logprobs.tokens + if not all([x == toks[0] for x in toks]): + raise AssertionError( + f"Generated {toks} should all be {toks[0]}") + + # Alternate whether to activate dummy logitproc for each request + use_dummy_logitproc = not use_dummy_logitproc diff --git a/tests/v1/logits_processors/utils.py b/tests/v1/logits_processors/utils.py new file mode 100644 index 000000000000..c0bfc1a18fec --- /dev/null +++ b/tests/v1/logits_processors/utils.py @@ -0,0 +1,127 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import types +from enum import Enum, auto +from typing import Optional + +import torch + +from vllm.config import VllmConfig +from vllm.sampling_params import SamplingParams +from vllm.v1.sample.logits_processor import (LOGITSPROCS_GROUP, BatchUpdate, + LogitsProcessor, + MoveDirectionality) + +MODEL_NAME = "facebook/opt-125m" +POOLING_MODEL_NAME = "BAAI/bge-base-en-v1.5" +DUMMY_LOGITPROC_ARG = "target_token" +TEMP_GREEDY = 0.0 +MAX_TOKENS = 20 +DUMMY_LOGITPROC_ENTRYPOINT = "dummy_logitproc" +DUMMY_LOGITPROC_MODULE = "DummyModule" +DUMMY_LOGITPROC_FQCN = f"{DUMMY_LOGITPROC_MODULE}:DummyLogitsProcessor" + + +class CustomLogitprocSource(Enum): + """How to source a logitproc for testing purposes""" + LOGITPROC_SOURCE_NONE = auto() # No custom logitproc + LOGITPROC_SOURCE_ENTRYPOINT = auto() # Via entrypoint + LOGITPROC_SOURCE_FQCN = auto() # Via fully-qualified class name (FQCN) + LOGITPROC_SOURCE_CLASS = auto() # Via provided class object + + +# Sample prompts. 
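+# (Reused by both the offline and online custom-logitsprocs tests; those
+# tests assert only on the decoded token ids, never on the prompt text
+# itself.)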
+prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + + +class DummyLogitsProcessor(LogitsProcessor): + """Fake logit processor to support unit testing and examples""" + + def __init__(self, vllm_config: "VllmConfig", device: torch.device, + is_pin_memory: bool): + self.req_info: dict[int, SamplingParams] = {} + + def is_argmax_invariant(self) -> bool: + """Never impacts greedy sampling""" + return False + + def update_state(self, batch_update: Optional[BatchUpdate]): + if not batch_update: + return + + # Process added requests. + for index, params, _, _ in batch_update.added: + assert params is not None + if params.extra_args and (target_token := + params.extra_args.get("target_token")): + self.req_info[index] = target_token + + if self.req_info: + # Process removed requests. + for index in batch_update.removed: + self.req_info.pop(index, None) + + # Process moved requests, unidirectional move (a->b) and swap + # (a<->b) + for adx, bdx, direct in batch_update.moved: + a_val = self.req_info.pop(adx, None) + b_val = self.req_info.pop(bdx, None) + if a_val is not None: + self.req_info[bdx] = a_val + if direct == MoveDirectionality.SWAP and b_val is not None: + self.req_info[adx] = b_val + + def apply(self, logits: torch.Tensor) -> torch.Tensor: + if not self.req_info: + return logits + + # Save target values before modification + rows_list = list(self.req_info.keys()) + cols = torch.tensor([self.req_info[i] for i in rows_list], + dtype=torch.long, + device=logits.device) + rows = torch.tensor(rows_list, dtype=torch.long, device=logits.device) + values_to_keep = logits[rows, cols].clone() + + # Mask all but target tokens + logits[rows] = float('-inf') + logits[rows, cols] = values_to_keep + + return logits + + +"""Dummy module with dummy logitproc class""" +dummy_module = types.ModuleType(DUMMY_LOGITPROC_MODULE) +dummy_module.DummyLogitsProcessor = DummyLogitsProcessor # type: ignore + + +class EntryPoint: + """Dummy entrypoint class for logitsprocs testing""" + + def __init__(self): + self.name = DUMMY_LOGITPROC_ENTRYPOINT + self.value = DUMMY_LOGITPROC_FQCN + + def load(self): + return DummyLogitsProcessor + + +class EntryPoints(list): + """Dummy EntryPoints class for logitsprocs testing""" + + def __init__(self, group: str): + # Emulate list-like functionality + eps = [EntryPoint()] if group == LOGITSPROCS_GROUP else [] + super().__init__(eps) + # Extra attributes + self.names = [ep.name for ep in eps] + + +"""Fake version of importlib.metadata.entry_points""" +entry_points = lambda group: EntryPoints(group) diff --git a/tests/v1/sample/test_rejection_sampler.py b/tests/v1/sample/test_rejection_sampler.py index 3a4d48afc9d7..4e912f98f376 100644 --- a/tests/v1/sample/test_rejection_sampler.py +++ b/tests/v1/sample/test_rejection_sampler.py @@ -7,7 +7,7 @@ import torch.nn.functional as F from vllm.platforms import current_platform -from vllm.v1.sample.logits_processor import LogitsProcessorManager +from vllm.v1.sample.logits_processor import LogitsProcessors from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.rejection_sampler import (PLACEHOLDER_TOKEN_ID, RejectionSampler) @@ -69,7 +69,7 @@ def create_sampling_metadata( output_token_ids=[], allowed_token_ids_mask=None, bad_words_token_ids={}, - logitsprocs=LogitsProcessorManager(), + logitsprocs=LogitsProcessors(), ) diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py index 31c6c881d7b8..53215f88bb27 100644 
--- a/tests/v1/sample/test_sampler.py +++ b/tests/v1/sample/test_sampler.py @@ -9,7 +9,7 @@ from vllm.platforms import current_platform from vllm.utils import is_pin_memory_available, make_tensor_with_pad -from vllm.v1.sample.logits_processor import LogitsProcessorManager +from vllm.v1.sample.logits_processor import LogitsProcessors from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.sampler import Sampler @@ -173,7 +173,7 @@ def _create_default_sampling_metadata( no_penalties=True, allowed_token_ids_mask=None, bad_words_token_ids={}, - logitsprocs=LogitsProcessorManager(), + logitsprocs=LogitsProcessors(), ) return fake_sampling_metadata diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 74ab19a3ce32..d7b4746562be 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -13,7 +13,7 @@ from vllm.sampling_params import SamplingParams from vllm.utils import is_pin_memory_available, make_tensor_with_pad from vllm.v1.pool.metadata import PoolingMetadata -from vllm.v1.sample.logits_processor import LogitsProcessorManager +from vllm.v1.sample.logits_processor import LogitsProcessors from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.worker.block_table import BlockTable, MultiGroupBlockTable from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch @@ -169,7 +169,7 @@ def _construct_expected_sampling_metadata( and all(x == 1 for x in repetition_penalties)), allowed_token_ids_mask=allowed_token_ids_mask, bad_words_token_ids=bad_words_token_ids, - logitsprocs=LogitsProcessorManager(), + logitsprocs=LogitsProcessors(), ) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 14fc5589a89a..51db277f65dc 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -62,6 +62,7 @@ QuantizationConfig) from vllm.model_executor.model_loader import LoadFormats from vllm.model_executor.model_loader.tensorizer import TensorizerConfig + from vllm.v1.sample.logits_processor import LogitsProcessor HfOverrides = Union[dict, Callable[[type], type]] else: @@ -72,6 +73,7 @@ BaseModelLoader = Any LoadFormats = Any TensorizerConfig = Any + LogitsProcessor = Any HfOverrides = Union[dict[str, Any], Callable[[type], type]] me_quant = LazyLoader("model_executor", globals(), @@ -465,6 +467,9 @@ class ModelConfig: - "transformers" will use the Transformers model implementation.""" override_attention_dtype: Optional[str] = None """Override dtype for attention""" + logits_processors: Optional[list[Union[str, type[LogitsProcessor]]]] = None + """One or more logits processors' fully-qualified class names or class + definitions""" def compute_hash(self) -> str: """ diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 630fbec4539e..6fc894827c4a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -43,6 +43,7 @@ from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser, GiB_bytes, get_ip, is_in_ray_actor) +from vllm.v1.sample.logits_processor import LogitsProcessor # yapf: enable @@ -435,6 +436,10 @@ class EngineArgs: enable_multimodal_encoder_data_parallel: bool = \ ParallelConfig.enable_multimodal_encoder_data_parallel + logits_processors: Optional[list[Union[ + str, type[LogitsProcessor]]]] = ModelConfig.logits_processors + """Custom logitproc types""" + async_scheduling: bool = SchedulerConfig.async_scheduling # DEPRECATED enable_prompt_adapter: 
bool = False @@ -549,6 +554,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: **model_kwargs["model_impl"]) model_group.add_argument("--override-attention-dtype", **model_kwargs["override_attention_dtype"]) + model_group.add_argument("--logits-processors", + **model_kwargs["logits_processors"]) # Model loading arguments load_kwargs = get_kwargs(LoadConfig) @@ -940,6 +947,7 @@ def create_model_config(self) -> ModelConfig: enable_sleep_mode=self.enable_sleep_mode, model_impl=self.model_impl, override_attention_dtype=self.override_attention_dtype, + logits_processors=self.logits_processors, ) def validate_tensorizer_args(self): diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 915f14a29b90..b002f234c043 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -55,6 +55,7 @@ get_cached_tokenizer) from vllm.usage.usage_lib import UsageContext from vllm.utils import Counter, Device, deprecate_kwargs, is_list_of +from vllm.v1.sample.logits_processor import LogitsProcessor if TYPE_CHECKING: from vllm.v1.metrics.reader import Metric @@ -198,6 +199,8 @@ def __init__( override_pooler_config: Optional[PoolerConfig] = None, compilation_config: Optional[Union[int, dict[str, Any], CompilationConfig]] = None, + logits_processors: Optional[list[Union[str, + type[LogitsProcessor]]]] = None, **kwargs, ) -> None: """LLM constructor.""" @@ -272,6 +275,7 @@ def __init__( mm_processor_kwargs=mm_processor_kwargs, override_pooler_config=override_pooler_config, compilation_config=compilation_config_instance, + logits_processors=logits_processors, **kwargs, ) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 64f7426bd65d..5cb9f97ae0b0 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2562,7 +2562,7 @@ def direct_register_custom_op( def resolve_obj_by_qualname(qualname: str) -> Any: """ - Resolve an object by its fully qualified name. + Resolve an object by its fully-qualified class name. 
""" module_name, obj_name = qualname.rsplit(".", 1) module = importlib.import_module(module_name) diff --git a/vllm/v1/sample/logits_processor/__init__.py b/vllm/v1/sample/logits_processor/__init__.py new file mode 100644 index 000000000000..822026916295 --- /dev/null +++ b/vllm/v1/sample/logits_processor/__init__.py @@ -0,0 +1,185 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import importlib +import itertools +from collections.abc import Sequence +from typing import TYPE_CHECKING, Optional, Union + +import torch + +from vllm.logger import init_logger +from vllm.v1.sample.logits_processor.builtin import (LogitBiasLogitsProcessor, + MinPLogitsProcessor, + MinTokensLogitsProcessor) +from vllm.v1.sample.logits_processor.interface import (BatchUpdate, + LogitsProcessor, + MoveDirectionality) +from vllm.v1.sample.logits_processor.state import (BatchUpdateBuilder, + LogitsProcessors) + +if TYPE_CHECKING: + from vllm.config import VllmConfig + +logger = init_logger(__name__) + +# Error message when the user tries to initialize vLLM with a pooling model +# and custom logitsproces +STR_POOLING_REJECTS_LOGITSPROCS = ("Pooling models do not support custom" + " logits processors.") + +LOGITSPROCS_GROUP = 'vllm.logits_processors' + +BUILTIN_LOGITS_PROCESSORS: list[type[LogitsProcessor]] = [ + MinTokensLogitsProcessor, + LogitBiasLogitsProcessor, + MinPLogitsProcessor, +] + + +def _load_logitsprocs_plugins() -> list[type[LogitsProcessor]]: + """Load all installed logit processor plugins""" + + import sys + + if sys.version_info < (3, 10): + from importlib_metadata import entry_points + else: + from importlib.metadata import entry_points + + installed_logitsprocs_plugins = entry_points(group=LOGITSPROCS_GROUP) + if len(installed_logitsprocs_plugins) == 0: + logger.debug("No logitsprocs plugins installed (group %s).", + LOGITSPROCS_GROUP) + return [] + + # Load logitsprocs plugins + logger.debug("Loading installed logitsprocs plugins (group %s):", + LOGITSPROCS_GROUP) + classes: list[type[LogitsProcessor]] = [] + for entrypoint in installed_logitsprocs_plugins: + try: + logger.debug("- Loading logitproc plugin entrypoint=%s target=%s", + entrypoint.name, entrypoint.value) + classes.append(entrypoint.load()) + except Exception as e: + raise RuntimeError( + f"Failed to load LogitsProcessor plugin {entrypoint}") from e + return classes + + +def _load_logitsprocs_by_fqcns( + logits_processors: Optional[Sequence[Union[str, type[LogitsProcessor]]]] +) -> list[type[LogitsProcessor]]: + """Load logit processor types, identifying them by fully-qualified class + names (FQCNs). + + Effectively, a mixed list of logitproc types and FQCN strings is converted + into a list of entirely logitproc types, by loading from the FQCNs. + + FQCN syntax is : i.e. 
x.y.z:CustomLogitProc + + Already-loaded logitproc types must be subclasses of LogitsProcessor + + Args: + logits_processors: Potentially mixed list of logitsprocs types and FQCN + strings for logitproc types + + Returns: + List of logitproc types + + """ + if not logits_processors: + return [] + + logger.debug( + "%s additional custom logits processors specified, checking whether " + "they need to be loaded.", len(logits_processors)) + + classes: list[type[LogitsProcessor]] = [] + for ldx, logitproc in enumerate(logits_processors): + if isinstance(logitproc, type): + logger.debug(" - Already-loaded logit processor: %s", + logitproc.__name__) + if not issubclass(logitproc, LogitsProcessor): + raise ValueError( + f"{logitproc.__name__} is not a subclass of LogitsProcessor" + ) + classes.append(logitproc) + continue + + logger.debug("- Loading logits processor %s", logitproc) + module_path, qualname = logitproc.split(":") + + try: + # Load module + module = importlib.import_module(module_path) + except Exception as e: + raise RuntimeError( + f"Failed to load {ldx}th LogitsProcessor plugin {logitproc}" + ) from e + + # Walk down dotted name to get logitproc class + obj = module + for attr in qualname.split("."): + obj = getattr(obj, attr) + if not isinstance(obj, type): + raise ValueError("Loaded logit processor must be a type.") + if not issubclass(obj, LogitsProcessor): + raise ValueError( + f"{obj.__name__} must be a subclass of LogitsProcessor") + classes.append(obj) + + return classes + + +def _load_custom_logitsprocs( + logits_processors: Optional[Sequence[Union[str, type[LogitsProcessor]]]], +) -> list[type[LogitsProcessor]]: + """Load all custom logits processors. + + * First load all installed logitproc plugins + * Second load custom logitsprocs pass by the user at initialization time + + Args: + logits_processors: potentially mixed list of logitproc types and + logitproc type fully-qualified names (FQCNs) + which need to be loaded + + Returns: + A list of all loaded logitproc types + """ + from vllm.platforms import current_platform + if current_platform.is_tpu(): + # No logitsprocs specified by caller + # TODO(andy) - vLLM V1 on TPU does not support custom logitsprocs + return [] + + return (_load_logitsprocs_plugins() + + _load_logitsprocs_by_fqcns(logits_processors)) + + +def build_logitsprocs( + vllm_config: "VllmConfig", + device: torch.device, + is_pin_memory: bool, + is_pooling_model: bool, + custom_logitsprocs: Sequence[Union[str, type[LogitsProcessor]]] = (), +) -> LogitsProcessors: + if is_pooling_model: + if custom_logitsprocs: + raise ValueError(STR_POOLING_REJECTS_LOGITSPROCS) + logger.debug("Skipping logits processor loading because pooling models" + " do not support logits processors.") + return LogitsProcessors() + custom_logitsprocs_classes = _load_custom_logitsprocs(custom_logitsprocs) + return LogitsProcessors( + ctor(vllm_config, device, is_pin_memory) for ctor in itertools.chain( + BUILTIN_LOGITS_PROCESSORS, custom_logitsprocs_classes)) + + +__all__ = [ + "LogitsProcessor", "LogitBiasLogitsProcessor", "MinPLogitsProcessor", + "MinTokensLogitsProcessor", "BatchUpdate", "BatchUpdateBuilder", + "MoveDirectionality", "LogitsProcessors", "build_logitsprocs", + "STR_POOLING_REJECTS_LOGITSPROCS", "LOGITSPROCS_GROUP" +] diff --git a/vllm/v1/sample/logits_processor.py b/vllm/v1/sample/logits_processor/builtin.py similarity index 54% rename from vllm/v1/sample/logits_processor.py rename to vllm/v1/sample/logits_processor/builtin.py index 3a06e71057cd..24387ab79390 100644 
--- a/vllm/v1/sample/logits_processor.py +++ b/vllm/v1/sample/logits_processor/builtin.py @@ -1,241 +1,32 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import dataclasses -from abc import ABC, abstractmethod -from collections.abc import Iterator, Sequence -from dataclasses import dataclass, field -from enum import Enum -from itertools import chain -from typing import Optional, Union +from collections.abc import Sequence +from typing import TYPE_CHECKING, Optional import torch -from torch._prims_common import DeviceLikeType - -from vllm import PoolingParams, SamplingParams -from vllm.logger import init_logger - -logger = init_logger(__name__) - - -class MoveDirectionality(Enum): - # One-way i1->i2 req move within batch - UNIDIRECTIONAL = 0 - # Two-way i1<->i2 req swap within batch - SWAP = 1 - - -# (index, params, output_tok_ids) tuples for new -# requests added to the batch. -AddedRequest = tuple[int, Union[SamplingParams, PoolingParams], list[int]] -# (index 1, index 2, directionality) tuples representing -# one-way moves or two-way swaps of requests in batch -MovedRequest = tuple[int, int, MoveDirectionality] -# Batch indices of any removed requests. -RemovedRequest = int - - -@dataclasses.dataclass(frozen=True) -class BatchUpdate: - """Persistent batch state change info for logitsprocs""" - batch_size: int # Current num reqs in batch - - # Metadata for requests added to, removed from, and moved - # within the persistent batch. - # - # Note: each added request is represented as - # (index, params, output_tok_ids) - # Key assumption: output_tok_ids is a reference to the - # request's running output tokens list; in this way - # the logits processors always see the latest list of - # generated tokens - removed: Sequence[RemovedRequest] - moved: Sequence[MovedRequest] - added: Sequence[AddedRequest] - - -class BatchUpdateBuilder: - """Helps track persistent batch state changes and build - a batch update data structure for logitsprocs - - Assumptions: - * All information about requests removed from persistent batch - during a step is aggregated in self._removed through calls to - self.removed_append() at the beginning of a step. This must happen - before the first time that self.removed, self.pop_removed() - or self.peek_removed() are invoked in a given step - * After the first time that self.removed, self.pop_removed() - or self.peek_removed() are read in a step, no new removals - are registered using self.removed_append() - * Elements of self._removed are never directly modified, added or - removed (i.e. modification is only via self.removed_append() and - self.pop_removed()) - - Guarantees under above assumptions: - * self.removed is always sorted in descending order - * self.pop_removed() and self.peek_removed() both return - the lowest removed request index in the current step - """ - - _removed: list[RemovedRequest] - _is_removed_sorted: bool - moved: list[MovedRequest] - added: list[AddedRequest] - - def __init__( - self, - removed: Optional[list[RemovedRequest]] = None, - moved: Optional[list[MovedRequest]] = None, - added: Optional[list[AddedRequest]] = None, - ) -> None: - self._removed = removed or [] - self.moved = moved or [] - self.added = added or [] - self._is_removed_sorted = False - - def _ensure_removed_sorted(self) -> None: - """Sort removed request indices in - descending order. - - Idempotent after first call in a - given step, until reset. 
- """ - if not self._is_removed_sorted: - self._removed.sort(reverse=True) - self._is_removed_sorted = True - - @property - def removed(self) -> list[RemovedRequest]: - """Removed request indices sorted in - descending order""" - self._ensure_removed_sorted() - return self._removed - - def removed_append(self, index: int) -> None: - """Register the removal of a request from - the persistent batch. - - Must not be called after the first time - self.removed, self.pop_removed() or - self.peek_removed() are invoked. - - Args: - index: request index - """ - if self._is_removed_sorted: - raise RuntimeError("Cannot register new removed request after" - " self.removed has been read.") - self._removed.append(index) - - def has_removed(self) -> bool: - return bool(self._removed) - - def peek_removed(self) -> Optional[int]: - """Return lowest removed request index""" - if self.has_removed(): - self._ensure_removed_sorted() - return self._removed[-1] - return None - - def pop_removed(self) -> Optional[int]: - """Pop lowest removed request index""" - if self.has_removed(): - self._ensure_removed_sorted() - return self._removed.pop() - return None - - def get_and_reset(self, batch_size: int) -> Optional[BatchUpdate]: - """Generate a logitsprocs batch update data structure - and reset internal batch update builder state. - - Args: - batch_size: current persistent batch size - - Returns: - Frozen logitsprocs batch update instance; `None` if no updates - """ - # Reset removal-sorting logic - self._is_removed_sorted = False - if not any((self._removed, self.moved, self.added)): - # No update; short-circuit - return None - # Build batch state update - batch_update = BatchUpdate( - batch_size=batch_size, - removed=self._removed, - moved=self.moved, - added=self.added, - ) - # Reset removed/moved/added update lists - self._removed = [] - self.moved = [] - self.added = [] - return batch_update - - -class LogitsProcessor(ABC): - - @abstractmethod - def apply(self, logits: torch.Tensor) -> torch.Tensor: - raise NotImplementedError - @abstractmethod - def is_argmax_invariant(self) -> bool: - """True if logits processor has no impact on the - argmax computation in greedy sampling. - NOTE: may or may not have the same value for all - instances of a given LogitsProcessor subclass, - depending on subclass implementation. - TODO(andy): won't be utilized until logits - processors are user-extensible - """ - raise NotImplementedError - - @abstractmethod - def update_state( - self, - batch_update: Optional[BatchUpdate], - ) -> None: - """Called when there are new output tokens, prior - to each forward pass. - - Args: - batch_update is non-None iff there have been - changes to the batch makeup. 
- """ - raise NotImplementedError - - -@dataclass -class LogitsProcessorManager: - """Encapsulates initialized logitsproc objects.""" - argmax_invariant: list[LogitsProcessor] = field( - default_factory=list) # argmax-invariant logitsprocs - non_argmax_invariant: list[LogitsProcessor] = field( - default_factory=list) # non-argmax-invariant logitsprocs - - @property - def all(self) -> Iterator[LogitsProcessor]: - """Iterator over all logits processors.""" - return chain(self.argmax_invariant, self.non_argmax_invariant) - - -###### ----- Built-in LogitsProcessor impls below here +from vllm.v1.sample.logits_processor.interface import (BatchUpdate, + LogitsProcessor, + MoveDirectionality) + +if TYPE_CHECKING: + from vllm.config import VllmConfig class MinPLogitsProcessor(LogitsProcessor): - def __init__(self, max_num_reqs: int, pin_memory: bool, - device: DeviceLikeType): - super().__init__() + def __init__(self, vllm_config: "VllmConfig", device: torch.device, + is_pin_memory: bool): + max_num_reqs = vllm_config.scheduler_config.max_num_seqs self.min_p_count: int = 0 self.min_p_cpu_tensor = torch.zeros((max_num_reqs, ), dtype=torch.float32, device="cpu", - pin_memory=pin_memory) + pin_memory=is_pin_memory) self.min_p_cpu = self.min_p_cpu_tensor.numpy() - self.use_double_tensor = torch.device("cpu") != torch.device(device) + self.use_double_tensor = torch.device(device).type != "cpu" if self.use_double_tensor: # Pre-allocated device tensor @@ -260,8 +51,8 @@ def update_state(self, batch_update: Optional[BatchUpdate]): needs_update = False # Process added requests. - for index, params, _ in batch_update.added: - min_p = params.min_p if isinstance(params, SamplingParams) else 0.0 + for index, params, _, _ in batch_update.added: + min_p = params.min_p if self.min_p_cpu[index] != min_p: needs_update = True self.min_p_cpu[index] = min_p @@ -316,11 +107,10 @@ def apply(self, logits: torch.Tensor) -> torch.Tensor: class LogitBiasLogitsProcessor(LogitsProcessor): - def __init__(self, pin_memory: bool, device: torch.device): - super().__init__() - self.biases: dict[int, dict[int, float]] = {} + def __init__(self, _, device: torch.device, is_pin_memory: bool): self.device = device - self.pin_memory = pin_memory + self.pin_memory = is_pin_memory + self.biases: dict[int, dict[int, float]] = {} self.bias_tensor: torch.Tensor = torch.tensor(()) self.logits_slice = (self._device_tensor([], torch.int32), @@ -337,9 +127,8 @@ def update_state(self, batch_update: Optional[BatchUpdate]): needs_update: bool = False # Process added requests. 
- for index, params, _ in batch_update.added: - if isinstance(params, SamplingParams) and (lb := - params.logit_bias): + for index, params, _, _ in batch_update.added: + if lb := params.logit_bias: self.biases[index] = lb needs_update = True else: @@ -400,12 +189,12 @@ def apply(self, logits: torch.Tensor) -> torch.Tensor: class MinTokensLogitsProcessor(LogitsProcessor): - def __init__(self, pin_memory: bool, device: torch.device): + def __init__(self, vllm_config: "VllmConfig", device: torch.device, + is_pin_memory: bool): # index -> (min_toks, output_token_ids, stop_token_ids) - super().__init__() - self.min_toks: dict[int, tuple[int, Sequence[int], set[int]]] = {} self.device = device - self.pin_memory = pin_memory + self.pin_memory = is_pin_memory + self.min_toks: dict[int, tuple[int, Sequence[int], set[int]]] = {} # (req_idx_tensor,eos_tok_id_tensor) self.logits_slice: tuple[torch.Tensor, @@ -424,9 +213,8 @@ def update_state(self, batch_update: Optional[BatchUpdate]): if batch_update: # Process added requests. - for index, params, output_tok_ids in batch_update.added: - if (isinstance(params, SamplingParams) - and (min_tokens := params.min_tokens) + for index, params, _, output_tok_ids in batch_update.added: + if ((min_tokens := params.min_tokens) and len(output_tok_ids) < min_tokens): # Replace request metadata at batch index self.min_toks[index] = (min_tokens, output_tok_ids, @@ -499,35 +287,3 @@ def apply(self, logits: torch.Tensor) -> torch.Tensor: # Inhibit EOS token for requests which have not reached min length logits[self.logits_slice] = -float("inf") return logits - - -def init_builtin_logitsprocs(pin_memory_available: bool, max_num_reqs: int, - device: torch.device) -> LogitsProcessorManager: - """Construct 'builtin' vLLM logitsprocs which the engine - loads by default. - - Args: - pin_memory_available: pinned memory is available for use - for use by logitsproc - max_num_reqs: ceiling on request count in persistent batch - device: inference device - - Returns: - Data structure encapsulating loaded logitsprocs - """ - min_tokens_logitproc = MinTokensLogitsProcessor( - pin_memory=pin_memory_available, device=device) - logit_bias_logitproc = LogitBiasLogitsProcessor( - pin_memory=pin_memory_available, device=device) - min_p_logitproc = MinPLogitsProcessor( - pin_memory=pin_memory_available, - device=device, - # +1 for temporary swap space - max_num_reqs=max_num_reqs + 1) - return LogitsProcessorManager( - non_argmax_invariant=[ - min_tokens_logitproc, - logit_bias_logitproc, - ], - argmax_invariant=[min_p_logitproc], - ) diff --git a/vllm/v1/sample/logits_processor/interface.py b/vllm/v1/sample/logits_processor/interface.py new file mode 100644 index 000000000000..12b4db24bff8 --- /dev/null +++ b/vllm/v1/sample/logits_processor/interface.py @@ -0,0 +1,86 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod +from collections.abc import Sequence +from dataclasses import dataclass +from enum import Enum, auto +from typing import TYPE_CHECKING, Optional + +import torch + +from vllm import SamplingParams + +if TYPE_CHECKING: + from vllm.config import VllmConfig + + +class MoveDirectionality(Enum): + # One-way i1->i2 req move within batch + UNIDIRECTIONAL = auto() + # Two-way i1<->i2 req swap within batch + SWAP = auto() + + +# (index, params, prompt_tok_ids, output_tok_ids) tuples for new +# requests added to the batch. 
+AddedRequest = tuple[int, SamplingParams, list[int], list[int]] + +# (index 1, index 2, directionality) tuples representing +# one-way moves or two-way swaps of requests in batch +MovedRequest = tuple[int, int, MoveDirectionality] + +# Batch indices of any removed requests. +RemovedRequest = int + + +@dataclass(frozen=True) +class BatchUpdate: + """Persistent batch state change info for logitsprocs""" + batch_size: int # Current num reqs in batch + + # Metadata for requests added to, removed from, and moved + # within the persistent batch. + # + # Key assumption: the `output_tok_ids` list (which is an element of each + # tuple in `added`) is a reference to the request's running output tokens + # list; via this reference, the logits processors always see the latest + # list of generated output tokens + removed: Sequence[RemovedRequest] + moved: Sequence[MovedRequest] + added: Sequence[AddedRequest] + + +class LogitsProcessor(ABC): + + @abstractmethod + def __init__(self, vllm_config: "VllmConfig", device: torch.device, + is_pin_memory: bool) -> None: + raise NotImplementedError + + @abstractmethod + def apply(self, logits: torch.Tensor) -> torch.Tensor: + raise NotImplementedError + + @abstractmethod + def is_argmax_invariant(self) -> bool: + """True if logits processor has no impact on the + argmax computation in greedy sampling. + NOTE: may or may not have the same value for all + instances of a given LogitsProcessor subclass, + depending on subclass implementation. + """ + raise NotImplementedError + + @abstractmethod + def update_state( + self, + batch_update: Optional["BatchUpdate"], + ) -> None: + """Called when there are new output tokens, prior + to each forward pass. + + Args: + batch_update is non-None iff there have been + changes to the batch makeup. + """ + raise NotImplementedError diff --git a/vllm/v1/sample/logits_processor/state.py b/vllm/v1/sample/logits_processor/state.py new file mode 100644 index 000000000000..0f58b5249695 --- /dev/null +++ b/vllm/v1/sample/logits_processor/state.py @@ -0,0 +1,149 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterator +from itertools import chain +from typing import TYPE_CHECKING, Optional + +from vllm.v1.sample.logits_processor.interface import (AddedRequest, + BatchUpdate, + MovedRequest, + RemovedRequest) + +if TYPE_CHECKING: + from vllm.v1.sample.logits_processor.interface import LogitsProcessor + + +class BatchUpdateBuilder: + """Helps track persistent batch state changes and build + a batch update data structure for logitsprocs + Assumptions: + * All information about requests removed from persistent batch + during a step is aggregated in self._removed through calls to + self.removed_append() at the beginning of a step. This must happen + before the first time that self.removed, self.pop_removed() + or self.peek_removed() are invoked in a given step + * After the first time that self.removed, self.pop_removed() + or self.peek_removed() are read in a step, no new removals + are registered using self.removed_append() + * Elements of self._removed are never directly modified, added or + removed (i.e. 
modification is only via self.removed_append() and + self.pop_removed()) + Guarantees under above assumptions: + * self.removed is always sorted in descending order + * self.pop_removed() and self.peek_removed() both return + the lowest removed request index in the current step + """ + + _removed: list[RemovedRequest] + _is_removed_sorted: bool + moved: list[MovedRequest] + added: list[AddedRequest] + + def __init__( + self, + removed: Optional[list[RemovedRequest]] = None, + moved: Optional[list[MovedRequest]] = None, + added: Optional[list[AddedRequest]] = None, + ) -> None: + self._removed = removed or [] + self.moved = moved or [] + self.added = added or [] + self._is_removed_sorted = False + + def _ensure_removed_sorted(self) -> None: + """Sort removed request indices in + descending order. + Idempotent after first call in a + given step, until reset. + """ + if not self._is_removed_sorted: + self._removed.sort(reverse=True) + self._is_removed_sorted = True + + @property + def removed(self) -> list[RemovedRequest]: + """Removed request indices sorted in + descending order""" + self._ensure_removed_sorted() + return self._removed + + def removed_append(self, index: int) -> None: + """Register the removal of a request from the persistent batch. + + Must not be called after the first time self.removed, + self.pop_removed() or self.peek_removed() are invoked. + + Args: + index: request index + """ + if self._is_removed_sorted: + raise RuntimeError("Cannot register new removed request after" + " self.removed has been read.") + self._removed.append(index) + + def has_removed(self) -> bool: + return bool(self._removed) + + def peek_removed(self) -> Optional[int]: + """Return lowest removed request index""" + if self.has_removed(): + self._ensure_removed_sorted() + return self._removed[-1] + return None + + def pop_removed(self) -> Optional[int]: + """Pop lowest removed request index""" + if self.has_removed(): + self._ensure_removed_sorted() + return self._removed.pop() + return None + + def _is_update(self) -> bool: + """True if there is a batch state change""" + return any((self._removed, self.moved, self.added)) + + def get_and_reset(self, batch_size: int) -> Optional[BatchUpdate]: + """Generate a logitsprocs batch update data structure and reset + internal batch update builder state. 
+ + Args: + batch_size: current persistent batch size + + Returns: + Frozen logitsprocs batch update instance; `None` if no updates + """ + # Reset removal-sorting logic + self._is_removed_sorted = False + if not self._is_update(): + # No update; short-circuit + return None + # Build batch state update + batch_update = BatchUpdate( + batch_size=batch_size, + removed=self._removed, + moved=self.moved, + added=self.added, + ) + self._removed = [] + self.moved = [] + self.added = [] + return batch_update + + +class LogitsProcessors: + """Encapsulates initialized logitsproc objects.""" + + def __init__( + self, + logitsprocs: Optional[Iterator["LogitsProcessor"]] = None) -> None: + self.argmax_invariant: list[LogitsProcessor] = [] + self.non_argmax_invariant: list[LogitsProcessor] = [] + if logitsprocs: + for logitproc in logitsprocs: + (self.argmax_invariant if logitproc.is_argmax_invariant() else + self.non_argmax_invariant).append(logitproc) + + @property + def all(self) -> Iterator["LogitsProcessor"]: + """Iterator over all logits processors.""" + return chain(self.argmax_invariant, self.non_argmax_invariant) diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 1189b12f3077..9d6a87cea3d0 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -6,7 +6,7 @@ import torch -from vllm.v1.sample.logits_processor import LogitsProcessorManager +from vllm.v1.sample.logits_processor import LogitsProcessors @dataclass @@ -40,4 +40,4 @@ class SamplingMetadata: bad_words_token_ids: dict[int, list[list[int]]] # Loaded logits processors - logitsprocs: LogitsProcessorManager + logitsprocs: LogitsProcessors diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 2469e09f8249..e718d9d5e0fb 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -18,8 +18,8 @@ from vllm.v1.outputs import LogprobsTensors from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.logits_processor import (BatchUpdateBuilder, - MoveDirectionality, - init_builtin_logitsprocs) + LogitsProcessors, + MoveDirectionality) from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.spec_decode.utils import is_spec_decode_unsupported from vllm.v1.utils import copy_slice @@ -78,8 +78,11 @@ def __init__( pin_memory: bool, vocab_size: int, block_sizes: list[int], # The block_size of each kv cache group + logitsprocs: Optional[LogitsProcessors] = None, is_spec_decode: bool = False, + is_pooling_model: bool = False, ): + self.is_pooling_model = is_pooling_model self.is_spec_decode = is_spec_decode self.max_num_reqs = max_num_reqs self.max_model_len = max_model_len @@ -221,14 +224,6 @@ def __init__( # updates. Should reset each step. self.batch_update_builder = BatchUpdateBuilder() - # Define logits processors. - # TODO(andy): logits processor list should be extensible via engine - # constructor argument; for now the list is fixed. - self.logitsprocs = init_builtin_logitsprocs( - pin_memory_available=pin_memory, - max_num_reqs=max_num_reqs + 1, - device=device) - # TODO convert this to LogitsProcessor self.has_allowed_token_ids: set[str] = set() # NOTE(lufang): In the mask tensor, if the corresponding token allowed, @@ -244,6 +239,10 @@ def __init__( self.req_output_token_ids: list[Optional[list[int]]] = [] + # Store provided logitsprocs. If none are provided, initialize empty + # data structure + self.logitsprocs = logitsprocs or LogitsProcessors() + # This is updated each time the batch constituents change. 
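A short usage sketch of the two relocated state helpers above (the request indices and batch size are arbitrary example values):

    from vllm.v1.sample.logits_processor import (BatchUpdateBuilder,
                                                  LogitsProcessors)

    builder = BatchUpdateBuilder()
    # Removals are registered first, in any order...
    builder.removed_append(5)
    builder.removed_append(2)
    builder.removed_append(9)
    # ...and are then consumed lowest-index-first, per the class guarantees.
    assert builder.peek_removed() == 2
    assert builder.pop_removed() == 2

    # Freeze this step's changes into an immutable BatchUpdate (None when
    # nothing changed) and reset the builder for the next step.
    batch_update = builder.get_and_reset(batch_size=8)
    assert batch_update is not None and list(batch_update.removed) == [9, 5]

    # The container buckets loaded processors by argmax invariance.
    procs = LogitsProcessors()
    assert list(procs.all) == []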
self.sampling_metadata = self._make_sampling_metadata() @@ -255,28 +254,35 @@ def req_ids(self) -> list[str]: # while performing state updates to the batch. return cast(list[str], self._req_ids) - def _get_next_add_index(self) -> int: - if (req_index := self.batch_update_builder.pop_removed()) is not None: - # Fill the empty index. - return req_index - # Append to end - return self.num_reqs - def _register_add_request(self, request: "CachedRequestState") -> int: - """Track add-request operations""" - req_index = self._get_next_add_index() - assert req_index < self.max_num_reqs - params = (request.sampling_params - if request.sampling_params else request.pooling_params) + """Track add-request operations for logits processors. + Not applicable to pooling models. + """ + + # Detailed added request metadata is only required for non-pooling + # models, to support logitsprocs + assert request.sampling_params + + # Fill the next empty index if there is one. + if (new_req_index := self.batch_update_builder.pop_removed()) is None: + # Append to end otherwise. + new_req_index = self.num_reqs + + assert new_req_index < self.max_num_reqs self.batch_update_builder.added.append( - (req_index, params, request.output_token_ids)) - return req_index + (new_req_index, request.sampling_params, request.prompt_token_ids, + request.output_token_ids)) + return new_req_index def add_request( self, request: "CachedRequestState", ) -> int: - req_index = self._register_add_request(request) + if not self.is_pooling_model: + # New request index bookkeeping for autoregressive models. + req_index = self._register_add_request(request) + else: + req_index = self.num_reqs req_id = request.req_id if req_index == len(self._req_ids): @@ -411,7 +417,10 @@ def remove_request(self, req_id: str) -> Optional[int]: req_index = self.req_id_to_index.pop(req_id, None) if req_index is None: return None - self.batch_update_builder.removed_append(req_index) + if not self.is_pooling_model: + # Autoregressive models require bookkeeping of removed requests to + # support logitsprocs. + self.batch_update_builder.removed_append(req_index) self._req_ids[req_index] = None self.req_output_token_ids[req_index] = None @@ -446,6 +455,8 @@ def remove_request(self, req_id: str) -> Optional[int]: return req_index def swap_states(self, i1: int, i2: int) -> None: + # For autoregressive models, track detailed request reordering info + # to support logitsprocs self.batch_update_builder.moved.append( (i1, i2, MoveDirectionality.SWAP)) old_id_i1 = self._req_ids[i1] @@ -513,11 +524,18 @@ def condense(self) -> None: swaps: list of (from,to) swap tuples for moved requests empty_req_indices: indices not filled by condensation """ + num_reqs = self.num_reqs + + if self.is_pooling_model: + # Will be contiguous in pooling case, just trim the lists. + del self._req_ids[num_reqs:] + del self.req_output_token_ids[num_reqs:] + return + if not (empty_req_indices := self.batch_update_builder.removed): # All removed requests were replaced by added requests, or else no # requests were removed at all. No condense() needed return - num_reqs = self.num_reqs if num_reqs == 0: # The batched states are empty. self._req_ids.clear() @@ -541,6 +559,8 @@ def condense(self) -> None: # Move active request down into empty request # index. 
self.batch_update_builder.pop_removed() + # Autoregressive models require detailed tracking of condense + # operations to support logitsprocs self.batch_update_builder.moved.append( (last_req_index, empty_index, MoveDirectionality.UNIDIRECTIONAL)) @@ -596,15 +616,20 @@ def condense(self) -> None: last_req_index -= 1 # Trim lists to the batch size. - del self._req_ids[self.num_reqs:] - del self.req_output_token_ids[self.num_reqs:] + del self._req_ids[num_reqs:] + del self.req_output_token_ids[num_reqs:] def refresh_metadata(self): - """Apply batch updates, reset input batch at end of step + """Apply any batch updates to sampling metadata.""" - * Apply batch add/remove/permute to logits procs' states - * If batch state is modified, update sampling metadata - """ + if self.is_pooling_model: + # Batch changes every step for pooling models. + self.sampling_metadata = self._make_sampling_metadata() + return + + # For non-pooling models - generate and apply logitsprocs update; + # reset batch update tracking. + # Update sampling metadata if batch state is changed. batch_update = self.batch_update_builder.get_and_reset(self.num_reqs) for logit_proc in self.logitsprocs.all: logit_proc.update_state(batch_update) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 5ee44a82574c..4219d9147ada 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -68,6 +68,7 @@ from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors, ModelRunnerOutput) from vllm.v1.pool.metadata import PoolingMetadata +from vllm.v1.sample.logits_processor import LogitsProcessors, build_logitsprocs from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.rejection_sampler import RejectionSampler from vllm.v1.sample.sampler import Sampler @@ -80,7 +81,6 @@ KVConnectorModelRunnerMixin, KVConnectorOutput) from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin -from ..sample.logits_processor import LogitsProcessorManager from .utils import (AttentionGroup, MultiModalBudget, bind_kv_cache, gather_mm_placeholders, initialize_kv_cache_for_kv_sharing, sanity_check_mm_encoder_outputs, scatter_mm_placeholders) @@ -221,6 +221,11 @@ def __init__( vocab_size=self.model_config.get_vocab_size(), block_sizes=[self.cache_config.block_size], is_spec_decode=bool(self.vllm_config.speculative_config), + logitsprocs=build_logitsprocs( + self.vllm_config, self.device, self.pin_memory, + self.is_pooling_model, + self.vllm_config.model_config.logits_processors), + is_pooling_model=self.is_pooling_model, ) # TODO(woosuk): Provide an option to tune the max cudagraph batch size. 
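The resulting per-step flow for the relocated logitsprocs, paraphrased as a simplified sketch of InputBatch.refresh_metadata() above (the real method also rebuilds the sampling metadata and handles the pooling fast path):

    def refresh_logitsprocs_state(input_batch) -> None:
        # Pooling models register no logitsprocs bookkeeping, so skip.
        if input_batch.is_pooling_model:
            return
        # Freeze this step's add/remove/move records (None if unchanged).
        batch_update = input_batch.batch_update_builder.get_and_reset(
            input_batch.num_reqs)
        # Every loaded processor observes the same frozen update.
        for logit_proc in input_batch.logitsprocs.all:
            logit_proc.update_state(batch_update)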
@@ -2447,7 +2452,7 @@ def _dummy_sampler_run( output_token_ids=[[] for _ in range(num_reqs)], allowed_token_ids_mask=None, bad_words_token_ids={}, - logitsprocs=LogitsProcessorManager(), + logitsprocs=LogitsProcessors(), ) try: sampler_output = self.sampler(logits=logits, @@ -2968,6 +2973,8 @@ def may_reinitialize_input_batch(self, vocab_size=self.model_config.get_vocab_size(), block_sizes=block_sizes, is_spec_decode=bool(self.vllm_config.speculative_config), + logitsprocs=self.input_batch.logitsprocs, + is_pooling_model=self.is_pooling_model, ) def _allocate_kv_cache_tensors( From 1330105138400905947708ec6f4b6dd0c0ef96b5 Mon Sep 17 00:00:00 2001 From: Jinzhen Lin Date: Sun, 17 Aug 2025 08:41:23 +0800 Subject: [PATCH 093/231] [Bugfix] fix qwen3 moe fp8 accuracy issue (#23031) Signed-off-by: Jinzhen Lin Signed-off-by: Duncan Moss --- vllm/model_executor/layers/quantization/fp8.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index a49744913251..f07be0855492 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -125,6 +125,10 @@ def from_config(cls, config: dict[str, Any]) -> "Fp8Config": ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None) weight_block_size = cls.get_from_keys_or(config, ["weight_block_size"], None) + if not ignored_layers: + ignored_layers = cls.get_from_keys_or(config, + ["modules_to_not_convert"], + None) return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized, activation_scheme=activation_scheme, ignored_layers=ignored_layers, From 0c1d8f78c250ca355ea266a8a2012245b8aa5c38 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 16 Aug 2025 22:16:42 -0400 Subject: [PATCH 094/231] [UX] Separate marlin moe config logic from triton moe (#23006) Signed-off-by: Duncan Moss --- .../layers/fused_moe/fused_marlin_moe.py | 20 ++++++------------- .../layers/fused_moe/fused_moe.py | 9 +-------- 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index a49d41c18438..3c6ece6737e4 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -1,14 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Fused MoE utilities for GPTQ.""" -import functools from typing import Optional import torch import vllm._custom_ops as ops -from vllm.model_executor.layers.fused_moe.fused_moe import ( - moe_align_block_size, try_get_optimal_moe_config) +from vllm.model_executor.layers.fused_moe.fused_moe import moe_align_block_size from vllm.model_executor.layers.quantization.utils.marlin_utils import ( marlin_make_workspace_new, maybe_warn_marlin_atomic_add) from vllm.scalar_type import ScalarType, scalar_types @@ -98,17 +96,11 @@ def fused_marlin_moe(hidden_states: torch.Tensor, N = w2.shape[1] * 16 topk = topk_ids.shape[1] - get_config_func = functools.partial( - try_get_optimal_moe_config, - w1.shape, - w2.shape, - topk_ids.shape[1], - None, - is_marlin=True, - ) - config = get_config_func(M) - - block_size_m = config["BLOCK_SIZE_M"] + # M block size selection logic + # TODO: tune this further for specific models + for block_size_m in [8, 16, 32, 48, 64]: + if M * topk / E / block_size_m < 0.9: + break if global_num_experts == -1: 
global_num_experts = E diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index e58a9e568d4a..3579ca22bafc 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -801,7 +801,6 @@ def get_default_config( K: int, topk: int, dtype: Optional[str], - is_marlin: bool, block_shape: Optional[list[int]] = None, ) -> dict[str, int]: if dtype == "fp8_w8a8" and block_shape is not None: @@ -832,11 +831,6 @@ def get_default_config( config = {"BLOCK_SIZE_M": 32, "GROUP_SIZE_M": 1} else: config = {"BLOCK_SIZE_M": 64, "GROUP_SIZE_M": 1} - elif is_marlin: - for block_size_m in [8, 16, 32, 48, 64]: - if M * topk / E / block_size_m < 0.9: - break - return {"BLOCK_SIZE_M": block_size_m} elif M <= E: config = { "BLOCK_SIZE_M": 16, @@ -860,7 +854,6 @@ def try_get_optimal_moe_config( top_k: int, dtype: Optional[str], M: int, - is_marlin: bool = False, block_shape: Optional[list[int]] = None, ) -> dict[str, int]: from vllm.model_executor.layers.fused_moe import get_config @@ -883,7 +876,7 @@ def try_get_optimal_moe_config( else: # Else use the default config config = get_default_config(M, E, N, w1_shape[2], top_k, dtype, - is_marlin, block_shape) + block_shape) return config From b1a32602a256e50aba8a924ed09061f30fc29ff5 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 17 Aug 2025 12:05:50 +0800 Subject: [PATCH 095/231] [Refactor] Defer tensor data construction in MultiModalKwargs (#23030) Signed-off-by: DarkLight1337 Signed-off-by: Duncan Moss --- tests/multimodal/test_cache.py | 2 +- tests/v1/test_serial_utils.py | 34 +------ vllm/inputs/registry.py | 2 +- .../models/prithvi_geospatial_mae.py | 2 +- vllm/multimodal/base.py | 2 +- vllm/multimodal/cache.py | 2 +- vllm/multimodal/inputs.py | 96 +++++++++++-------- vllm/multimodal/processing.py | 2 +- vllm/multimodal/utils.py | 12 ++- vllm/sequence.py | 4 +- vllm/v1/serial_utils.py | 17 +--- vllm/v1/worker/gpu_input_batch.py | 2 +- 12 files changed, 73 insertions(+), 104 deletions(-) diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py index e07b73bd257d..2149f05b6af0 100644 --- a/tests/multimodal/test_cache.py +++ b/tests/multimodal/test_cache.py @@ -25,7 +25,7 @@ def _dummy_item(modality: str, size_by_key: dict[str, int]): def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]): - return MultiModalKwargs.from_items([ + return MultiModalKwargs([ _dummy_item(modality, size_by_key) for modality, size_by_key in size_by_key_modality.items() ]) diff --git a/tests/v1/test_serial_utils.py b/tests/v1/test_serial_utils.py index 0ab4e0bf59cf..586276ee08ae 100644 --- a/tests/v1/test_serial_utils.py +++ b/tests/v1/test_serial_utils.py @@ -100,38 +100,6 @@ class MyRequest(msgspec.Struct): def test_multimodal_kwargs(): - d = { - "foo": - torch.zeros(20000, dtype=torch.float16), - "bar": [torch.zeros(i * 1000, dtype=torch.int8) for i in range(3)], - "baz": [ - torch.rand((256), dtype=torch.float16), - [ - torch.rand((1, 12), dtype=torch.float32), - torch.rand((3, 5, 7), dtype=torch.float64), - ], [torch.rand((4, 4), dtype=torch.float16)] - ], - } - - # pack mm kwargs into a mock request so that it can be decoded properly - req = MyRequest(mm=[MultiModalKwargs(d)]) - - encoder = MsgpackEncoder() - decoder = MsgpackDecoder(MyRequest) - - encoded = encoder.encode(req) - - assert len(encoded) == 6 - - total_len = sum(memoryview(x).cast("B").nbytes for x in encoded) - - # expected total encoding length, should be 44559, 
+-20 for minor changes - assert 44539 <= total_len <= 44579 - decoded: MultiModalKwargs = decoder.decode(encoded).mm[0] - assert all(nested_equal(d[k], decoded[k]) for k in d) - - -def test_multimodal_items_by_modality(): e1 = MultiModalFieldElem("audio", "a0", torch.zeros(1000, dtype=torch.bfloat16), MultiModalBatchedField()) @@ -151,7 +119,7 @@ def test_multimodal_items_by_modality(): audio = MultiModalKwargsItem.from_elems([e1]) video = MultiModalKwargsItem.from_elems([e2]) image = MultiModalKwargsItem.from_elems([e3, e4]) - mm = MultiModalKwargs.from_items([audio, video, image]) + mm = MultiModalKwargs([audio, video, image]) # pack mm kwargs into a mock request so that it can be decoded properly req = MyRequest([mm]) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index dc3236508348..ef146fdfbf97 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -240,6 +240,6 @@ def dummy_data_for_profiling( return DummyData( seq_data=SequenceData.from_seqs(dec_data.prompt_token_ids), - multi_modal_data=dec_data.multi_modal_data, + multi_modal_data=dec_data.multi_modal_data.get_data(), multi_modal_placeholders=dec_data.multi_modal_placeholders, ) diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py index 20f423cc7603..68488829071f 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -136,7 +136,7 @@ def apply( type="multimodal", prompt=prompt, prompt_token_ids=[1], - mm_kwargs=MultiModalKwargs.from_items(multimodal_kwargs_items), + mm_kwargs=MultiModalKwargs(multimodal_kwargs_items), mm_hashes=None, mm_placeholders=mm_placeholders, ) diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 7188ed14c573..ef8f1b2e17b4 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -99,7 +99,7 @@ def from_seq_group( seq_mm_placeholders = seq_group.multi_modal_placeholders if not seq_mm_data or not seq_mm_placeholders: - return MultiModalKwargs({}), {} + return MultiModalKwargs(), {} placeholder_maps = dict[str, MultiModalPlaceholderMap]() diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 6074a4d54f22..8c4136e06f81 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -46,7 +46,7 @@ def get_leaf_size( ) -> int: # MultiModalKwargs is not a subclass of dict if isinstance(leaf, MultiModalKwargs): - return cls.get_item_size(leaf.data, debug=debug) + return cls.get_item_size(leaf.get_data(), debug=debug) # MultiModalKwargsItem is not a subclass of dict if isinstance(leaf, MultiModalKwargsItem): diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index a33ce146995d..d3f57cf5338d 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -653,7 +653,7 @@ def dummy(modality: str): def from_elems(elems: Sequence[MultiModalFieldElem]): return MultiModalKwargsItem({elem.key: elem for elem in elems}) - def __init__(self, data: Mapping[str, MultiModalFieldElem]) -> None: + def __init__(self, data: Mapping[str, MultiModalFieldElem] = {}) -> None: super().__init__(data) modalities = {elem.modality for elem in self.data.values()} @@ -668,9 +668,7 @@ def get_data(self) -> Mapping[str, NestedTensors]: return {key: elem.data for key, elem in self.items()} -# NOTE: UserDict is for V0 compatibility. -# V1 should access individual items via `get_item`. 
-class MultiModalKwargs(UserDict[str, NestedTensors]): +class MultiModalKwargs: """ A dictionary that represents the keyword arguments to [`torch.nn.Module.forward`][]. @@ -714,40 +712,16 @@ def from_hf_inputs( elems = [v[item_idx] for v in elems_in_modality.values()] items.append(MultiModalKwargsItem.from_elems(elems)) - return MultiModalKwargs.from_items(items) + return MultiModalKwargs(items) - @staticmethod - def from_items( - items: Sequence[MultiModalKwargsItem], - *, - pin_memory: bool = False, - ): - """Construct a new - [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs] - from multiple items.""" - elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list) - for item in items: - for key, elem in item.items(): - elems_by_key[key].append(elem) - - data = { - key: elems[0].field.reduce_data(elems, pin_memory=pin_memory) - for key, elems in elems_by_key.items() if len(elems) > 0 - } - - return MultiModalKwargs(data, items=items) - - def __init__( - self, - data: Mapping[str, NestedTensors], - *, - items: Optional[Sequence[MultiModalKwargsItem]] = None, - ) -> None: - super().__init__(data) + def __init__(self, items: Sequence[MultiModalKwargsItem] = ()) -> None: + super().__init__() - items_by_modality = full_groupby(items or [], key=lambda x: x.modality) + items_by_modality = full_groupby(items, key=lambda x: x.modality) self._items_by_modality = dict(items_by_modality) + self._data: Optional[Mapping[str, NestedTensors]] = None + @property def modalities(self): return self._items_by_modality.keys() @@ -839,22 +813,41 @@ def as_kwargs( return cast(BatchedTensorInputs, json_mapped) - def __delitem__(self, key: str) -> None: - super().__delitem__(key) + def keys(self): + return self.get_data().keys() + + def values(self): + return self.get_data().values() + + def items(self): + return self.get_data().items() + + def get(self, key: str, /, default=None): + return self.get_data().get(key, default) + + def pop(self, key: str, *args, **kwargs): + data = dict(self.get_data()) + res = data.pop(key, *args, **kwargs) for items in self._items_by_modality.values(): for item in items: - item.pop(key, None) + item.pop(key, *args, **kwargs) + + self._data = None + + return res + + def __iter__(self): + return iter(self.get_data()) + + def __getitem__(self, key: str): + return self.get_data()[key] def __eq__(self, other: object) -> bool: if not isinstance(other, self.__class__): return False - if self._items_by_modality != other._items_by_modality: - return False - ks = self.keys() - return (ks == other.keys() - and all(nested_tensors_equal(self[k], other[k]) for k in ks)) + return self._items_by_modality == other._items_by_modality def _validate_modality(self, method_name: str, modality: str) -> None: if not self._items_by_modality: @@ -888,6 +881,25 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]: self._validate_modality("get_items", modality) return self._items_by_modality[modality] + def get_data(self, + *, + pin_memory: bool = False) -> Mapping[str, NestedTensors]: + if self._data is not None: + return self._data + + elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list) + for items in self._items_by_modality.values(): + for item in items: + for key, elem in item.items(): + elems_by_key[key].append(elem) + + data = { + key: elems[0].field.reduce_data(elems, pin_memory=pin_memory) + for key, elems in elems_by_key.items() if len(elems) > 0 + } + self._data = data + return data + MultiModalPlaceholderDict: TypeAlias = Mapping[str, 
Sequence[PlaceholderRange]] """ diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 38c5d5d99f63..4684bf6f3d83 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1480,7 +1480,7 @@ def _cached_apply_hf_processor( mm_missing_kwargs=mm_missing_kwargs, ) - mm_kwargs = MultiModalKwargs.from_items([ + mm_kwargs = MultiModalKwargs([ item for cache_items in mm_cache_items_merged.values() for item in cache_items ]) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index f914d0dc6c5e..a80f09bb1927 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -402,12 +402,14 @@ def group_mm_kwargs_by_modality( for modality, items in groupby(mm_kwargs, key=lambda item: item.modality): items_lst = list(items) - # mm_kwargs_group = MultiModalKwargs.from_items(items_lst, - # pin_memory=pin_memory) + # mm_kwargs_group = MultiModalKwargs(items_lst) \ + # .get_data(pin_memory=pin_memory) # if device is not None: - # mm_kwargs_group = json_map_leaves(lambda x: x.to(device=device), - # mm_kwargs_group.data) + # mm_kwargs_group = json_map_leaves( + # lambda x: x.to(device=device), + # mm_kwargs_group, + # ) # TODO: Once V0 is removed, we can use the merging logic above # to avoid creating an extra batch dimension (except for fields @@ -415,7 +417,7 @@ def group_mm_kwargs_by_modality( # We will also need to update each model to remove `flatten_bn`. mm_kwargs_group = MultiModalKwargs.as_kwargs( MultiModalKwargs.batch( - [MultiModalKwargs.from_items([item]) for item in items_lst], + [MultiModalKwargs([item]) for item in items_lst], pin_memory=pin_memory, ), device=device, diff --git a/vllm/sequence.py b/vllm/sequence.py index cbe63f8d1d4e..b3be10b6bb61 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -524,7 +524,7 @@ def multi_modal_data(self) -> MultiModalKwargs: if self.inputs["type"] == "multimodal": return self.inputs["mm_kwargs"] - return MultiModalKwargs({}) + return MultiModalKwargs() @property def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: @@ -780,7 +780,7 @@ def multi_modal_data(self) -> MultiModalKwargs: return self.first_seq.multi_modal_data elif self.encoder_seq is not None: return self.encoder_seq.multi_modal_data - return MultiModalKwargs({}) + return MultiModalKwargs() @property def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 3f0fad8a64d0..2857d8ef4290 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -117,16 +117,9 @@ def enc_hook(self, obj: Any) -> Any: return self._encode_mm_item(obj) if isinstance(obj, MultiModalKwargs): - mm: MultiModalKwargs = obj - if not mm.modalities: - # just return the main dict if there are no modalities. - return dict(mm) - - # ignore the main dict, it will be re-indexed. - # Any tensors *not* indexed by modality will be ignored. 
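Item-first construction of the reworked container, mirroring the updated serialization test earlier in this patch; the modality, key name, and tensor shape are illustrative, and the import location is assumed to be vllm.multimodal.inputs:

    import torch

    from vllm.multimodal.inputs import (MultiModalBatchedField,
                                        MultiModalFieldElem, MultiModalKwargs,
                                        MultiModalKwargsItem)

    elem = MultiModalFieldElem("image", "pixel_values",
                               torch.zeros(3, 224, 224, dtype=torch.float16),
                               MultiModalBatchedField())
    item = MultiModalKwargsItem.from_elems([elem])

    mm = MultiModalKwargs([item])  # items are now the single source of truth
    assert mm.get_items("image")[0] is item
    data = mm.get_data()           # per-key tensors are materialized lazily
    assert set(data) == {"pixel_values"}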
return [ self._encode_mm_item(item) - for itemlist in mm._items_by_modality.values() + for itemlist in obj._items_by_modality.values() for item in itemlist ] @@ -268,13 +261,7 @@ def dec_hook(self, t: type, obj: Any) -> Any: if issubclass(t, MultiModalKwargsItem): return self._decode_mm_item(obj) if issubclass(t, MultiModalKwargs): - if isinstance(obj, list): - return MultiModalKwargs.from_items( - self._decode_mm_items(obj)) - return MultiModalKwargs({ - k: self._decode_nested_tensors(v) - for k, v in obj.items() - }) + return MultiModalKwargs(self._decode_mm_items(obj)) if t is UtilityResult: return self._decode_utility_result(obj) return obj diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index e718d9d5e0fb..3d4cf27a6ccf 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -58,7 +58,7 @@ def num_tokens(self) -> int: @deprecated("`mm_inputs` is superseded by `mm_kwargs` and will be " "removed in v0.13. Please use `mm_kwargs` instead.") def mm_inputs(self) -> list[MultiModalKwargs]: - return [MultiModalKwargs.from_items([item]) for item in self.mm_kwargs] + return [MultiModalKwargs([item]) for item in self.mm_kwargs] def get_token_id(self, idx: int) -> int: if idx < self.num_prompt_tokens: From 049cef9a067e5f2518c8e16d2688faddce9b31e2 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Sun, 17 Aug 2025 12:49:14 +0800 Subject: [PATCH 096/231] [Misc] method name typo fix (#23042) Signed-off-by: Andy Xie Signed-off-by: Duncan Moss --- vllm/v1/worker/cpu_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index 11b96d946365..a7180afbd64b 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -29,7 +29,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.use_cuda_graph = False self.cascade_attn_enabled = False - self._postprocess_tenosrs() + self._postprocess_tensors() def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None: """ @@ -59,7 +59,7 @@ def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None: self.attn_groups[0][0].metadata_builder.reorder_batch( self.input_batch, scheduler_output) - def _postprocess_tenosrs(self) -> None: + def _postprocess_tensors(self) -> None: # Note: replace device tensors with cpu tensors def replace_tensor(obj: Any, cpu_attr_name: str, device_attr_name) -> None: From 96907479946f58396300039b47c1cc7b350ba152 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 17 Aug 2025 13:03:24 +0800 Subject: [PATCH 097/231] [Kernel] Add cuda kernel for gpt_oss activation (#22951) Signed-off-by: Jee Jee Li Signed-off-by: Duncan Moss --- csrc/activation_kernels.cu | 59 +++++++++++++++++++ csrc/ops.h | 2 + csrc/torch_bindings.cpp | 6 ++ tests/kernels/core/test_activation.py | 45 ++++++++++++-- vllm/model_executor/layers/activation.py | 41 ++++++++++++- .../layers/fused_moe/fused_marlin_moe.py | 22 ++----- .../layers/fused_moe/fused_moe.py | 18 ++---- .../layers/quantization/utils/mxfp4_utils.py | 4 +- vllm/model_executor/models/gpt_oss.py | 2 +- 9 files changed, 157 insertions(+), 42 deletions(-) diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 55e659679701..a4a880f13cf7 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -128,6 +128,45 @@ __global__ void act_and_mul_kernel_with_param( } } +template +__device__ __forceinline__ T swigluoai_and_mul(const T& gate, 
const T& up, + float alpha, float limit) { + // clamp gate: min=None, max=limit + const float gate_f = (float)gate; + const float clamped_gate = gate_f > limit ? limit : gate_f; + + // clamp up: min=-limit, max=limit + const float up_f = (float)up; + const float clamped_up = + up_f > limit ? limit : (up_f < -limit ? -limit : up_f); + + // glu = gate * sigmoid(gate * alpha) + const float sigmoid_val = 1.0f / (1.0f + expf(-clamped_gate * alpha)); + const float glu = clamped_gate * sigmoid_val; + + // (up + 1) * glu + return (T)((clamped_up + 1.0f) * glu); +} + +template +__global__ void swigluoai_and_mul_kernel( + scalar_t* __restrict__ out, // [..., d] + const scalar_t* __restrict__ input, // [..., 2, d] + const int d, const float alpha, const float limit) { + const int64_t token_idx = blockIdx.x; + // TODO: Vectorize loads and stores. + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + // gate = x[..., ::2] (even indices) + const scalar_t gate = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx]); + // up = x[..., 1::2] (odd indices) + const scalar_t up = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx + 1]); + + out[token_idx * d + idx] = ACT_FN(gate, up, alpha, limit); + } +} + } // namespace vllm #define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \ @@ -145,11 +184,31 @@ __global__ void act_and_mul_kernel_with_param( PARAM); \ }); +#define LAUNCH_SIGLUOAI_AND_MUL(KERNEL, ALPHA, LIMIT) \ + int d = input.size(-1) / 2; \ + int64_t num_tokens = input.numel() / input.size(-1); \ + dim3 grid(num_tokens); \ + dim3 block(std::min(d, 1024)); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "clamp_swiglu_kernel_with_params", [&] { \ + vllm::swigluoai_and_mul_kernel> \ + <<>>(out.data_ptr(), \ + input.data_ptr(), d, ALPHA, \ + LIMIT); \ + }); + void fatrelu_and_mul(torch::Tensor& out, // [..., d], torch::Tensor& input, // [..., 2 * d] double threshold) { LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold); } +void swigluoai_and_mul(torch::Tensor& out, // [..., d] + torch::Tensor& input, // [..., 2 * d] + double alpha, double limit) { + LAUNCH_SIGLUOAI_AND_MUL(vllm::swigluoai_and_mul, alpha, limit); +} namespace vllm { // Element-wise activation kernel template. diff --git a/csrc/ops.h b/csrc/ops.h index 6e39758f16a1..64bcec6ca152 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -138,6 +138,8 @@ void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input); void fatrelu_and_mul(torch::Tensor& out, torch::Tensor& input, double threshold); +void swigluoai_and_mul(torch::Tensor& out, torch::Tensor& input, + double alpha = 1.702, double limit = 7.0); void gelu_new(torch::Tensor& out, torch::Tensor& input); diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 5fee106335d3..7079671c2eb1 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -130,6 +130,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("fatrelu_and_mul(Tensor! out, Tensor input, float threshold) -> ()"); ops.impl("fatrelu_and_mul", torch::kCUDA, &fatrelu_and_mul); + ops.def( + "swigluoai_and_mul(Tensor! out, Tensor input, float alpha=1.702, float " + "limit=7.0) " + "-> ()"); + ops.impl("swigluoai_and_mul", torch::kCUDA, &swigluoai_and_mul); + // GELU implementation used in GPT-2. ops.def("gelu_new(Tensor! 
out, Tensor input) -> ()"); ops.impl("gelu_new", torch::kCUDA, &gelu_new); diff --git a/tests/kernels/core/test_activation.py b/tests/kernels/core/test_activation.py index 29c5e70a8ba8..ec5c60fd7b0e 100644 --- a/tests/kernels/core/test_activation.py +++ b/tests/kernels/core/test_activation.py @@ -11,7 +11,7 @@ from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul, GeluAndMul, MulAndSilu, NewGELU, QuickGELU, - SiluAndMul) + SiluAndMul, SwigluOAIAndMul) from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] @@ -25,7 +25,15 @@ @pytest.mark.parametrize( "activation", - ["silu_and_mul", "mul_and_silu", "gelu", "gelu_tanh", "fatrelu"]) + [ + "silu_and_mul", + "mul_and_silu", + "gelu", + "gelu_tanh", + "fatrelu", + "swigluoai_and_mul", + ], +) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @@ -59,18 +67,43 @@ def test_act_and_mul( threshold = random.uniform(0, 1) layer = FatreluAndMul(threshold) fn = torch.ops._C.fatrelu_and_mul + elif activation == "swigluoai_and_mul": + layer = SwigluOAIAndMul() + fn = torch.ops._C.swigluoai_and_mul out = layer(x) ref_out = layer.forward_native(x) - # The SiluAndMul, MulAndSilu, GELU and FatReLU implementations are - # equivalent to the native PyTorch implementations, so we can do exact - # comparison. - torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0) + if activation == "swigluoai_and_mul": + + rtol = { + #For fp16, change the relative tolerance from 1e-3 to 2e-3 + torch.float16: + 2e-3, + torch.bfloat16: + 2e-2, + torch.float: + 1.3e-6 + } + + def _get_rtol(output) -> float: + return rtol[output.dtype] + + torch.testing.assert_close(out, + ref_out, + atol=get_default_atol(out), + rtol=_get_rtol(out)) + else: + # The SiluAndMul, MulAndSilu, GELU and FatReLU implementations are + # equivalent to the native PyTorch implementations, so we can do exact + # comparison. 
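For reference, the math the new swigluoai_and_mul op is expected to reproduce (gate in even columns, up in odd columns of the input), written as a standalone torch function; this mirrors SwigluOAIAndMul.forward_native defined later in the patch and is intended only as a checking aid:

    import torch

    def swigluoai_reference(x: torch.Tensor,
                            alpha: float = 1.702,
                            limit: float = 7.0) -> torch.Tensor:
        # x: [..., 2 * d] with interleaved gate/up, matching the kernel layout.
        gate, up = x[..., ::2], x[..., 1::2]
        gate = gate.clamp(max=limit)             # gate minimum is unbounded
        up = up.clamp(min=-limit, max=limit)
        glu = gate * torch.sigmoid(gate * alpha)
        return (up + 1) * glu                    # shape [..., d]

    # Optional check against the fused CUDA path (requires a CUDA build):
    # x = torch.randn(16, 1024, dtype=torch.float16, device="cuda")
    # out = torch.empty(x.shape[:-1] + (x.shape[-1] // 2,),
    #                   dtype=x.dtype, device=x.device)
    # torch.ops._C.swigluoai_and_mul(out, x, 1.702, 7.0)
    # torch.testing.assert_close(out, swigluoai_reference(x),
    #                            rtol=2e-3, atol=1e-3)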
+ torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0) d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) if activation == "fatrelu": opcheck(fn, (out, x, threshold)) + elif activation == "swigluoai_and_mul": + opcheck(fn, (out, x, layer.alpha, layer.limit)) else: opcheck(fn, (out, x)) diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 7ce44174ead6..86ab4f546d12 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -239,6 +239,35 @@ def extra_repr(self) -> str: return f'approximate={repr(self.approximate)}' +@CustomOp.register("swigluoai_and_mul") +class SwigluOAIAndMul(CustomOp): + # https://github.com/huggingface/transformers/blob/v4.55.0/src/transformers/models/gpt_oss/modeling_gpt_oss.py#L106-L110 + def __init__(self, alpha: float = 1.702, limit: float = 7.0): + super().__init__() + self.alpha = alpha + self.limit = limit + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + + gate, up = x[..., ::2], x[..., 1::2] + gate = gate.clamp(min=None, max=self.limit) + up = up.clamp(min=-self.limit, max=self.limit) + glu = gate * torch.sigmoid(gate * self.alpha) + gated_output = (up + 1) * glu + return gated_output + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + torch.ops._C.swigluoai_and_mul(out, x, self.alpha, self.limit) + return out + + def extra_repr(self) -> str: + return f"alpha={repr(self.alpha)}, limit={repr(self.limit)}" + + @CustomOp.register("gelu_new") class NewGELU(CustomOp): @@ -330,6 +359,7 @@ def forward_native(self, x: torch.Tensor) -> torch.Tensor: return torch.square(F.relu(x)) def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + #TODO : implement cuda kenrels return self.forward_native(x) @@ -406,9 +436,14 @@ def get_act_fn(act_fn_name: str) -> nn.Module: _ACTIVATION_AND_MUL_REGISTRY = LazyDict({ - "gelu": lambda: GeluAndMul(), - "silu": lambda: SiluAndMul(), - "geglu": lambda: GeluAndMul(), + "gelu": + lambda: GeluAndMul(), + "silu": + lambda: SiluAndMul(), + "geglu": + lambda: GeluAndMul(), + "swigluoai": + lambda *args, **kwargs: SwigluOAIAndMul(*args, **kwargs), }) diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 3c6ece6737e4..1e3ac6cd79f6 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -161,25 +161,13 @@ def fused_marlin_moe(hidden_states: torch.Tensor, if activation == "silu": torch.ops._C.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) - elif activation == "swiglu_oai": - # NOTE: in gpt-oss, the gate_proj and up_proj is interleaved - # - interleaved: gate, up = gate_up[..., ::2], gate_up[..., 1::2] - # - origin: gate, up = gate_up[..., :N], gate_up[..., N:] - - @torch.compile(dynamic=True) - def swiglu_oai(gate_up): - alpha = 1.702 - limit = 7.0 - gate, up = gate_up[..., ::2], gate_up[..., 1::2] - gate = gate.clamp(min=None, max=limit) - up = up.clamp(min=-limit, max=limit) - glu = gate * torch.sigmoid(gate * alpha) - return (up + 1) * glu - - intermediate_cache2 = swiglu_oai(intermediate_cache1) + elif activation == "swigluoai": + # alpha = 1.702, limit = 7.0 + 
torch.ops._C.swigluoai_and_mul(intermediate_cache2, + intermediate_cache1.view(-1, 2 * N)) else: raise ValueError(f"Unsupported activation: {activation}. " - "Only silu and swiglu_oai activations are supported.") + "Only silu and swigluoai activations are supported.") if expert_map is not None: intermediate_cache3.zero_() diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 3579ca22bafc..02b7b65f4a02 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1621,17 +1621,6 @@ def fused_experts_impl( block_shape=block_shape, B_bias=w1_bias) - # TODO fused kernel - def swiglu_oai(gate_up): - alpha = 1.702 - limit = 7.0 - gate, up = gate_up[..., ::2], gate_up[..., 1::2] - gate = gate.clamp(min=None, max=limit) - up = up.clamp(min=-limit, max=limit) - glu = gate * torch.sigmoid(gate * alpha) - gated_output = (up + 1) * glu - return gated_output - # Activation function with multiplication if activation == "silu" and is_act_and_mul: torch.ops._C.silu_and_mul(intermediate_cache2, @@ -1639,13 +1628,16 @@ def swiglu_oai(gate_up): elif activation == "gelu" and is_act_and_mul: torch.ops._C.gelu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) + elif activation == "swigluoai" and is_act_and_mul: + # alpha = 1.702, limit = 7.0 + torch.ops._C.swigluoai_and_mul(intermediate_cache2, + intermediate_cache1.view(-1, N)) # Activation function without multiplication elif activation == "silu": intermediate_cache2 = F.silu(intermediate_cache1.view(-1, N)) elif activation == "gelu": intermediate_cache2 = F.gelu(intermediate_cache1.view(-1, N)) - elif activation == "swiglu_oai": - intermediate_cache2 = swiglu_oai(intermediate_cache1.view(-1, N)) + else: raise ValueError(f"Unsupported FusedMoe activation: {activation}, " f"with is_act_and_mul={is_act_and_mul}.") diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index deeb69bcad0e..48f9cc3737e4 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -61,14 +61,14 @@ def _can_support_mxfp4(use_grouped_topk: bool = False, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, scoring_func: str = "softmax", - activation: str = "swiglu_oai", + activation: str = "swigluoai", expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None): return not (use_grouped_topk or topk_group or num_expert_group or expert_map or custom_routing_function or e_score_correction_bias or apply_router_weight_on_input - or scoring_func != "softmax" or activation != "swiglu_oai" + or scoring_func != "softmax" or activation != "swigluoai" or expert_load_view or logical_to_physical_map or logical_replica_count) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 7c7712dbe106..2f5d9ddd9054 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -159,7 +159,7 @@ def __init__( prefix=f"{prefix}.experts", apply_router_weight_on_input=False, has_bias=True, - activation="swiglu_oai") + activation="swigluoai") def forward(self, x: torch.Tensor) -> torch.Tensor: t = self.norm(x) From 1e8a902b48fa59ac020053074b057c95739b9c12 Mon Sep 17 00:00:00 2001 From: 947132885 
<947132885@qq.com> Date: Sun, 17 Aug 2025 16:46:36 +0800 Subject: [PATCH 098/231] [Bugfix] should use stack instead of concat (#22972) Signed-off-by: 947132885 <947132885@qq.com> Signed-off-by: Isotr0py Co-authored-by: Isotr0py Signed-off-by: Duncan Moss --- vllm/model_executor/models/transformers.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 4ec2b683fc33..f3b7263ca387 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -694,6 +694,17 @@ def compute_logits( return logits +def flatten_and_concat(x: list[torch.Tensor]) -> torch.Tensor: + """Flatten until a list of tensors can be concatenated then do concat""" + + def _can_concat(x: list[torch.Tensor]): + return len(set(map(lambda _x: _x.shape[1:], x))) == 1 + + if _can_concat(x): + return torch.concat(x) + return flatten_and_concat(flatten_bn(x)) + + @MULTIMODAL_REGISTRY.register_processor( MultiModalProcessor, info=MultiModalProcessingInfo, @@ -766,8 +777,7 @@ def get_multimodal_embeddings(self, **kwargs): if isinstance(pixel_values, torch.Tensor): pixel_values = flatten_bn(pixel_values).to(self.dtype) elif is_list_of(pixel_values, torch.Tensor): - pixel_values = flatten_bn(flatten_bn(pixel_values), - concat=True).to(self.dtype) + pixel_values = flatten_and_concat(pixel_values).to(self.dtype) else: raise ValueError( f"Unsupported pixel_values type {type(pixel_values)}. " From db8f53501276f33d4f414fe39a49c81f1c525159 Mon Sep 17 00:00:00 2001 From: Kevinzz Date: Sun, 17 Aug 2025 16:56:20 +0800 Subject: [PATCH 099/231] [Misc] fix typo in the multimodal doc (#23051) Signed-off-by: Duncan Moss --- docs/features/multimodal_inputs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index cdd32924b566..9d51f9cf52f5 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -216,7 +216,7 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown from vllm import LLM, SamplingParams from qwen_vl_utils import process_vision_info - model_path = "Qwen/Qwen2.5-VL-3B-Instruct/" + model_path = "Qwen/Qwen2.5-VL-3B-Instruct" video_path = "https://content.pexels.com/videos/free-videos.mp4" llm = LLM( From e6bc394626c6df300524856718922d4516d61af7 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Sun, 17 Aug 2025 11:52:04 -0400 Subject: [PATCH 100/231] [BugFix] Fix for IMA in FA3 varlen combine (#22967) Signed-off-by: Lucas Wilkinson Signed-off-by: Duncan Moss --- cmake/external_projects/vllm_flash_attn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index 4e2a0e4533e6..49defccbb1fa 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 2d3b7508f67ad976f781e2042ace676419dd78dd + GIT_TAG 57b4e68b9f9d94750b46de8f8dbd2bfcc86edd4f GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From f924f5ac9738a12258b2b8ecca2f7e0b65f76036 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 17 Aug 2025 10:36:46 -0700 Subject: [PATCH 101/231] [Misc] Remove dead 
return (#23061) Signed-off-by: Woosuk Kwon Signed-off-by: Duncan Moss --- vllm/model_executor/models/qwen2_vl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index f2d438b3850b..9e2f7ca42b4b 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1225,7 +1225,6 @@ def get_multimodal_embeddings(self, modalities = self._parse_and_validate_multimodal_inputs(**kwargs) if not modalities: return [] - return None # The result multimodal_embeddings is tuple of tensors, with each # tensor correspoending to a multimodal data item (image or video). From 72d3950d9f3aa4509337cc638934baf963f5ad0b Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 17 Aug 2025 12:41:38 -0700 Subject: [PATCH 102/231] [Misc] Convert use_structured_output property into constant (#23060) Signed-off-by: Woosuk Kwon Signed-off-by: Duncan Moss --- vllm/v1/request.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 8b703b6191fe..4e99a9ccef46 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -54,8 +54,7 @@ def __init__( time.time() self.status = RequestStatus.WAITING - if sampling_params and sampling_params.guided_decoding is not None: - self.status = RequestStatus.WAITING_FOR_FSM + self.use_structured_output = False self.events: list[EngineCoreEvent] = [] self.stop_reason: Union[int, str, None] = None @@ -63,12 +62,15 @@ def __init__( self.kv_transfer_params: Optional[dict[str, Any]] = None if pooling_params is not None: + # Pooling models. self.max_tokens = 1 elif sampling_params is not None: + # Generative models. assert sampling_params.max_tokens is not None self.max_tokens = sampling_params.max_tokens if sampling_params.guided_decoding is not None: self.status = RequestStatus.WAITING_FOR_FSM + self.use_structured_output = True if sampling_params.extra_args is not None: self.kv_transfer_params = \ @@ -192,11 +194,6 @@ def get_num_encoder_tokens(self, input_id: int) -> int: num_tokens = self.mm_positions[input_id].length return num_tokens - @property - def use_structured_output(self) -> bool: - return self.sampling_params is not None and \ - self.sampling_params.guided_decoding is not None - def record_event( self, event_type: EngineCoreEventType, From 9310d154cba33c8bbbed4d39f0199038619be0ab Mon Sep 17 00:00:00 2001 From: Calvin Chen Date: Mon, 18 Aug 2025 05:45:42 +0800 Subject: [PATCH 103/231] [XPU] fix xpu to set cudagraph batch sizes (#23044) Signed-off-by: calvin chen Signed-off-by: Duncan Moss --- vllm/v1/worker/gpu_model_runner.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4219d9147ada..adaa1306f6ca 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -232,8 +232,10 @@ def __init__( # The convention is different. # self.cudagraph_batch_sizes sorts in ascending order. # The batch sizes in the config are in descending order. - self.cudagraph_batch_sizes = list( - reversed(self.compilation_config.cudagraph_capture_sizes)) + if self.compilation_config.cudagraph_capture_sizes and \ + self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE: + self.cudagraph_batch_sizes = list( + reversed(self.compilation_config.cudagraph_capture_sizes)) # Cache the device properties. 
self._init_device_properties() From 071fdbfca8b65854054571d62ee8aeb2d32a6358 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Sun, 17 Aug 2025 15:56:07 -0700 Subject: [PATCH 104/231] fix: gptq marlin weight loading failure (#23066) Signed-off-by: Duncan Moss --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index bd14ab9ef6c6..c5d1e017014f 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -56,7 +56,7 @@ def get_moe_quant_method( # Dynamic per module/layer rules may override base config override_config(cloned_config, prefix=prefix) - return moe_method_cls(cloned_config) + return moe_method_cls(cloned_config, layer.moe_config) return None From 625926c999e44bfb4e2329f6f54afe2ddba82773 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 17 Aug 2025 18:16:03 -0700 Subject: [PATCH 105/231] [Misc] Minor code cleanup for _get_prompt_logprobs_dict (#23064) Signed-off-by: Woosuk Kwon Signed-off-by: Duncan Moss --- vllm/v1/worker/gpu_model_runner.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index adaa1306f6ca..fc320be1c3bd 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1722,7 +1722,7 @@ def execute_model( # Compute prompt logprobs if needed. prompt_logprobs_dict = self._get_prompt_logprobs_dict( hidden_states[:num_scheduled_tokens], - scheduler_output, + scheduler_output.num_scheduled_tokens, ) # Get the valid generated tokens. @@ -2064,7 +2064,7 @@ def save_tensorized_model( def _get_prompt_logprobs_dict( self, hidden_states: torch.Tensor, - scheduler_output: "SchedulerOutput", + num_scheduled_tokens: dict[str, int], ) -> dict[str, Optional[LogprobsTensors]]: num_prompt_logprobs_dict = self.input_batch.num_prompt_logprobs if not num_prompt_logprobs_dict: @@ -2077,8 +2077,7 @@ def _get_prompt_logprobs_dict( # maintainable loop over optimal performance. completed_prefill_reqs = [] for req_id, num_prompt_logprobs in num_prompt_logprobs_dict.items(): - - num_tokens = scheduler_output.num_scheduled_tokens[req_id] + num_tokens = num_scheduled_tokens[req_id] # Get metadata for this request. 
request = self.requests[req_id] From cf0a037f5e31c4124dcb812c9c58a5d407dc620b Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Mon, 18 Aug 2025 13:09:08 +0800 Subject: [PATCH 106/231] [Misc] enhance static type hint (#23059) Signed-off-by: Andy Xie Signed-off-by: Duncan Moss --- vllm/v1/worker/lora_model_runner_mixin.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index 2fbdee4724e3..84ed46989ea9 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -8,6 +8,7 @@ from typing import Union import numpy as np +import torch import torch.nn as nn from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig @@ -31,7 +32,8 @@ class LoRAModelRunnerMixin: def load_lora_model(self, model: nn.Module, model_config: ModelConfig, scheduler_config: SchedulerConfig, - lora_config: LoRAConfig, device: str) -> nn.Module: + lora_config: LoRAConfig, + device: torch.device) -> nn.Module: if not supports_lora(model): raise ValueError( From 0a3d765b69e67983a49120482b53b81ea3ccc549 Mon Sep 17 00:00:00 2001 From: double7 <33449816+DoubleVII@users.noreply.github.com> Date: Mon, 18 Aug 2025 13:09:11 +0800 Subject: [PATCH 107/231] [Bugfix] fix Qwen2.5-Omni processor output mapping (#23058) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: double7 <33449816+DoubleVII@users.noreply.github.com> Co-authored-by: 杨森 Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Signed-off-by: Duncan Moss --- vllm/model_executor/models/qwen2_5_omni_thinker.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index e95295c31885..59411eb7503b 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -88,6 +88,11 @@ def _qwen2_5_omni_thinker_field_config(hf_inputs: Mapping[str, torch.Tensor]): video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3))) video_grid_sizes = video_grid_thw.prod(-1) + # vllm use `second_per_grid_ts` to compute multimodal rotary embedding + video_second_per_grid = hf_inputs.get("video_second_per_grid", None) + if video_second_per_grid is not None: + hf_inputs["second_per_grid_ts"] = video_second_per_grid + return dict( input_audio_features=MultiModalFieldConfig.flat_from_sizes( "audio", audio_feature_lengths, dim=1), From d117d48728bf468e12f43bc15f5af10fdafaeb28 Mon Sep 17 00:00:00 2001 From: Andy Lo Date: Mon, 18 Aug 2025 07:10:26 +0200 Subject: [PATCH 108/231] [Bugfix][CI] Machete kernels: deterministic ordering for more cache hits (#23055) Signed-off-by: Andy Lo Signed-off-by: Duncan Moss --- csrc/quantization/machete/generate.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index 9af7833d09f3..88b3f9c734a3 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -349,9 +349,12 @@ def _to_cute_constant(value: int): def unique_schedules(impl_configs: list[ImplConfig]): - return list( - set(sch for impl_config in impl_configs - for sch in impl_config.schedules)) + # Use dict over set for deterministic ordering + return list({ + sch: None + for impl_config in impl_configs + for sch in impl_config.schedules + }.keys()) def 
unsigned_type_with_bitwidth(num_bits): From d623acbaead386af0770a2fa8868ef7bdf455a84 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Mon, 18 Aug 2025 13:16:21 +0800 Subject: [PATCH 109/231] [Misc] refactor function name (#23029) Signed-off-by: Andy Xie Signed-off-by: Duncan Moss --- vllm/platforms/cpu.py | 2 +- vllm/v1/worker/cpu_worker.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 0b16a8e1d1d8..fe258f76b9d7 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -268,7 +268,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: DEFAULT_MAX_NUM_BATCHED_TOKENS) @classmethod - def get_allowed_cpu_memory_node_list( + def get_allowed_cpu_core_node_list( cls) -> tuple[list[int], list[LogicalCPUInfo]]: assert platform.system() == "Linux" diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py index 2dc28d93049a..f83d6804840e 100644 --- a/vllm/v1/worker/cpu_worker.py +++ b/vllm/v1/worker/cpu_worker.py @@ -132,7 +132,7 @@ def _get_autobind_cpu_ids( """ allowed_numa_nodes, logical_cpu_list = \ - CpuPlatform.get_allowed_cpu_memory_node_list() + CpuPlatform.get_allowed_cpu_core_node_list() assert len(allowed_numa_nodes) >= self.parallel_config.world_size, ( f"No enough allowed NUMA nodes to bind threads of " f"{self.parallel_config.world_size} CPUWorkers. " From 3f9a58948a8b8260387c96445c64d666c24d3975 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 17 Aug 2025 23:33:29 -0700 Subject: [PATCH 110/231] [Misc] Fix backward compatibility from #23030 (#23070) Signed-off-by: Roger Wang Co-authored-by: Roger Wang Signed-off-by: Duncan Moss --- vllm/multimodal/base.py | 9 ++++++--- vllm/multimodal/inputs.py | 6 +++--- vllm/sequence.py | 4 +++- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index ef8f1b2e17b4..c4bb8d56ce3e 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: from vllm.sequence import SequenceGroupMetadata -from .inputs import MultiModalKwargs, PlaceholderRange +from .inputs import MultiModalKwargs, NestedTensors, PlaceholderRange _T = TypeVar("_T") @@ -56,7 +56,8 @@ def __init__(self): @classmethod def from_seq_group( cls, seq_group: "SequenceGroupMetadata", positions: range - ) -> tuple[MultiModalKwargs, dict[str, "MultiModalPlaceholderMap"]]: + ) -> tuple[dict[str, NestedTensors], dict[str, + "MultiModalPlaceholderMap"]]: """ Returns the multi-modal items that intersect with the portion of a prompt (``seq_group``) represented by ``positions``, as well as a @@ -99,7 +100,7 @@ def from_seq_group( seq_mm_placeholders = seq_group.multi_modal_placeholders if not seq_mm_data or not seq_mm_placeholders: - return MultiModalKwargs(), {} + return MultiModalKwargs().get_data(), {} placeholder_maps = dict[str, MultiModalPlaceholderMap]() @@ -116,6 +117,8 @@ def from_seq_group( placeholder_maps[modality] = placeholder_map + seq_mm_data = seq_mm_data if isinstance( + seq_mm_data, dict) else seq_mm_data.get_data() return seq_mm_data, placeholder_maps def append_items_from_seq_group( diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index d3f57cf5338d..3e0bfce59c5f 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -664,7 +664,7 @@ def __init__(self, data: Mapping[str, MultiModalFieldElem] = {}) -> None: def modality(self) -> str: return self._modality - def get_data(self) -> Mapping[str, NestedTensors]: + def get_data(self) -> 
dict[str, NestedTensors]: return {key: elem.data for key, elem in self.items()} @@ -720,7 +720,7 @@ def __init__(self, items: Sequence[MultiModalKwargsItem] = ()) -> None: items_by_modality = full_groupby(items, key=lambda x: x.modality) self._items_by_modality = dict(items_by_modality) - self._data: Optional[Mapping[str, NestedTensors]] = None + self._data: Optional[dict[str, NestedTensors]] = None @property def modalities(self): @@ -883,7 +883,7 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]: def get_data(self, *, - pin_memory: bool = False) -> Mapping[str, NestedTensors]: + pin_memory: bool = False) -> dict[str, NestedTensors]: if self._data is not None: return self._data diff --git a/vllm/sequence.py b/vllm/sequence.py index b3be10b6bb61..2cb254381eff 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -22,6 +22,7 @@ from vllm.sampling_params import RequestOutputKind, SamplingParams if TYPE_CHECKING: + from vllm.multimodal.inputs import NestedTensors from vllm.v1.worker.kv_connector_model_runner_mixin import ( KVConnectorOutput) @@ -978,7 +979,8 @@ class SequenceGroupMetadata( state: Optional[SequenceGroupState] = msgspec.field( default_factory=lambda: SequenceGroupState()) token_type_ids: Optional[list[int]] = None - multi_modal_data: Optional[MultiModalKwargs] = None + multi_modal_data: Optional[Union[MultiModalKwargs, + dict[str, "NestedTensors"]]] = None multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None encoder_seq_data: Optional[SequenceData] = None cross_block_table: Optional[list[int]] = None From f562f663d4d6a54319fe70bb25c9eed5b0d2eae0 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Mon, 18 Aug 2025 15:04:08 +0800 Subject: [PATCH 111/231] [XPU] Fix compile size for xpu (#23069) Signed-off-by: Kunshang Ji Signed-off-by: Duncan Moss --- vllm/config/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 51db277f65dc..cd2be212c23d 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3548,7 +3548,7 @@ def __post_init__(self): if self.compilation_config.pass_config.enable_sequence_parallelism: self.compilation_config.custom_ops.append("+rms_norm") - if current_platform.is_cuda_alike(): + if current_platform.is_cuda_alike() or current_platform.is_xpu(): # if cudagraph_mode is not explicitly set by users, set default # value if self.compilation_config.cudagraph_mode is None: From 2c4678694144683e8ed4b40b8885569bb116f8ec Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Mon, 18 Aug 2025 17:47:03 +0800 Subject: [PATCH 112/231] [XPU][CI]add xpu env vars in CI scripts (#22946) Signed-off-by: Kunshang Ji Signed-off-by: Duncan Moss --- .buildkite/scripts/hardware_ci/run-xpu-test.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index deb61a9bafab..445cd2735c19 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -23,9 +23,13 @@ docker run \ --device /dev/dri \ -v /dev/dri/by-path:/dev/dri/by-path \ --entrypoint="" \ + -e "HF_TOKEN=${HF_TOKEN}" \ + -e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" \ --name "${container_name}" \ "${image_name}" \ - sh -c ' + bash -c ' + set -e + echo $ZE_AFFINITY_MASK VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager VLLM_USE_V1=1 python3 
examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp @@ -35,8 +39,8 @@ docker run \ pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/structured_output - pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py - pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py + pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py + pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py pytest -v -s v1/test_serial_utils.py pytest -v -s v1/test_utils.py pytest -v -s v1/test_metrics_reader.py From 445e353b8d559025266db428696722ccf1a2c600 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 18 Aug 2025 17:52:00 +0800 Subject: [PATCH 113/231] [Refactor] Define MultiModalKwargsItems separate from MultiModalKwargs (#23053) Signed-off-by: DarkLight1337 Signed-off-by: Duncan Moss --- docs/api/README.md | 1 + docs/contributing/model/multimodal.md | 4 +- .../multimodal/processing/test_common.py | 14 +- .../multimodal/processing/test_glm4_1v.py | 3 +- .../multimodal/processing/test_h2ovl.py | 3 +- .../multimodal/processing/test_internvl.py | 3 +- .../multimodal/processing/test_llama4.py | 10 +- .../multimodal/processing/test_mllama.py | 6 +- .../multimodal/processing/test_mllama4.py | 10 +- .../multimodal/processing/test_nemotron_vl.py | 3 +- .../multimodal/processing/test_qwen2_vl.py | 3 +- tests/models/multimodal/test_tensor_schema.py | 2 +- tests/multimodal/test_cache.py | 11 +- tests/v1/test_serial_utils.py | 22 ++- vllm/executor/msgspec_utils.py | 9 +- vllm/model_executor/models/aria.py | 4 +- vllm/model_executor/models/aya_vision.py | 4 +- vllm/model_executor/models/blip2.py | 4 +- vllm/model_executor/models/chameleon.py | 4 +- vllm/model_executor/models/cohere2_vision.py | 4 +- vllm/model_executor/models/deepseek_vl2.py | 7 +- vllm/model_executor/models/florence2.py | 4 +- vllm/model_executor/models/fuyu.py | 4 +- vllm/model_executor/models/gemma3_mm.py | 4 +- vllm/model_executor/models/gemma3n_mm.py | 4 +- vllm/model_executor/models/glm4_1v.py | 10 +- vllm/model_executor/models/glm4v.py | 4 +- vllm/model_executor/models/granite_speech.py | 4 +- vllm/model_executor/models/h2ovl.py | 16 +- .../models/hyperclovax_vision.py | 27 +-- vllm/model_executor/models/idefics3.py | 4 +- vllm/model_executor/models/interns1.py | 13 +- vllm/model_executor/models/internvl.py | 34 ++-- vllm/model_executor/models/keye.py | 7 +- vllm/model_executor/models/kimi_vl.py | 4 +- vllm/model_executor/models/llava.py | 6 +- .../model_executor/models/llava_next_video.py | 4 +- vllm/model_executor/models/llava_onevision.py | 4 +- vllm/model_executor/models/minicpmo.py | 4 +- vllm/model_executor/models/minicpmv.py | 4 +- vllm/model_executor/models/mistral3.py | 4 +- vllm/model_executor/models/mllama.py | 7 +- vllm/model_executor/models/mllama4.py | 12 +- 
vllm/model_executor/models/molmo.py | 4 +- vllm/model_executor/models/nvlm_d.py | 13 +- vllm/model_executor/models/ovis.py | 9 +- vllm/model_executor/models/paligemma.py | 4 +- vllm/model_executor/models/phi3v.py | 4 +- vllm/model_executor/models/phi4_multimodal.py | 4 +- vllm/model_executor/models/phi4mm.py | 4 +- vllm/model_executor/models/pixtral.py | 7 +- .../models/prithvi_geospatial_mae.py | 7 +- .../models/qwen2_5_omni_thinker.py | 15 +- vllm/model_executor/models/qwen2_audio.py | 7 +- vllm/model_executor/models/qwen2_vl.py | 7 +- vllm/model_executor/models/qwen_vl.py | 4 +- vllm/model_executor/models/skyworkr1v.py | 13 +- vllm/model_executor/models/step3_vl.py | 14 +- vllm/model_executor/models/tarsier.py | 4 +- vllm/model_executor/models/transformers.py | 6 +- vllm/model_executor/models/ultravox.py | 9 +- vllm/model_executor/models/voxtral.py | 7 +- vllm/model_executor/models/whisper.py | 4 +- vllm/multimodal/__init__.py | 4 +- vllm/multimodal/base.py | 9 +- vllm/multimodal/cache.py | 21 ++- vllm/multimodal/inputs.py | 172 ++++++++---------- vllm/multimodal/parse.py | 11 +- vllm/multimodal/processing.py | 38 ++-- vllm/multimodal/profiling.py | 4 +- vllm/multimodal/utils.py | 25 ++- vllm/sequence.py | 6 +- vllm/v1/engine/processor.py | 2 +- vllm/v1/serial_utils.py | 41 ++++- vllm/v1/worker/gpu_input_batch.py | 10 +- vllm/v1/worker/gpu_model_runner.py | 5 +- vllm/v1/worker/tpu_model_runner.py | 5 +- 77 files changed, 431 insertions(+), 383 deletions(-) diff --git a/docs/api/README.md b/docs/api/README.md index 327472df1d52..57142e8f5625 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -77,6 +77,7 @@ Internal data structures. - [vllm.multimodal.inputs.MultiModalFieldElem][] - [vllm.multimodal.inputs.MultiModalFieldConfig][] - [vllm.multimodal.inputs.MultiModalKwargsItem][] +- [vllm.multimodal.inputs.MultiModalKwargsItems][] - [vllm.multimodal.inputs.MultiModalKwargs][] - [vllm.multimodal.inputs.MultiModalInputs][] diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index 64a48be32645..76d0f067fd45 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -629,7 +629,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index @@ -778,7 +778,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() bos_token_id = hf_config.bos_token_id diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 906966ddd064..a1744317b394 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -370,10 +370,16 @@ def _assert_inputs_equal( if ignore_mm_keys is None: ignore_mm_keys = set() - assert "mm_kwargs" in a and "mm_kwargs" in b, msg + a_rest = {k: v for k, v in a.items() if k != "mm_kwargs"} + b_rest = {k: v for k, v in b.items() if k != "mm_kwargs"} + + assert a_rest == b_rest, msg + + a_data = a["mm_kwargs"].get_data() + b_data = 
b["mm_kwargs"].get_data() for key in ignore_mm_keys: - a["mm_kwargs"].pop(key, None) - b["mm_kwargs"].pop(key, None) + a_data.pop(key, None) + b_data.pop(key, None) - assert a == b, msg + assert a_data == b_data, msg diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py index a6d900ec5d89..a49842e1099c 100644 --- a/tests/models/multimodal/processing/test_glm4_1v.py +++ b/tests/models/multimodal/processing/test_glm4_1v.py @@ -45,7 +45,8 @@ def test_processor_override( video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token) video_tok_count = processed_inputs["prompt_token_ids"].count( video_token_id) - grid_t, _, _ = processed_inputs["mm_kwargs"]["video_grid_thw"][0] + grid_t, _, _ = processed_inputs["mm_kwargs"].get_data( + )["video_grid_thw"][0] assert grid_t == expected_grid_t assert video_tok_count == expected_toks_per_frame * grid_t diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py index 76e4acc67d4d..1adfe21352c4 100644 --- a/tests/models/multimodal/processing/test_h2ovl.py +++ b/tests/models/multimodal/processing/test_h2ovl.py @@ -108,7 +108,8 @@ def _run_check( # Ensure we have the right number of placeholders per num_crops size image_token_id = tokenizer.convert_tokens_to_ids("") img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) - pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape + pixel_shape = processed_inputs["mm_kwargs"].get_data( + )["pixel_values_flat"].shape assert img_tok_count == 256 * total_expected_num_patches assert pixel_shape[0] == total_expected_num_patches diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py index c3e2841a8f06..e4f25f5ac712 100644 --- a/tests/models/multimodal/processing/test_internvl.py +++ b/tests/models/multimodal/processing/test_internvl.py @@ -68,7 +68,8 @@ def _run_check( # Ensure we have the right number of placeholders per num_crops size image_token_id = tokenizer.convert_tokens_to_ids("") img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) - pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape + pixel_shape = processed_inputs["mm_kwargs"].get_data( + )["pixel_values_flat"].shape assert img_tok_count == 256 * total_expected_num_patches assert pixel_shape[0] == total_expected_num_patches diff --git a/tests/models/multimodal/processing/test_llama4.py b/tests/models/multimodal/processing/test_llama4.py index 5e14f0f9964d..bea4f43567ee 100644 --- a/tests/models/multimodal/processing/test_llama4.py +++ b/tests/models/multimodal/processing/test_llama4.py @@ -51,14 +51,14 @@ def test_processor_override( prompt = encode_tokens(tokenizer, prompt) processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) - mm_kwargs = processed_inputs["mm_kwargs"] + mm_data = processed_inputs["mm_kwargs"].get_data() # place holder replacements prompt_token_ids = processed_inputs["prompt_token_ids"] assert prompt_token_ids.count(config.boi_token_index) == num_imgs assert prompt_token_ids.count(config.eoi_token_index) == num_imgs assert prompt_token_ids.count(vocab[hf_processor.image_token]) == num_imgs - aspect_ratios = mm_kwargs["aspect_ratios"] + aspect_ratios = mm_data["aspect_ratios"] num_x_separators = num_y_separators = 0 for tiles_y, tiles_x in aspect_ratios: if tiles_x * tiles_y > 1: @@ -80,6 +80,6 @@ def test_processor_override( num_patches_per_chunk = 
processor.info.get_patch_per_chunk( config.vision_config) assert prompt_token_ids.count(config.image_token_index) \ - == mm_kwargs["patches_per_image"].sum() * num_patches_per_chunk - assert mm_kwargs["pixel_values"].shape[0] \ - == mm_kwargs["patches_per_image"].sum() + == sum(mm_data["patches_per_image"]) * num_patches_per_chunk + assert len(mm_data["pixel_values"]) \ + == sum(mm_data["patches_per_image"]) diff --git a/tests/models/multimodal/processing/test_mllama.py b/tests/models/multimodal/processing/test_mllama.py index a6b20a1e3678..b42d3f89f3cb 100644 --- a/tests/models/multimodal/processing/test_mllama.py +++ b/tests/models/multimodal/processing/test_mllama.py @@ -49,18 +49,18 @@ def test_profiling( encoder_seq_lens = [len(dummy_encoder_data.prompt_token_ids) ] * max_num_seqs - mm_kwargs = processor.apply( + mm_data = processor.apply( prompt=dummy_mm_data.prompt, mm_data=dummy_mm_data.mm_data, hf_processor_mm_kwargs=dict(), - )["mm_kwargs"] + )["mm_kwargs"].get_data() # Get the actual number of encoder tokens for each sample. # Because attn_metadata.encoder_seq_lens only counts the last # group of images for each sample, which is used to cheat the # block manager to allocate blocks for those images only. # See MllamaMultiModalProcessor for more details. - num_tiles = [[t] for t in mm_kwargs.pop("num_tiles")] + num_tiles = [[t] for t in mm_data.pop("num_tiles")] num_tokens_per_tile = calc_token_per_chunk(image_size) actual_encoder_seq_lens = [ sum(num_tile) * num_tokens_per_tile for num_tile in num_tiles diff --git a/tests/models/multimodal/processing/test_mllama4.py b/tests/models/multimodal/processing/test_mllama4.py index f3871b60c3f6..3be77b5da63f 100644 --- a/tests/models/multimodal/processing/test_mllama4.py +++ b/tests/models/multimodal/processing/test_mllama4.py @@ -38,21 +38,21 @@ def test_profiling(model_id: str, max_model_len: int): hf_config = ctx.get_hf_config(Llama4Config) - mm_kwargs = processor.apply( + mm_data = processor.apply( prompt=dummy_mm_data.prompt, mm_data=dummy_mm_data.mm_data, hf_processor_mm_kwargs=dict(), - )["mm_kwargs"] + )["mm_kwargs"].get_data() image_size = hf_config.vision_config.image_size patch_size = hf_config.vision_config.patch_size downsample_ratio = int( round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2))) tokens_per_patch = ((image_size // patch_size)**2) // downsample_ratio - chunks_per_image = prod(mm_kwargs["patches_per_image"]) + chunks_per_image = prod(mm_data["patches_per_image"]) total_num_patches = chunks_per_image * tokens_per_patch - num_tiles = mm_kwargs["aspect_ratios"][0][0] * mm_kwargs["aspect_ratios"][ - 0][1] # x-y seperator tokens + num_tiles = mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][ + 1] # x-y seperator tokens total_tokens = total_num_patches.item() + num_tiles.item( ) + 3 # image start, image, image end diff --git a/tests/models/multimodal/processing/test_nemotron_vl.py b/tests/models/multimodal/processing/test_nemotron_vl.py index 6fbbab0d2612..d9f1965a053d 100644 --- a/tests/models/multimodal/processing/test_nemotron_vl.py +++ b/tests/models/multimodal/processing/test_nemotron_vl.py @@ -70,7 +70,8 @@ def _run_check( # Ensure we have the right number of placeholders per num_crops size image_token_id = tokenizer.convert_tokens_to_ids("") img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) - pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape + pixel_shape = processed_inputs["mm_kwargs"].get_data( + )["pixel_values_flat"].shape print("Image token 
count:", img_tok_count, "Pixel shape:", pixel_shape) assert img_tok_count == 256 * total_expected_num_patches assert pixel_shape[0] == total_expected_num_patches diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py index 9d1cd183387b..985f4188fdb6 100644 --- a/tests/models/multimodal/processing/test_qwen2_vl.py +++ b/tests/models/multimodal/processing/test_qwen2_vl.py @@ -48,7 +48,8 @@ def test_processor_override( hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs) image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token) img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) - pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape + pixel_shape = processed_inputs["mm_kwargs"].get_data( + )["pixel_values"].shape assert img_tok_count == expected_toks_per_img * num_imgs assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs diff --git a/tests/models/multimodal/test_tensor_schema.py b/tests/models/multimodal/test_tensor_schema.py index 036624431c20..51e5b84b6c08 100644 --- a/tests/models/multimodal/test_tensor_schema.py +++ b/tests/models/multimodal/test_tensor_schema.py @@ -128,7 +128,7 @@ def create_batched_mm_kwargs( )["mm_kwargs"] items = [ item for modality in supported_mm_limits - for item in mm_kwargs.get_items(modality) + for item in mm_kwargs[modality] ] return group_mm_kwargs_by_modality(items) diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py index 2149f05b6af0..088cd00db2e0 100644 --- a/tests/multimodal/test_cache.py +++ b/tests/multimodal/test_cache.py @@ -4,8 +4,8 @@ import torch from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata -from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs, - MultiModalKwargsItem, +from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem, + MultiModalKwargsItems, MultiModalSharedField) @@ -24,8 +24,8 @@ def _dummy_item(modality: str, size_by_key: dict[str, int]): ]) -def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]): - return MultiModalKwargs([ +def _dummy_items(size_by_key_modality: dict[str, dict[str, int]]): + return MultiModalKwargsItems.from_seq([ _dummy_item(modality, size_by_key) for modality, size_by_key in size_by_key_modality.items() ]) @@ -37,7 +37,8 @@ def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]): [ (_dummy_item("a", {"a1": 100}), 100), (_dummy_item("a", {"a1": 100, "a2": 110}), 210), - (_dummy_kw({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501 + (_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501 + (_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}).get_data(), 460), # noqa: E501 ], ) # yapf: enable diff --git a/tests/v1/test_serial_utils.py b/tests/v1/test_serial_utils.py index 586276ee08ae..118b40d0ef41 100644 --- a/tests/v1/test_serial_utils.py +++ b/tests/v1/test_serial_utils.py @@ -11,7 +11,8 @@ from vllm.multimodal.inputs import (MultiModalBatchedField, MultiModalFieldElem, MultiModalFlatField, - MultiModalKwargs, MultiModalKwargsItem, + MultiModalKwargsItem, + MultiModalKwargsItems, MultiModalSharedField, NestedTensors) from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder @@ -96,7 +97,7 @@ def test_encode_decode(monkeypatch: pytest.MonkeyPatch): class MyRequest(msgspec.Struct): - mm: Optional[list[MultiModalKwargs]] + mm: Optional[list[MultiModalKwargsItems]] def 
test_multimodal_kwargs(): @@ -119,7 +120,7 @@ def test_multimodal_kwargs(): audio = MultiModalKwargsItem.from_elems([e1]) video = MultiModalKwargsItem.from_elems([e2]) image = MultiModalKwargsItem.from_elems([e3, e4]) - mm = MultiModalKwargs([audio, video, image]) + mm = MultiModalKwargsItems.from_seq([audio, video, image]) # pack mm kwargs into a mock request so that it can be decoded properly req = MyRequest([mm]) @@ -133,19 +134,22 @@ def test_multimodal_kwargs(): total_len = sum(memoryview(x).cast("B").nbytes for x in encoded) - # expected total encoding length, should be 14255, +-20 for minor changes - assert 14250 <= total_len <= 14300 - decoded: MultiModalKwargs = decoder.decode(encoded).mm[0] + # expected total encoding length, should be 14306, +-20 for minor changes + assert 14275 <= total_len <= 14325 + decoded = decoder.decode(encoded).mm[0] + assert isinstance(decoded, MultiModalKwargsItems) # check all modalities were recovered and do some basic sanity checks - assert len(decoded.modalities) == 3 - images = decoded.get_items("image") + assert len(decoded) == 3 + images = decoded["image"] assert len(images) == 1 assert len(images[0].items()) == 2 assert list(images[0].keys()) == ["i0", "i1"] # check the tensor contents and layout in the main dict - assert all(nested_equal(mm[k], decoded[k]) for k in mm) + mm_data = mm.get_data() + decoded_data = decoded.get_data() + assert all(nested_equal(mm_data[k], decoded_data[k]) for k in mm_data) def nested_equal(a: NestedTensors, b: NestedTensors): diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py index 852c8f5cffa0..4ce6d8dfad2c 100644 --- a/vllm/executor/msgspec_utils.py +++ b/vllm/executor/msgspec_utils.py @@ -4,11 +4,12 @@ from array import array from typing import Any, Type +from vllm.multimodal.inputs import MultiModalKwargs from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE def encode_hook(obj: Any) -> Any: - """Custom msgspec enc hook that supports array types. + """Custom msgspec enc hook that supports array types and MultiModalKwargs. See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder """ @@ -17,10 +18,12 @@ def encode_hook(obj: Any) -> Any: f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. " f"Given array has a type code of {obj.typecode}.") return obj.tobytes() + if isinstance(obj, MultiModalKwargs): + return dict(obj) def decode_hook(type: Type, obj: Any) -> Any: - """Custom msgspec dec hook that supports array types. + """Custom msgspec dec hook that supports array types and MultiModalKwargs. 
See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder """ @@ -28,3 +31,5 @@ def decode_hook(type: Type, obj: Any) -> Any: deserialized = array(VLLM_TOKEN_ID_ARRAY_TYPE) deserialized.frombytes(obj) return deserialized + if type is MultiModalKwargs: + return MultiModalKwargs(obj) diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index e1368a3f6478..1c7960fa3e0a 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -22,7 +22,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, @@ -470,7 +470,7 @@ def _get_prompt_updates( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index 5cd74bbba482..b02a973d942c 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -18,7 +18,7 @@ from vllm.config import VllmConfig from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs +from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -242,7 +242,7 @@ def _get_prompt_updates( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_token = hf_processor.image_token diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 8e3505f872eb..2f2b880bb0e1 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -15,7 +15,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptIndexTargets, @@ -492,7 +492,7 @@ def _get_prompt_updates( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: tokenizer = self.info.get_tokenizer() vocab = tokenizer.get_vocab() diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 8d705f40ce8f..e6914ad4c495 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -31,7 +31,7 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.multimodal import MULTIMODAL_REGISTRY from 
vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, @@ -151,7 +151,7 @@ def _get_prompt_updates( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) tokenizer = self.info.get_tokenizer() diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index f17583768f79..bc526fd661b6 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -21,7 +21,7 @@ from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs +from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -241,7 +241,7 @@ def _get_prompt_updates( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_token = hf_processor.image_token diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index e0acca75d9dd..e881e9c6ddb6 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -21,7 +21,7 @@ from vllm.model_executor.models.transformers import replace_linear_class from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs, NestedTensors) + MultiModalKwargsItems, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -252,7 +252,7 @@ def _get_prompt_updates( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) @@ -291,7 +291,8 @@ def _cached_apply_hf_processor( tokenization_kwargs: Mapping[str, object], *, return_mm_hashes: bool, - ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]: + ) -> tuple[list[int], MultiModalKwargsItems, Optional[MultiModalHashes], + bool]: # The processor logic is different for len(images) <= 2 vs > 2 # Since the processing cache assumes that the processor output is # invariant of how many images are passed per prompt, we only diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index 56e456c2f1f2..4a8cb35a54dc 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -21,7 +21,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from 
vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseProcessingInfo, EncDecMultiModalProcessor, @@ -860,7 +860,7 @@ def _get_prompt_updates( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() pad_token_id = hf_config.pad_token_id diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index b61e0361fe8c..90af859ab92e 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -32,7 +32,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -226,7 +226,7 @@ def _get_prompt_updates( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() bos_token_id = hf_config.bos_token_id diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 9871b11b3799..bf5ad633b94a 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -17,7 +17,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) # yapf: disable @@ -311,7 +311,7 @@ def _get_prompt_updates( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, Any], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_token = hf_processor.boi_token diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index a0c3bb50070b..79061fd30c39 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -24,7 +24,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import (ImageProcessorItems, MultiModalDataItems, MultiModalDataParser) # yapf: disable @@ -209,7 +209,7 @@ def _get_prompt_updates( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, Any], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 88c53c836327..015577322ffe 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -59,7 +59,7 @@ 
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs, VideoItem) + MultiModalKwargsItems, VideoItem) from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -1158,7 +1158,7 @@ def _get_prompt_updates( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, Any], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_processor = self.info.get_image_processor( @@ -1175,14 +1175,16 @@ def _get_prompt_updates( merge_length = image_processor.merge_size**2 def get_image_replacement_glm4v(item_idx: int): - grid_thw = out_mm_kwargs["image_grid_thw"][item_idx] + out_item = out_mm_kwargs["image"][item_idx] + grid_thw = out_item["image_grid_thw"].data assert isinstance(grid_thw, torch.Tensor) num_tokens = int(grid_thw.prod()) // merge_length return [hf_processor.image_token_id] * num_tokens def get_video_replacement_glm4v(item_idx: int): - grid_thw = out_mm_kwargs["video_grid_thw"][item_idx] + out_item = out_mm_kwargs["video"][item_idx] + grid_thw = out_item["video_grid_thw"].data assert isinstance(grid_thw, torch.Tensor) video, metadata = mm_items["video"][item_idx] diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 1751fccd08b0..bf33575859ae 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -30,7 +30,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, @@ -503,7 +503,7 @@ def _get_prompt_updates( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index c9e3b74e7c3c..c3ac3bb78c83 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -40,7 +40,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -118,7 +118,7 @@ def _get_prompt_updates( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> list[PromptUpdate]: processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) tokenizer = self.info.get_tokenizer() diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index c3e4f81597ad..9ab3f4d0d9a1 100644 --- a/vllm/model_executor/models/h2ovl.py +++ 
b/vllm/model_executor/models/h2ovl.py @@ -17,7 +17,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalKwargs +from vllm.multimodal.inputs import MultiModalKwargsItems from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, MultiModalDataItems) from vllm.multimodal.processing import (MultiModalHashes, PromptReplacement, @@ -425,18 +425,19 @@ def _get_prompt_updates( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - if "image_num_patches" in out_mm_kwargs: - image_num_patches = out_mm_kwargs["image_num_patches"] + out_mm_data = out_mm_kwargs.get_data() + if "image_num_patches" in out_mm_data: + image_num_patches = out_mm_data["image_num_patches"] assert isinstance(image_num_patches, torch.Tensor) image_num_patches = image_num_patches.tolist() - elif "image_embeds" in out_mm_kwargs: + elif "image_embeds" in out_mm_data: # TODO: Use image size information in dictionary embedding inputs # to compute num_patches (similar to Qwen2-VL) - image_num_patches = [None] * len(out_mm_kwargs["image_embeds"]) + image_num_patches = [None] * len(out_mm_data["image_embeds"]) else: image_num_patches = [] @@ -479,7 +480,8 @@ def _cached_apply_hf_processor( tokenization_kwargs: Mapping[str, object], *, return_mm_hashes: bool, - ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]: + ) -> tuple[list[int], MultiModalKwargsItems, Optional[MultiModalHashes], + bool]: # The processor logic is different for len(images) <= 1 vs > 1 # Since the processing cache assumes that the processor output is # invariant of how many images are passed per prompt, we only diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index e5c94c7f3a70..d3ddc47ea932 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -34,7 +34,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import ImageSize, MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, ProcessingCache, @@ -295,7 +295,7 @@ def _get_prompt_updates( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() placeholder = { @@ -306,21 +306,22 @@ def _get_prompt_updates( def get_replacement_hyperclovax( item_idx: int, modality: str, - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ): - num_tokens = None + out_item = out_mm_kwargs[modality][item_idx] + if modality == "image": + lens = out_item["vision_query_lengths_images"].data num_tokens = self.info.get_num_image_tokens( - vision_query_length=out_mm_kwargs[ - "vision_query_lengths_images"][item_idx], ) - if modality == "video": + vision_query_length=lens) + elif modality == "video": + lens = out_item["vision_query_lengths_videos"].data num_tokens = self.info.get_num_video_tokens( - 
vision_query_length=out_mm_kwargs[ - "vision_query_lengths_videos"][item_idx], ) - assert isinstance(num_tokens, int) - return [ - placeholder[modality], - ] * num_tokens + vision_query_length=lens) + else: + raise NotImplementedError(modality) + + return [placeholder[modality]] * num_tokens return [ PromptReplacement( diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 3c01789b9006..63307470d959 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -34,7 +34,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import ImageProcessorItems, ImageSize # yapf conflicts with isort for this block # yapf: disable @@ -374,7 +374,7 @@ def _get_prompt_updates( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_token, _, _ = self.info._get_image_token(hf_processor) diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index d952ced2fa69..c739e74b058f 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -24,7 +24,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs, NestedTensors) + MultiModalKwargsItems, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -399,7 +399,7 @@ def _get_prompt_updates( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) img_context_token = hf_processor.image_token @@ -407,15 +407,16 @@ def _get_prompt_updates( end_image_token = hf_processor.end_image_token video_token = hf_processor.video_token - if "video_num_patches" in out_mm_kwargs: - video_num_patches = out_mm_kwargs["video_num_patches"] + out_mm_data = out_mm_kwargs.get_data() + if "video_num_patches" in out_mm_data: + video_num_patches = out_mm_data["video_num_patches"] assert isinstance(video_num_patches, torch.Tensor) video_num_patches = video_num_patches.tolist() else: video_num_patches = [] - if "image_num_patches" in out_mm_kwargs: - image_num_patches = out_mm_kwargs["image_num_patches"] + if "image_num_patches" in out_mm_data: + image_num_patches = out_mm_data["image_num_patches"] assert isinstance(image_num_patches, torch.Tensor) image_num_patches = image_num_patches.tolist() else: diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 8e766dd4c476..da8ad8396725 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -28,7 +28,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs, 
NestedTensors) + MultiModalKwargsItems, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -797,18 +797,19 @@ def _get_prompt_updates( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - if "image_num_patches" in out_mm_kwargs: - image_num_patches = out_mm_kwargs["image_num_patches"] + out_mm_data = out_mm_kwargs.get_data() + if "image_num_patches" in out_mm_data: + image_num_patches = out_mm_data["image_num_patches"] assert isinstance(image_num_patches, torch.Tensor) image_num_patches = image_num_patches.tolist() - elif "image_embeds" in out_mm_kwargs: + elif "image_embeds" in out_mm_data: # TODO: Use image size information in dictionary embedding inputs # to compute num_patches (similar to Qwen2-VL) - image_num_patches = [None] * len(out_mm_kwargs["image_embeds"]) + image_num_patches = [None] * len(out_mm_data["image_embeds"]) else: image_num_patches = [] @@ -966,15 +967,19 @@ def _get_prompt_updates( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: - prompt_repl: list[PromptUpdate] = super()._get_prompt_updates( - mm_items, hf_processor_mm_kwargs, out_mm_kwargs) + prompt_repl = super()._get_prompt_updates( + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + out_mm_kwargs=out_mm_kwargs, + ) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - if "video_num_patches" in out_mm_kwargs: - video_num_patches = out_mm_kwargs["video_num_patches"] + out_mm_data = out_mm_kwargs.get_data() + if "video_num_patches" in out_mm_data: + video_num_patches = out_mm_data["video_num_patches"] assert isinstance(video_num_patches, torch.Tensor) video_num_patches = video_num_patches.tolist() else: @@ -992,12 +997,15 @@ def get_video_replacement_internvl(item_idx: int): video_context_token=hf_processor.video_token) if self.info.supports_video: - prompt_repl.append( + prompt_repl = [ + *prompt_repl, PromptReplacement( modality="video", target="
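The hunks above all apply the same mechanical migration: `_get_prompt_updates` now receives `MultiModalKwargsItems` instead of `MultiModalKwargs`, batched tensors are read through `get_data()`, and per-item fields are reached via `out_mm_kwargs[modality][item_idx]`. A minimal sketch of the two access styles, assuming only the `MultiModalKwargsItems` behavior visible in these diffs (illustrative, not part of any patch):

```python
# Illustrative sketch of the post-migration access pattern; the
# MultiModalKwargsItems semantics here are inferred from the hunks above.
import torch


def read_num_patches(out_mm_kwargs) -> list:
    # Batched view: get_data() returns the same flat mapping that the old
    # MultiModalKwargs object exposed directly.
    out_mm_data = out_mm_kwargs.get_data()
    if "image_num_patches" in out_mm_data:
        num_patches = out_mm_data["image_num_patches"]
        assert isinstance(num_patches, torch.Tensor)
        return num_patches.tolist()
    if "image_embeds" in out_mm_data:
        # Embedding inputs carry no patch counts.
        return [None] * len(out_mm_data["image_embeds"])
    return []


def read_query_length(out_mm_kwargs, item_idx: int):
    # Per-item view: index by modality, then by item position, then read
    # the underlying tensor through the field's .data attribute.
    out_item = out_mm_kwargs["image"][item_idx]
    return out_item["vision_query_lengths_images"].data
```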
Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:
wget http://images.cocodataset.org/zips/train2017.zip + + + ShareGPT4Video (Video) + ✅ + ✅ + + git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video + BurstGPT @@ -231,7 +239,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct ```bash vllm bench serve \ --backend openai-chat \ - --endpoint-type openai-chat \ + --endpoint-type openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ --dataset-name hf \ @@ -246,7 +254,7 @@ vllm bench serve \ ```bash vllm bench serve \ --backend openai-chat \ - --endpoint-type openai-chat \ + --endpoint-type openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ --dataset-name hf \ @@ -612,7 +620,7 @@ vllm bench serve \ --prefix-repetition-prefix-len 512 \ --prefix-repetition-suffix-len 128 \ --prefix-repetition-num-prefixes 5 \ - --prefix-repetition-output-len 128 + --prefix-repetition-output-len 128 ``` @@ -687,4 +695,31 @@ python benchmarks/benchmark_serving.py \ --endpoint /v1/chat/completion ``` +### Videos (ShareGPT4Video) + +Start vLLM: + +```bash +python -m vllm.entrypoints.openai.api_server \ + --model Qwen/Qwen2.5-VL-7B-Instruct \ + --dtype bfloat16 \ + --limit-mm-per-prompt '{"video": 1}' \ + --allowed-local-media-path /path/to/sharegpt4video/videos +``` + +Send requests with videos: + +```bash +python benchmarks/benchmark_serving.py \ + --backend openai-chat \ + --model Qwen/Qwen2.5-VL-7B-Instruct \ + --dataset-name sharegpt \ + --dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \ + --num-prompts 100 \ + --save-result \ + --result-dir ~/vllm_benchmark_results \ + --save-detailed \ + --endpoint /v1/chat/completion +``` + diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index c62934ed94cb..e1a856026c4a 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -293,6 +293,41 @@ def process_image(image: Any) -> Mapping[str, Any]: ) +def process_video(video: Any) -> Mapping[str, Any]: + """ + Process a single video input and return a multimedia content dictionary. + + Supports the following input types: + + 1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key + containing raw video data. + + 2. String input: - Treats the string as a URL or local file path. - + Prepends "file://" if the string doesn't start with "http://" or + "file://". - Returns a dictionary with the image URL. + + Raises: + ValueError: If the input is not a supported type. + """ + if isinstance(video, dict) and "bytes" in video: + video_bytes = video["bytes"] + video_base64 = base64.b64encode(video_bytes).decode("utf-8") + return { + "type": "video_url", + "video_url": {"url": f"data:video/mp4;base64,{video_base64}"}, + } + + if isinstance(video, str): + video_url = ( + video if video.startswith(("http://", "file://")) else f"file://{video}" + ) + return {"type": "video_url", "video_url": {"url": video_url}} + + raise ValueError( + f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`." # noqa: E501 + ) + + # ----------------------------------------------------------------------------- # Random Dataset Implementation (Synthetic Data) # ----------------------------------------------------------------------------- @@ -451,9 +486,10 @@ def sample( skip_min_output_len_check=output_len is not None, ): continue - # TODO: Also support ShareGPT4Video. 
if image_path := entry.get("image"): mm_content = process_image(image_path) + elif video_path := entry.get("video"): + mm_content = process_video(video_path) else: mm_content = None if enable_multimodal_chat: diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 3532a083fb4a..f4fbfad2d1d5 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -281,7 +281,7 @@ def process_image(image: Any) -> Mapping[str, Any]: """ Process a single image input and return a multimedia content dictionary. - Supports three input types: + Supports the following input types: 1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key containing raw image data. - Loads the bytes as a PIL.Image.Image. @@ -321,6 +321,41 @@ def process_image(image: Any) -> Mapping[str, Any]: " or str or dictionary with raw image bytes.") +def process_video(video: Any) -> Mapping[str, Any]: + """ + Process a single video input and return a multimedia content dictionary. + + Supports the following input types: + + 1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key + containing raw video data. + + 2. String input: - Treats the string as a URL or local file path. - + Prepends "file://" if the string doesn't start with "http://" or + "file://". - Returns a dictionary with the image URL. + + Raises: + ValueError: If the input is not a supported type. + """ + if isinstance(video, dict) and 'bytes' in video: + video_bytes = video['bytes'] + video_base64 = base64.b64encode(video_bytes).decode("utf-8") + return { + "type": "video_url", + "video_url": { + "url": f"data:video/mp4;base64,{video_base64}" + }, + } + + if isinstance(video, str): + video_url = (video if video.startswith( + ("http://", "file://")) else f"file://{video}") + return {"type": "video_url", "video_url": {"url": video_url}} + + raise ValueError( + f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`." # noqa: E501 + ) + # ----------------------------------------------------------------------------- # Random Dataset Implementation (Synthetic Data) # ----------------------------------------------------------------------------- @@ -474,9 +509,10 @@ def sample( skip_min_output_len_check=output_len is not None): continue - # TODO: Also support ShareGPT4Video. if image_path := entry.get("image"): mm_content = process_image(image_path) + elif video_path := entry.get("video"): + mm_content = process_video(video_path) else: mm_content = None if enable_multimodal_chat: From ec89a5200a158406655f01f7f7fa398e54bed9c1 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Tue, 19 Aug 2025 20:39:28 -0400 Subject: [PATCH 163/231] [Quantization] Bump Compressed Tensors Version (#23202) Signed-off-by: Kyle Sayers Co-authored-by: Dipika Sikka Co-authored-by: Michael Goin Signed-off-by: Duncan Moss --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index 3c3ac0abf50f..365457436faa 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -39,7 +39,7 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. 
-compressed-tensors == 0.10.2 # required for compressed-tensors +compressed-tensors == 0.11.0 # required for compressed-tensors depyf==0.19.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py watchfiles # required for http server to monitor the updates of TLS files From 82061bc6f50ac30f7b22c17b2cec6e83ec23ce98 Mon Sep 17 00:00:00 2001 From: 633WHU Date: Wed, 20 Aug 2025 09:25:59 +0800 Subject: [PATCH 164/231] [Core] Optimize scheduler request removal for single completions (#21917) Signed-off-by: chiliu Signed-off-by: chiliu Co-authored-by: chiliu Signed-off-by: Duncan Moss --- vllm/v1/core/sched/scheduler.py | 14 ++++++-------- vllm/v1/core/sched/utils.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index b3defa443186..f9a7e2101407 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -25,7 +25,7 @@ SchedulerOutput) from vllm.v1.core.sched.request_queue import (SchedulingPolicy, create_request_queue) -from vllm.v1.core.sched.utils import check_stop +from vllm.v1.core.sched.utils import check_stop, remove_all from vllm.v1.engine import (EngineCoreEventType, EngineCoreOutput, EngineCoreOutputs) from vllm.v1.kv_cache_interface import KVCacheConfig @@ -872,9 +872,7 @@ def update_from_output( # Remove the stopped requests from the running and waiting queues. if stopped_running_reqs: - self.running = [ - req for req in self.running if req not in stopped_running_reqs - ] + self.running = remove_all(self.running, stopped_running_reqs) if stopped_preempted_reqs: # This is a rare case and unlikely to impact performance. self.waiting.remove_requests(stopped_preempted_reqs) @@ -1000,7 +998,7 @@ def finish_requests( else: request_ids = set(request_ids) - running_requests_to_remove = [] + running_requests_to_remove = set() waiting_requests_to_remove = [] valid_requests = [] @@ -1013,13 +1011,13 @@ def finish_requests( valid_requests.append(request) if request.status == RequestStatus.RUNNING: - running_requests_to_remove.append(request) + running_requests_to_remove.add(request) else: waiting_requests_to_remove.append(request) # Remove all requests from queues at once for better efficiency - for request in running_requests_to_remove: - self.running.remove(request) + if running_requests_to_remove: + self.running = remove_all(self.running, running_requests_to_remove) if waiting_requests_to_remove: self.waiting.remove_requests(waiting_requests_to_remove) diff --git a/vllm/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py index 42ec95091f96..42d3e5c68b4c 100644 --- a/vllm/v1/core/sched/utils.py +++ b/vllm/v1/core/sched/utils.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import contextlib from typing import Optional import torch @@ -7,6 +8,38 @@ from vllm.v1.request import Request, RequestStatus +def remove_all(lst: list, items_to_remove: set) -> list: + """Remove all items from a list that are in the items_to_remove set. + + This method optimizes for the common case of removing a single item, + falling back to list comprehension for multiple items. + + Args: + lst: The list to remove items from + items_to_remove: Set of items to remove + + Returns: + Either the modified original list (for single item removal) or + a new list (for multiple item removal). 
Callers should use the + returned value. + + Note: + For single item removal, this modifies the original list in-place + and returns it. For multiple items, it creates and returns a new list. + """ + if not items_to_remove: + return lst + + if len(items_to_remove) == 1: + # Fast path for single item removal (most common case) + item = next(iter(items_to_remove)) + with contextlib.suppress(ValueError): + lst.remove(item) + return lst + # For multiple items, use list comprehension + return [item for item in lst if item not in items_to_remove] + + def check_stop(request: Request, max_model_len: int, pooler_output: Optional[torch.Tensor] = None) -> bool: From a1cb9fb129be1734da532f31cc5feb60a93b3b3a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 19 Aug 2025 22:18:52 -0400 Subject: [PATCH 165/231] [CI Perf] Only test bfloat16 for tests/compile/test_fusion_all_reduce.py (#23132) Signed-off-by: mgoin Signed-off-by: Duncan Moss --- tests/compile/test_fusion_all_reduce.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/test_fusion_all_reduce.py index 4c3cf6c2a10c..dd31e0db1f59 100644 --- a/tests/compile/test_fusion_all_reduce.py +++ b/tests/compile/test_fusion_all_reduce.py @@ -148,7 +148,7 @@ def ops_in_model_before(self): @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("seq_len", [8]) @pytest.mark.parametrize("hidden_size", [16]) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA") @pytest.mark.skipif( From c1537560441fefa58c1101f5416d0db17524da5f Mon Sep 17 00:00:00 2001 From: Chenheli Hua Date: Tue, 19 Aug 2025 19:32:47 -0700 Subject: [PATCH 166/231] [Core] Add torch profiler CPU traces for AsyncLLM. (#21794) Signed-off-by: Chenheli Hua Signed-off-by: Duncan Moss --- vllm/envs.py | 6 ++++-- vllm/v1/engine/async_llm.py | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 861e4c6a1bbe..70068cca66f8 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -667,8 +667,10 @@ def get_vllm_port() -> Optional[int]: "VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv("VLLM_LORA_RESOLVER_CACHE_DIR", None), - # Enables torch profiler if set. Path to the directory where torch profiler - # traces are saved. Note that it must be an absolute path. + # Enables torch profiler if set. + # Both AsyncLLM's CPU traces as well as workers' + # traces (CPU & GPU) will be saved under this directory. + # Note that it must be an absolute path. 
"VLLM_TORCH_PROFILER_DIR": lambda: (None if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None else os .path.expanduser(os.getenv("VLLM_TORCH_PROFILER_DIR", "."))), diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 664fec31a4da..342d7b24f8e9 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,12 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio +import os +import socket import time from collections.abc import AsyncGenerator, Iterable, Mapping from copy import copy from typing import Any, Optional, Union import numpy as np +import torch import vllm.envs as envs from vllm.config import ModelConfig, VllmConfig @@ -144,6 +147,26 @@ def __init__( except RuntimeError: pass + if envs.VLLM_TORCH_PROFILER_DIR: + logger.info( + "Torch profiler enabled. AsyncLLM CPU traces will be collected under %s", # noqa: E501 + envs.VLLM_TORCH_PROFILER_DIR) + worker_name = f"{socket.gethostname()}_{os.getpid()}.async_llm" + self.profiler = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + ], + with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, + on_trace_ready=torch.profiler.tensorboard_trace_handler( + envs.VLLM_TORCH_PROFILER_DIR, + worker_name=worker_name, + use_gzip=True)) + else: + logger.info( + "Torch profiler disabled. AsyncLLM CPU traces will not be collected." # noqa: E501 + ) + self.profiler = None + @classmethod @deprecate_kwargs( "disable_log_requests", @@ -562,10 +585,16 @@ async def check_health(self) -> None: raise self.dead_error async def start_profile(self) -> None: - await self.engine_core.profile_async(True) + coros = [self.engine_core.profile_async(True)] + if self.profiler is not None: + coros.append(asyncio.to_thread(self.profiler.start)) + await asyncio.gather(*coros) async def stop_profile(self) -> None: - await self.engine_core.profile_async(False) + coros = [self.engine_core.profile_async(False)] + if self.profiler is not None: + coros.append(asyncio.to_thread(self.profiler.stop)) + await asyncio.gather(*coros) async def reset_mm_cache(self) -> None: self.processor.mm_registry.reset_processor_cache(self.model_config) From e71f229bf4b9eec81d709707e8673f625a3ced5c Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 20 Aug 2025 10:33:41 +0800 Subject: [PATCH 167/231] [Doc] Update V1 status of various pooling models (#23189) Signed-off-by: DarkLight1337 Signed-off-by: Duncan Moss --- docs/models/supported_models.md | 26 ++++++++++---------- tests/models/language/pooling/test_gritlm.py | 9 ++++--- vllm/model_executor/models/gritlm.py | 6 ++--- vllm/model_executor/models/interfaces.py | 11 ++++++--- 4 files changed, 28 insertions(+), 24 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 1d165fa6f16b..7908e4238710 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -363,7 +363,7 @@ th { | `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | ✅︎ | -| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | | +| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. 
| ✅︎ | ✅︎ | ✅︎ | | `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ | | `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ | | `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ | @@ -436,17 +436,17 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `BertModel`C | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | | -| `Gemma2Model`C | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | | ✅︎ | -| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | | -| `GteModel`C | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | | | -| `GteNewModel`C | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | | -| `ModernBertModel`C | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | | | -| `NomicBertModel`C | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | | | +| `BertModel`C | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | ✅︎ | +| `Gemma2Model`C | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | ✅︎ | +| `GteModel`C | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | | ✅︎ | +| `GteNewModel`C | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | ✅︎ | +| `ModernBertModel`C | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | | ✅︎ | +| `NomicBertModel`C | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | | ✅︎ | | `LlamaModel`C, `LlamaForCausalLM`C, `MistralModel`C, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2Model`C, `Qwen2ForCausalLM`C | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen3Model`C, `Qwen3ForCausalLM`C | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | | +| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | ✅︎ | | `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | \* | C Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion)) @@ -476,7 +476,7 @@ These models primarily support the [`LLM.classify`](./pooling_models.md#llmclass | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | | +| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | ✅︎ | | `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | \* | @@ -493,12 +493,12 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | | +| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | ✅︎ | | `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | -| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | | -| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | | | +| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | ✅︎ | +| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | | ✅︎ | | `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | \* | C Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion)) diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py index d21987571cba..17a55d916b1f 100644 --- a/tests/models/language/pooling/test_gritlm.py +++ b/tests/models/language/pooling/test_gritlm.py @@ -14,6 +14,7 @@ MODEL_NAME = "parasail-ai/GritLM-7B-vllm" MAX_MODEL_LEN = 4000 +ATOL = 0.002 def _arr(arr): @@ -97,16 +98,16 @@ def get_test_data(): def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]): cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0]) - assert cosine_sim_q0_d0 == pytest.approx(0.609, abs=0.001) + assert cosine_sim_q0_d0 == pytest.approx(0.609, abs=ATOL) cosine_sim_q0_d1 = 1 - cosine(q_rep[0], d_rep[1]) - assert cosine_sim_q0_d1 == pytest.approx(0.101, abs=0.001) + assert cosine_sim_q0_d1 == pytest.approx(0.101, abs=ATOL) cosine_sim_q1_d0 = 1 - cosine(q_rep[1], d_rep[0]) - assert cosine_sim_q1_d0 == pytest.approx(0.120, abs=0.001) + assert cosine_sim_q1_d0 == pytest.approx(0.120, abs=ATOL) cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1]) - assert cosine_sim_q1_d1 == pytest.approx(0.534, abs=0.001) + assert cosine_sim_q1_d1 == pytest.approx(0.534, abs=ATOL) def test_gritlm_offline_embedding(vllm_runner): diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index 9e7490e3c4f0..3f6790269ae6 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -20,7 +20,7 @@ from vllm.tasks import PoolingTask from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config -from .interfaces import SupportsV0Only +from .interfaces import default_pooling_type logger = init_logger(__name__) @@ -215,7 +215,8 @@ def forward( return build_output(pooled_data) -class GritLM(LlamaForCausalLM, 
SupportsV0Only): +@default_pooling_type("MEAN") +class GritLM(LlamaForCausalLM): """This class implements the embedding model for parasail-ai/GritLM-7B-vllm. The class inherits from LlamaForCausalLM and provides a custom pooling @@ -241,7 +242,6 @@ def __init__( prefix: str = "", **kwargs, ) -> None: - # Use full attention for pooling (this is why V1 is not supported yet) if vllm_config.model_config.runner_type == "pooling": hf_config = vllm_config.model_config.hf_config hf_config.is_causal = False diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index c425488f834b..9415e67924e7 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -3,7 +3,7 @@ from collections.abc import Iterable, Mapping, MutableSequence from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol, - Union, overload, runtime_checkable) + TypeVar, Union, overload, runtime_checkable) import numpy as np import torch @@ -641,11 +641,14 @@ def supports_cross_encoding( return is_pooling_model(model) and _supports_cross_encoding(model) -def default_pooling_type(pooling_type: str) -> object: +_T = TypeVar("_T", bound=type[torch.nn.Module]) + + +def default_pooling_type(pooling_type: str): """Set default_pooling_type decorator. """ - def func(model: object): - model.default_pooling_type = pooling_type + def func(model: _T) -> _T: + model.default_pooling_type = pooling_type # type: ignore return model return func From dd532aec4c0cc02a845887c1f6ac77276876b389 Mon Sep 17 00:00:00 2001 From: Zebing Lin Date: Tue, 19 Aug 2025 22:57:47 -0400 Subject: [PATCH 168/231] [Attention] Optimize make_local_attention_virtual_batches for Flash Attention (#23185) Signed-off-by: linzebing Signed-off-by: Duncan Moss --- vllm/v1/attention/backends/utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 5e6bc331835b..94dd3d2629eb 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -464,8 +464,9 @@ def make_local_attention_virtual_batches( attn_chunk_size)[arange > 0] # convert from q_seqlens to cu_seqlens_q - cu_seqlens_q_local = np.pad(np.cumsum(seqlens_q_local), (1, 0))\ - .astype(np.int32) + cu_seqlens_q_local = np.empty(virtual_batches + 1, dtype=np.int32) + np.cumsum(seqlens_q_local, out=cu_seqlens_q_local[1:]) + cu_seqlens_q_local[0] = 0 # compute the seqlens_k_local, # basically a full local attention block for all but the last block in each @@ -508,11 +509,10 @@ def make_local_attention_virtual_batches( # [ 22, 23 ], < local-batch 6, (batch 2, starting from k[4]) # [ 24, 25 ], < local-batch 7, (batch 2, starting from k[8]) # ] - block_indices= np.broadcast_to( - np.arange(pages_per_local_batch, dtype=np.int32), - (virtual_batches, pages_per_local_batch)) \ - + np.expand_dims(block_starts, axis=1) - block_indices = block_indices.flatten().clip(max=block_table.shape[1] - 1) + block_indices = (block_starts[:, None] + + np.arange(pages_per_local_batch, dtype=np.int32)) + block_indices = block_indices.reshape(-1).clip(max=block_table.shape[1] - + 1) batch_indices = np.repeat(np.arange(actual_batch_size, dtype=np.int32), local_blocks * pages_per_local_batch) block_table_local = block_table[batch_indices, block_indices]\ From b515118168adffc2eb01cb839551244f1bf2dfae Mon Sep 17 00:00:00 2001 From: Louie Tsai Date: Tue, 19 Aug 2025 20:14:32 -0700 Subject: [PATCH 169/231] Fix a performance 
comparison issue in Benchmark Suite (#23047) Signed-off-by: Tsai, Louie Signed-off-by: Louie Tsai Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Li, Jiang Signed-off-by: Duncan Moss --- .../scripts/compare-json-results.py | 146 ++++++++++++++---- 1 file changed, 119 insertions(+), 27 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index 12c4ba6aa69a..50431d0cd4c5 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -3,44 +3,129 @@ import argparse import json import os +from importlib import util import pandas as pd +plotly_found = util.find_spec("plotly.express") is not None + def compare_data_columns( files, name_column, data_column, info_cols, drop_column, debug=False ): - print("\ncompare_data_column: " + data_column) + """ + Align concatenation by keys derived from info_cols instead of row order. + - Pick one canonical key list: subset of info_cols present in ALL files. + - For each file: set index to those keys, aggregate duplicates + - (mean for metric, first for names). + - Concat along axis=1 (indexes align), then reset_index so callers can + - group by columns. + - If --debug, add a _name column per file. + """ + print("\ncompare_data_column:", data_column) + frames = [] raw_data_cols = [] compare_frames = [] + + # 1) choose a canonical key list from info_cols that exists in ALL files + cols_per_file = [] + for f in files: + try: + df_tmp = pd.read_json(f, orient="records") + except Exception as err: + raise ValueError(f"Failed to read {f}") from err + cols_per_file.append(set(df_tmp.columns)) + + key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)] + if not key_cols: + # soft fallback: use any info_cols present in the first file + key_cols = [c for c in info_cols if c in list(cols_per_file[0])] + if not key_cols: + raise ValueError( + "No common key columns found from info_cols across the input files." 
+ ) + + # 2) build a single "meta" block (keys as columns) once, aligned by the key index + meta_added = False + for file in files: - data_df = pd.read_json(file) - serving_df = data_df.dropna(subset=[drop_column], ignore_index=True) - # Show all info columns in the first couple columns - if not frames: - for col in info_cols: - if col not in serving_df.columns: - print(f"Skipping missing column: {col}") - continue - frames.append(serving_df[col]) - # only show test name under debug mode - if debug is True: - serving_df = serving_df.rename(columns={name_column: file + "_name"}) - frames.append(serving_df[file + "_name"]) - - file = "/".join(file.split("/")[:-1]) - serving_df = serving_df.rename(columns={data_column: file}) - frames.append(serving_df[file]) - raw_data_cols.append(file) - compare_frames.append(serving_df[file]) + df = pd.read_json(file, orient="records") + + # Keep rows that actually have the compared metric (same as original behavior) + if drop_column in df.columns: + df = df.dropna(subset=[drop_column], ignore_index=True) + + # Stabilize numeric key columns (harmless if missing) + for c in ( + "Input Len", + "Output Len", + "TP Size", + "PP Size", + "# of max concurrency.", + "qps", + ): + if c in df.columns: + df[c] = pd.to_numeric(df[c], errors="coerce") + + # Ensure all key columns exist + for c in key_cols: + if c not in df.columns: + df[c] = pd.NA + + # Set index = key_cols and aggregate duplicates → unique MultiIndex + df_idx = df.set_index(key_cols, drop=False) + + # meta (key columns), unique per key + meta = df_idx[key_cols] + if not meta.index.is_unique: + meta = meta.groupby(level=key_cols, dropna=False).first() + + # metric series for this file, aggregated to one row per key + file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file) + s = df_idx[data_column] + if not s.index.is_unique: + s = s.groupby(level=key_cols, dropna=False).mean() + s.name = file_label # column label like original + + # add meta once (from first file) so keys are the leftmost columns + if not meta_added: + frames.append(meta) + meta_added = True + + # (NEW) debug: aligned test-name column per file + if debug and name_column in df_idx.columns: + name_s = df_idx[name_column] + if not name_s.index.is_unique: + name_s = name_s.groupby(level=key_cols, dropna=False).first() + name_s.name = f"{file_label}_name" + frames.append(name_s) + + frames.append(s) + raw_data_cols.append(file_label) + compare_frames.append(s) + + # Generalize ratio: for any file N>=2, add ratio (fileN / file1) if len(compare_frames) >= 2: - # Compare numbers among two files - ratio_df = compare_frames[1] / compare_frames[0] - frames.append(ratio_df) - compare_frames.pop(1) + base = compare_frames[0] + current = compare_frames[-1] + ratio = current / base + ratio = ratio.mask(base == 0) # avoid inf when baseline is 0 + ratio.name = f"Ratio 1 vs {len(compare_frames)}" + frames.append(ratio) + # 4) concat on columns with aligned MultiIndex; + # then reset_index to return keys as columns concat_df = pd.concat(frames, axis=1) + concat_df = concat_df.reset_index(drop=True).reset_index() + if "index" in concat_df.columns: + concat_df = concat_df.drop(columns=["index"]) + + # Ensure key/info columns appear first (in your info_cols order) + front = [c for c in info_cols if c in concat_df.columns] + rest = [c for c in concat_df.columns if c not in front] + concat_df = concat_df[front + rest] + print(raw_data_cols) return concat_df, raw_data_cols @@ -67,6 +152,15 @@ def split_json_by_tp_pp( df = pd.DataFrame(data) + # 
Keep only "serving" tests + name_col = next( + (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None + ) + if name_col: + df = df[ + df[name_col].astype(str).str.contains(r"serving", case=False, na=False) + ].copy() + # Handle alias column names rename_map = { "tp_size": "TP Size", @@ -181,7 +275,6 @@ def split_json_by_tp_pp( f"Expected subset: {filtered_info_cols}, " f"but DataFrame has: {list(output_df.columns)}" ) - output_df_sorted = output_df.sort_values(by=existing_group_cols) output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False) for name, group in output_groups: @@ -189,8 +282,7 @@ def split_json_by_tp_pp( text_file.write(html_msgs_for_data_cols[i]) text_file.write(html) - if plot is True: - import pandas as pd + if plot and plotly_found: import plotly.express as px df = group[raw_data_cols] From 648cdaf4375a136e95abb2aa06ed87ecab20ce9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B8=B8=EC=9E=AC=EC=9D=80?= Date: Wed, 20 Aug 2025 13:02:50 +0900 Subject: [PATCH 170/231] chore: support pytorch format in lora (#22790) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: jaeeun.kil Signed-off-by: 길재은 Signed-off-by: Duncan Moss --- vllm/lora/models.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index e6b19d4748f4..3072047a2606 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -207,6 +207,7 @@ def from_local_checkpoint( """ lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors") lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin") + lora_pt_file_path = os.path.join(lora_dir, "adapter_model.pt") new_embeddings_tensor_path = os.path.join( lora_dir, "new_embeddings.safetensors") new_embeddings_bin_file_path = os.path.join(lora_dir, @@ -255,9 +256,10 @@ def check_unexpected_modules(modules: dict): check_unexpected_modules(f) for module in f.keys(): # noqa tensors[module] = f.get_tensor(module) - elif os.path.isfile(lora_bin_file_path): - # When a bin file is provided, we rely on config to find unexpected - # modules. + elif os.path.isfile(lora_bin_file_path) or os.path.isfile( + lora_pt_file_path): + # When a bin/pt file is provided, we rely on config to find + # unexpected modules. unexpected_modules = [] target_modules = peft_helper.target_modules if not isinstance(target_modules, list): @@ -279,7 +281,10 @@ def check_unexpected_modules(modules: dict): f" target modules in {expected_lora_modules}" f" but received {unexpected_modules}." 
f" Please verify that the loaded LoRA module is correct") - tensors = torch.load(lora_bin_file_path, + lora_file_path = (lora_bin_file_path + if os.path.isfile(lora_bin_file_path) else + lora_pt_file_path) + tensors = torch.load(lora_file_path, map_location=device, weights_only=True) else: From e85b346645c0d6fa460adcf33a5cc20905523c10 Mon Sep 17 00:00:00 2001 From: Zhewen Li Date: Tue, 19 Aug 2025 21:09:27 -0700 Subject: [PATCH 171/231] [CI/Build] Also check DP in benchmarks throughput script (#23038) Co-authored-by: Simon Mo Signed-off-by: Duncan Moss --- benchmarks/benchmark_throughput.py | 4 ++-- vllm/benchmarks/throughput.py | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index c51b57968652..c7f290e1eb88 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -597,8 +597,8 @@ def validate_args(args): # https://github.com/vllm-project/vllm/issues/16222 if args.data_parallel_size > 1: raise ValueError( - "Data parallel is not supported in offline benchmark, \ - please use benchmark serving instead" + "Data parallel is not supported in offline benchmark, " + "please use benchmark serving instead" ) diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index 0c19fa6dcfdd..f022a55e625f 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -434,6 +434,14 @@ def validate_args(args): if args.backend == "mii" and args.tokenizer != args.model: raise ValueError( "Tokenizer must be the same as the model for MII backend.") + + # --data-parallel is not supported currently. + # https://github.com/vllm-project/vllm/issues/16222 + if args.data_parallel_size > 1: + raise ValueError( + "Data parallel is not supported in offline benchmark, " + "please use benchmark serving instead" + ) def add_cli_args(parser: argparse.ArgumentParser): From c43ca52b21aeeed6d44579e569397fd8113ea306 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 20 Aug 2025 13:06:42 +0800 Subject: [PATCH 172/231] [CI/Build] Sync multimodal tests (#23181) Signed-off-by: DarkLight1337 Signed-off-by: Duncan Moss --- .../multimodal/processing/test_common.py | 10 +++++--- tests/models/registry.py | 24 +++++++++---------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 0fdc182b9ee9..8aa0dc7e8e34 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -275,16 +275,17 @@ def _test_processing_correctness_one( "google/gemma-3n-E2B-it", "zai-org/glm-4v-9b", "zai-org/GLM-4.1V-9B-Thinking", + "zai-org/GLM-4.5V", "ibm-granite/granite-speech-3.3-2b", "h2oai/h2ovl-mississippi-800m", + "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", + "HuggingFaceM4/Idefics3-8B-Llama3", "internlm/Intern-S1", "OpenGVLab/InternVL2-1B", "OpenGVLab/InternVL3-1B", - "HuggingFaceM4/Idefics3-8B-Llama3", - "HuggingFaceTB/SmolVLM2-2.2B-Instruct", + "Kwai-Keye/Keye-VL-8B-Preview", "moonshotai/Kimi-VL-A3B-Instruct", "meta-llama/Llama-4-Scout-17B-16E-Instruct", - "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", "llava-hf/llava-1.5-7b-hf", "llava-hf/llava-v1.6-mistral-7b-hf", "llava-hf/LLaVA-NeXT-Video-7B-hf", @@ -315,10 +316,13 @@ def _test_processing_correctness_one( "Qwen/Qwen2-Audio-7B-Instruct", "Qwen/Qwen2.5-Omni-3B", "Skywork/Skywork-R1V-38B", + "HuggingFaceTB/SmolVLM2-2.2B-Instruct", + 
"stepfun-ai/step3", "fixie-ai/ultravox-v0_5-llama-3_2-1b", "openai/whisper-large-v3", "omni-research/Tarsier-7b", "omni-research/Tarsier2-Recap-7b", + "mistralai/Voxtral-Mini-3B-2507", ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) diff --git a/tests/models/registry.py b/tests/models/registry.py index cbdc9edbbc9d..28fe9063169e 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -215,9 +215,6 @@ def check_available_online( "HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124", trust_remote_code=True, is_available_online=False), - "HCXVisionForCausalLM": _HfExamplesInfo( - "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", - trust_remote_code=True), "InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b", trust_remote_code=True), "InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b", @@ -298,8 +295,7 @@ def check_available_online( "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"), "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"), "Step3TextForCausalLM": _HfExamplesInfo("stepfun-ai/step3", - trust_remote_code=True, - is_available_online=False), + trust_remote_code=True), "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct", trust_remote_code=True), "TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B", @@ -405,22 +401,24 @@ def check_available_online( hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501 "Glm4vForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.1V-9B-Thinking"), # noqa: E501 "Glm4vMoeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V", - is_available_online=False), # noqa: E501 + min_transformers_version="4.56"), # noqa: E501 "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m", trust_remote_code=True, extras={"2b": "h2oai/h2ovl-mississippi-2b"}, # noqa: E501 max_transformers_version="4.48", # noqa: E501 transformers_version_reason="HF model is not compatible."), # noqa: E501 + "HCXVisionForCausalLM": _HfExamplesInfo("naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", # noqa: E501 + trust_remote_code=True), "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501 {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}, # noqa: E501 min_transformers_version="4.55.1", transformers_version_reason="HF model broken in 4.55.0"), # noqa: E501 + "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1", + trust_remote_code=True), # noqa: E501 "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", extras={"2B": "OpenGVLab/InternVL2-2B", "3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501 trust_remote_code=True), - "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1", - trust_remote_code=True), "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501 trust_remote_code=True), "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501 @@ -464,9 +462,10 @@ def check_available_online( transformers_version_reason="HF model is not compatible", # noqa: E501 extras={"1.6-llama": "AIDC-AI/Ovis1.6-Llama3.2-3B", "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}), # noqa: E501 - "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True, - max_transformers_version="4.53", - transformers_version_reason="HF model is not compatible"), # noqa: E501 + "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", + 
trust_remote_code=True, + max_transformers_version="4.53", + transformers_version_reason="HF model is not compatible"), # noqa: E501 "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224", # noqa: E501 extras={"v2": "google/paligemma2-3b-ft-docci-448"}), # noqa: E501 "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct", @@ -496,8 +495,7 @@ def check_available_online( min_transformers_version="4.55.1", transformers_version_reason="HF model broken in 4.55.0"), # noqa: E501 "Step3VLForConditionalGeneration": _HfExamplesInfo("stepfun-ai/step3", - trust_remote_code=True, - is_available_online=False), + trust_remote_code=True), "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501 trust_remote_code=True), "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"), # noqa: E501 From 44862d8671fa51755d14f197314adce3b2d4adb2 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 19 Aug 2025 22:50:29 -0700 Subject: [PATCH 173/231] [BugFix] Fix stuck stats/metrics after requests are aborted (#22995) Signed-off-by: Nick Hill Signed-off-by: Duncan Moss --- tests/entrypoints/openai/test_metrics.py | 95 +++++++++++++++++++++++- vllm/v1/core/block_pool.py | 7 +- vllm/v1/core/sched/scheduler.py | 9 ++- 3 files changed, 106 insertions(+), 5 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 9107d089834b..ff2e7004ff9f 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - +import asyncio import subprocess import sys import tempfile @@ -294,6 +294,99 @@ async def test_metrics_exist(server: RemoteOpenAIServer, assert metric in response.text +@pytest.mark.asyncio +async def test_abort_metrics_reset(server: RemoteOpenAIServer, + client: openai.AsyncClient, use_v1: bool): + + running_requests, waiting_requests, kv_cache_usage = ( + _get_running_metrics_from_api(server)) + + # Expect no running requests or kvcache usage + assert running_requests == 0 + assert waiting_requests == 0 + assert kv_cache_usage == 0.0 + + # Start some long-running requests that we can abort + tasks = [] + for _ in range(3): + task = asyncio.create_task( + client.completions.create( + model=MODEL_NAME, + prompt=_TOKENIZED_PROMPT, + max_tokens=100, # Long generation to give time to abort + temperature=0.0)) + tasks.append(task) + + # Wait a bit for requests to start processing + await asyncio.sleep(0.5) + + # Check that we have running requests + running_requests, waiting_requests, kv_cache_usage = ( + _get_running_metrics_from_api(server)) + + # Expect running requests and kvcache usage + assert running_requests > 0 + assert kv_cache_usage > 0 + + # Cancel all tasks to abort the requests + for task in tasks: + task.cancel() + + # Wait for cancellations to be processed + await asyncio.sleep(1.0) + + # Check that metrics have reset to zero + response = requests.get(server.url_for("metrics")) + assert response.status_code == HTTPStatus.OK + + # Verify running and waiting requests counts and KV cache usage are zero + running_requests_after, waiting_requests_after, kv_cache_usage_after = ( + _get_running_metrics_from_api(server)) + + assert running_requests_after == 0,\ + (f"Expected 0 running requests after abort, got " + f"{running_requests_after}") + assert waiting_requests_after == 0,\ + (f"Expected 0 waiting 
requests after abort, got " + f"{waiting_requests_after}") + assert kv_cache_usage_after == 0,\ + (f"Expected 0% KV cache usage after abort, got " + f"{kv_cache_usage_after}") + + +def _get_running_metrics_from_api(server: RemoteOpenAIServer): + """Return (running_count, waiting_count, kv_cache_usage)""" + + response = requests.get(server.url_for("metrics")) + assert response.status_code == HTTPStatus.OK + + # Verify running and waiting requests counts and KV cache usage are zero + running_requests, waiting_requests, kv_cache_usage = None, None, None + + for family in text_string_to_metric_families(response.text): + if family.name == "vllm:num_requests_running": + for sample in family.samples: + if sample.name == "vllm:num_requests_running": + running_requests = sample.value + break + elif family.name == "vllm:num_requests_waiting": + for sample in family.samples: + if sample.name == "vllm:num_requests_waiting": + waiting_requests = sample.value + break + elif family.name == "vllm:gpu_cache_usage_perc": + for sample in family.samples: + if sample.name == "vllm:gpu_cache_usage_perc": + kv_cache_usage = sample.value + break + + assert running_requests is not None + assert waiting_requests is not None + assert kv_cache_usage is not None + + return running_requests, waiting_requests, kv_cache_usage + + def test_metrics_exist_run_batch(use_v1: bool): input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}""" # noqa: E501 diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index 839297135fe0..fdd96c3e9557 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -298,7 +298,12 @@ def get_usage(self) -> float: Returns: The KV cache usage (between 0.0 and 1.0). """ - return 1.0 - (self.get_num_free_blocks() / self.num_gpu_blocks) + + # Subtract 1 to account for null block. + total_gpu_blocks = self.num_gpu_blocks - 1 + if not total_gpu_blocks: + return 0 + return 1.0 - (self.get_num_free_blocks() / total_gpu_blocks) def take_events(self) -> list[KVCacheEvent]: """Atomically takes all events and clears the queue. diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index f9a7e2101407..4b167da5c8f8 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -902,10 +902,13 @@ def update_from_output( finished_requests=finished_set) finished_req_ids.clear() - if engine_core_outputs: + if (stats := self.make_stats(spec_decoding_stats)) is not None: # Return stats to only one of the front-ends. - next(iter(engine_core_outputs.values())).scheduler_stats = ( - self.make_stats(spec_decoding_stats)) + if (eco := next(iter(engine_core_outputs.values()), None)) is None: + # We must return the stats even if there are no request + # outputs this step. 
+ engine_core_outputs[0] = eco = EngineCoreOutputs() + eco.scheduler_stats = stats return engine_core_outputs From 536b4a264414d1c608b308aee903189054dc893a Mon Sep 17 00:00:00 2001 From: who who who Date: Wed, 20 Aug 2025 14:24:37 +0800 Subject: [PATCH 174/231] fix cuda graph (#22721) Signed-off-by: fsx950223 Signed-off-by: Duncan Moss --- vllm/v1/attention/backends/rocm_aiter_fa.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 7d09ac0a4a3a..36b5853bfdcb 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with AiterFlashAttention.""" from dataclasses import dataclass -from typing import ClassVar, Optional +from typing import Optional import torch @@ -11,7 +11,8 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, +from vllm.v1.attention.backends.utils import (AttentionCGSupport, + AttentionMetadataBuilder, CommonAttentionMetadata) from vllm.v1.kv_cache_interface import AttentionSpec @@ -231,7 +232,7 @@ class AiterFlashAttentionMetadata: class AiterFlashAttentionMetadataBuilder( AttentionMetadataBuilder[AiterFlashAttentionMetadata]): - full_cudagraph_supported: ClassVar[bool] = True + cudagraph_support = AttentionCGSupport.ALWAYS def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): From b2fd7cca83a2abd7dcfebf70a845c9fd5bc115d6 Mon Sep 17 00:00:00 2001 From: Calvin Chen Date: Wed, 20 Aug 2025 18:16:27 +0800 Subject: [PATCH 175/231] [Model] use autoWeightsLoader for gptoss (#22446) Signed-off-by: calvin chen Signed-off-by: Duncan Moss --- vllm/model_executor/models/gpt_oss.py | 432 +++++++++++++------------- 1 file changed, 224 insertions(+), 208 deletions(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 2f5d9ddd9054..cd93f0ef1e31 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -27,7 +27,8 @@ from vllm.sequence import IntermediateTensors from vllm.utils import cdiv -from .utils import extract_layer_index, maybe_prefix +from .utils import (AutoWeightsLoader, WeightsMapper, extract_layer_index, + maybe_prefix) class OAIAttention(nn.Module): @@ -203,6 +204,7 @@ def __init__( super().__init__() self.config = vllm_config.model_config.hf_config self.quant_config = vllm_config.quant_config + self.parallel_config = vllm_config.parallel_config self.config.hidden_size = self.config.hidden_size self.embedding = VocabParallelEmbedding( self.config.vocab_size, @@ -225,64 +227,26 @@ def forward(self, input_ids: torch.Tensor, x = self.norm(x) return x - -class GptOssForCausalLM(nn.Module): - - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): - super().__init__() - self.vllm_config = vllm_config - self.model_config = vllm_config.model_config.hf_config - self.model = GptOssModel( - vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model"), - ) - self.lm_head = ParallelLMHead( - self.model_config.vocab_size, - self.model_config.hidden_size, - ) - self.logits_processor = LogitsProcessor(self.model_config.vocab_size) - - def forward(self, - input_ids: torch.Tensor, - positions: torch.Tensor, - 
intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None) -> torch.Tensor: - assert intermediate_tensors is None - assert inputs_embeds is None - return self.model(input_ids, positions) - - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits - def _load_weights_mxfp4( - self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - rename_mapping = { - "self_attn": "attn", - "input_layernorm.weight": "attn.norm.weight", - "post_attention_layernorm.weight": "mlp.norm.weight", - "embed_tokens": "embedding", - } - - def maybe_rename(name: str) -> str: - for remap_name, new_name in rename_mapping.items(): - if remap_name in name: - return name.replace(remap_name, new_name) - return name - + self, + ep_rank_end: int, + ep_rank_start: int, + heads_per_rank: int, + head_start: int, + weights: Iterable[tuple[str, torch.Tensor]], + stacked_params_mapping: list[tuple[str, ...]], + ) -> set[str]: params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() + mxfp4_block = 32 + use_ep = self.parallel_config.enable_expert_parallel + num_experts = self.config.num_local_experts tp_rank = get_tensor_model_parallel_rank() tp_size = get_tensor_model_parallel_world_size() - intermediate_size = self.model_config.intermediate_size + + intermediate_size = self.config.intermediate_size intermediate_size_block = intermediate_size // mxfp4_block per_rank_intermediate_size_block = cdiv(intermediate_size_block, tp_size) @@ -294,33 +258,12 @@ def maybe_rename(name: str) -> str: tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, intermediate_size) - # Attention heads per rank - heads_per_rank = self.model_config.num_attention_heads // tp_size - head_start = tp_rank * heads_per_rank - - use_ep = self.vllm_config.parallel_config.enable_expert_parallel - ep_size = get_ep_group().world_size - ep_rank = get_ep_group().rank - num_experts = self.model_config.num_local_experts - experts_per_rank = num_experts // ep_size - ep_rank_start = ep_rank * experts_per_rank - ep_rank_end = (ep_rank + 1) * experts_per_rank - for name, weight in weights: # FIXME(woosuk): Remove this after testing. weight = weight.cuda() - if "gate_up_proj_blocks" in name: - # Handle MLP gate and up projection weights - new_name = name.replace("gate_up_proj_blocks", "w13_weight") - - # flat weight from (E, 2 * N, block_size, entry_per_block) - # to (E, 2 * N, -1), shouldn't trigger copy for contiguous - weight = weight.view(num_experts, 2 * intermediate_size, - -1).contiguous() - - # Extract gate and up projection parts - # since the weight is shuffled, we can slice directly + if ".w13_weight_scale" in name: + # Handle MLP gate and up projection weights scale if use_ep: narrow_weight = weight[ep_rank_start:ep_rank_end, ...] else: @@ -328,43 +271,44 @@ def maybe_rename(name: str) -> str: 2 * tp_rank_start:2 * tp_rank_end, ...] 
- param = params_dict[new_name] + param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, narrow_weight, - weight_name=new_name, + weight_name=name, shard_id=None, expert_id=None) - loaded_params.add(new_name) - - elif "down_proj_blocks" in name: + loaded_params.add(name) + continue + elif ".w2_weight_scale" in name: # Handle MLP down projection weights - new_name = name.replace("down_proj_blocks", "w2_weight") - # same flatten here, but since 2 mx4 value are packed in 1 - # uint8, divide by 2 - weight = weight.view(num_experts, -1, - intermediate_size // 2).contiguous() if use_ep: narrow_weight = weight[ep_rank_start:ep_rank_end, ...] else: - narrow_weight = weight[..., - tp_rank_start // 2:tp_rank_end // 2] + narrow_weight = weight[..., tp_rank_start // + mxfp4_block:tp_rank_end // + mxfp4_block] - param = params_dict[new_name] + param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, narrow_weight, - weight_name=new_name, + weight_name=name, shard_id=None, expert_id=None) - loaded_params.add(new_name) + loaded_params.add(name) + continue + elif ".w13_weight" in name: + # Handle MLP gate and up projection weights + # flat weight from (E, 2 * N, block_size, entry_per_block) + # to (E, 2 * N, -1), shouldn't trigger copy for contiguous + weight = weight.view(num_experts, 2 * intermediate_size, + -1).contiguous() - elif "gate_up_proj_scales" in name: - # Handle MLP gate and up projection weights scale - new_name = name.replace("gate_up_proj_scales", - "w13_weight_scale") + # Extract gate and up projection parts + # since the weight is shuffled, we can slice directly if use_ep: narrow_weight = weight[ep_rank_start:ep_rank_end, ...] else: @@ -372,39 +316,40 @@ def maybe_rename(name: str) -> str: 2 * tp_rank_start:2 * tp_rank_end, ...] - param = params_dict[new_name] + param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, narrow_weight, - weight_name=new_name, + weight_name=name, shard_id=None, expert_id=None) - loaded_params.add(new_name) - - elif "down_proj_scales" in name: + loaded_params.add(name) + continue + elif ".w2_weight" in name: # Handle MLP down projection weights - new_name = name.replace("down_proj_scales", "w2_weight_scale") + # same flatten here, but since 2 mx4 value are packed in 1 + # uint8, divide by 2 + weight = weight.view(num_experts, -1, + intermediate_size // 2).contiguous() if use_ep: narrow_weight = weight[ep_rank_start:ep_rank_end, ...] else: - narrow_weight = weight[..., tp_rank_start // - mxfp4_block:tp_rank_end // - mxfp4_block] + narrow_weight = weight[..., + tp_rank_start // 2:tp_rank_end // 2] - param = params_dict[new_name] + param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, narrow_weight, - weight_name=new_name, + weight_name=name, shard_id=None, expert_id=None) - loaded_params.add(new_name) - elif "gate_up_proj_bias" in name: + loaded_params.add(name) + continue + elif ".w13_bias" in name: # Handle MLP gate and up projection biases - new_name = name.replace("gate_up_proj_bias", "w13_bias") - # Extract gate and up projection bias parts if use_ep: narrow_weight = weight[ep_rank_start:ep_rank_end, ...] 
@@ -412,20 +357,19 @@ def maybe_rename(name: str) -> str: narrow_weight = weight[:, 2 * tp_rank_start:2 * tp_rank_end] - param = params_dict[new_name] + param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, narrow_weight, - weight_name=new_name, + weight_name=name, shard_id=None, expert_id=None) - loaded_params.add(new_name) - - elif "down_proj_bias" in name: + loaded_params.add(name) + continue + elif ".w2_bias" in name: # Handle MLP down projection bias - new_name = name.replace("down_proj_bias", "w2_bias") - param = params_dict[new_name] + param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) if use_ep: @@ -436,87 +380,69 @@ def maybe_rename(name: str) -> str: weight.zero_() weight_loader(param, weight, - weight_name=new_name, + weight_name=name, shard_id=None, expert_id=None) - loaded_params.add(new_name) + loaded_params.add(name) + continue elif "sinks" in name: # Handle attention sinks (distributed across ranks) - name = name.replace("self_attn", "attn") param = params_dict[name] narrow_weight = weight.narrow(0, head_start, heads_per_rank) param.data.copy_(narrow_weight) loaded_params.add(name) - elif "q_proj" in name or "k_proj" in name or "v_proj" in name: - shard_id = ("q" if "q_proj" in name else - "k" if "k_proj" in name else "v") - name = name.replace("self_attn", "attn") - param_name = name.replace(f"{shard_id}_proj", "qkv") - param = params_dict[param_name] - weight_loader = param.weight_loader - weight_loader(param, weight, loaded_shard_id=shard_id) - loaded_params.add(param_name) + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if weight_loader == default_weight_loader: + weight_loader(param, weight) + else: + weight_loader(param, weight, shard_id) + break else: # Handle all other weights with potential renaming - renamed_name = maybe_rename(name) - if renamed_name not in params_dict: + if name not in params_dict: continue - param = params_dict[renamed_name] + param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, weight) - loaded_params.add(renamed_name) - + loaded_params.add(name) return loaded_params def _load_weights_other( - self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - rename_mapping = { - "self_attn": "attn", - "input_layernorm.weight": "attn.norm.weight", - "post_attention_layernorm.weight": "mlp.norm.weight", - "embed_tokens": "embedding", - } - - def maybe_rename(name: str) -> str: - for remap_name, new_name in rename_mapping.items(): - if remap_name in name: - return name.replace(remap_name, new_name) - return name - + self, + ep_rank_start: int, + ep_rank_end: int, + heads_per_rank: int, + head_start: int, + weights: Iterable[tuple[str, torch.Tensor]], + stacked_params_mapping: list[tuple[str, ...]], + ) -> set[str]: params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() + use_ep = self.parallel_config.enable_expert_parallel + tp_rank = get_tensor_model_parallel_rank() tp_size = get_tensor_model_parallel_world_size() - intermediate_size = self.model_config.intermediate_size + intermediate_size = self.config.intermediate_size per_rank_intermediate_size = cdiv(intermediate_size, tp_size) # Calculate common slicing bounds for current rank 
tp_rank_start = tp_rank * per_rank_intermediate_size tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, intermediate_size) - # Attention heads per rank - heads_per_rank = self.model_config.num_attention_heads // tp_size - head_start = tp_rank * heads_per_rank - - use_ep = self.vllm_config.parallel_config.enable_expert_parallel - ep_size = get_ep_group().world_size - ep_rank = get_ep_group().rank - num_experts = self.model_config.num_local_experts - experts_per_rank = num_experts // ep_size - ep_rank_start = ep_rank * experts_per_rank - ep_rank_end = (ep_rank + 1) * experts_per_rank - for name, weight in weights: - if ".experts.gate_up_proj" in name and "bias" not in name: + if ".w13_weight" in name: # Handle MLP gate and up projection weights - new_name = name.replace(".experts.gate_up_proj", - ".experts.w13_weight") - # Extract gate and up projection parts - # since the weight is shuffled, we can slice directly if use_ep: narrow_weight = weight[ep_rank_start:ep_rank_end, ...] else: @@ -524,30 +450,25 @@ def maybe_rename(name: str) -> str: 2 * tp_rank_start:2 * tp_rank_end] narrow_weight = narrow_weight.permute(0, 2, 1).contiguous() - param = params_dict[new_name] + param = params_dict[name] param.copy_(narrow_weight) - loaded_params.add(new_name) - - elif ".experts.down_proj" in name and "bias" not in name: + loaded_params.add(name) + continue + elif ".w2_weight" in name: # Handle MLP down projection weights - new_name = name.replace(".experts.down_proj", - ".experts.w2_weight") - if use_ep: narrow_weight = weight[ep_rank_start:ep_rank_end, ...] else: narrow_weight = weight[:, tp_rank_start:tp_rank_end, :] narrow_weight = narrow_weight.permute(0, 2, 1).contiguous() - param = params_dict[new_name] + param = params_dict[name] param.copy_(narrow_weight) - loaded_params.add(new_name) - - elif "gate_up_proj_bias" in name: + loaded_params.add(name) + continue + elif ".w13_bias" in name: # Handle MLP gate and up projection biases - new_name = name.replace("gate_up_proj_bias", "w13_bias") - # Extract gate and up projection bias parts if use_ep: narrow_weight = weight[ep_rank_start:ep_rank_end, ...] @@ -555,60 +476,155 @@ def maybe_rename(name: str) -> str: narrow_weight = weight[:, 2 * tp_rank_start:2 * tp_rank_end] - param = params_dict[new_name] - + param = params_dict[name] param.copy_(narrow_weight) - loaded_params.add(new_name) - - elif "down_proj_bias" in name: + loaded_params.add(name) + continue + elif ".w2_bias" in name: # Handle MLP down projection bias - new_name = name.replace("down_proj_bias", "w2_bias") - if use_ep: weight = weight[ep_rank_start:ep_rank_end, ...] 
else: # (only load on rank 0 to avoid duplication) if tp_rank != 0: weight.zero_() - param = params_dict[new_name] + param = params_dict[name] param.copy_(weight) - loaded_params.add(new_name) + loaded_params.add(name) + continue elif "sinks" in name: # Handle attention sinks (distributed across ranks) - name = name.replace("self_attn", "attn") param = params_dict[name] narrow_weight = weight.narrow(0, head_start, heads_per_rank) param.data.copy_(narrow_weight) loaded_params.add(name) - elif "q_proj" in name or "k_proj" in name or "v_proj" in name: - shard_id = ("q" if "q_proj" in name else - "k" if "k_proj" in name else "v") - name = name.replace("self_attn", "attn") - param_name = name.replace(f"{shard_id}_proj", "qkv") - param = params_dict[param_name] - weight_loader = param.weight_loader - weight_loader(param, weight, loaded_shard_id=shard_id) - loaded_params.add(param_name) + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if weight_loader == default_weight_loader: + weight_loader(param, weight) + else: + weight_loader(param, weight, shard_id) + break else: # Handle all other weights with potential renaming - - renamed_name = maybe_rename(name) - if renamed_name not in params_dict: + if name not in params_dict: continue - param = params_dict[renamed_name] + param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, weight) - loaded_params.add(renamed_name) - + loaded_params.add(name) return loaded_params def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - quant_method = (self.model_config.quantization_config['quant_method'] - if hasattr(self.model_config, "quantization_config") - else None) + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv", ".q_proj", "q"), + (".qkv", ".k_proj", "k"), + (".qkv", ".v_proj", "v"), + ] + + tp_rank = get_tensor_model_parallel_rank() + tp_size = get_tensor_model_parallel_world_size() + + # Attention heads per rank + heads_per_rank = self.config.num_attention_heads // tp_size + head_start = tp_rank * heads_per_rank + + ep_size = get_ep_group().world_size + ep_rank = get_ep_group().rank + num_experts = self.config.num_local_experts + experts_per_rank = num_experts // ep_size + ep_rank_start = ep_rank * experts_per_rank + ep_rank_end = (ep_rank + 1) * experts_per_rank + + quant_method = (self.config.quantization_config['quant_method'] if + hasattr(self.config, "quantization_config") else None) if quant_method == "mxfp4": - return self._load_weights_mxfp4(weights) + return self._load_weights_mxfp4(ep_rank_end, ep_rank_start, + heads_per_rank, head_start, + weights, stacked_params_mapping) else: - return self._load_weights_other(weights) + return self._load_weights_other(ep_rank_end, ep_rank_start, + heads_per_rank, head_start, + weights, stacked_params_mapping) + + +class GptOssForCausalLM(nn.Module): + packed_modules_mapping = {"qkv": ["q_proj", "k_proj", "v_proj"]} + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".self_attn.": ".attn.", + ".post_attention_layernorm.": ".mlp.norm.", + }, + orig_to_new_suffix={ + ".embed_tokens.weight": ".embedding.weight", + ".input_layernorm.weight": ".attn.norm.weight", + ".post_attention_layernorm.weight": ".mlp.norm.weight", + + # MoE MXFP4 weights + 
".gate_up_proj_blocks": ".w13_weight", + ".down_proj_blocks": ".w2_weight", + ".gate_up_proj_scales": ".w13_weight_scale", + ".down_proj_scales": ".w2_weight_scale", + + # MoE other weights + ".gate_up_proj": ".w13_weight", + ".down_proj": ".w2_weight", + + # MoE Bias + ".gate_up_proj_bias": ".w13_bias", + ".down_proj_bias": ".w2_bias", + }, + ) + + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + ): + super().__init__() + self.vllm_config = vllm_config + self.config = vllm_config.model_config.hf_config + + self.model = GptOssModel( + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model"), + ) + self.lm_head = ParallelLMHead( + self.config.vocab_size, + self.config.hidden_size, + ) + self.logits_processor = LogitsProcessor(self.config.vocab_size) + + def forward(self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None) -> torch.Tensor: + assert intermediate_tensors is None + assert inputs_embeds is None + return self.model(input_ids, positions) + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] + if self.config.tie_word_embeddings else None), + ) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) From 3cfcd131507e91125b189e151a95b0a5e260a62d Mon Sep 17 00:00:00 2001 From: Shiming Zhang Date: Wed, 20 Aug 2025 18:46:59 +0800 Subject: [PATCH 176/231] Fix missing quotes (#23242) Signed-off-by: Shiming Zhang Signed-off-by: Duncan Moss --- docs/deployment/frameworks/dstack.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/deployment/frameworks/dstack.md b/docs/deployment/frameworks/dstack.md index 23dc58c974ed..fe4d87f78f2a 100644 --- a/docs/deployment/frameworks/dstack.md +++ b/docs/deployment/frameworks/dstack.md @@ -9,7 +9,7 @@ vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), To install dstack client, run: ```bash -pip install "dstack[all] +pip install dstack[all] dstack server ``` From 3e3270424a090dc1950a5f373409395b352a219a Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Wed, 20 Aug 2025 04:01:31 -0700 Subject: [PATCH 177/231] [Model] Support deepseek with eagle (#21086) Signed-off-by: Xin Yang Signed-off-by: Duncan Moss --- tests/models/registry.py | 3 + tests/v1/e2e/test_spec_decode.py | 6 +- vllm/model_executor/models/deepseek_eagle.py | 246 +++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 4 files changed, 255 insertions(+), 1 deletion(-) create mode 100644 vllm/model_executor/models/deepseek_eagle.py diff --git a/tests/models/registry.py b/tests/models/registry.py index 28fe9063169e..739d96227971 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -530,6 +530,9 @@ def check_available_online( "DeepSeekMTPModel": _HfExamplesInfo("luccafong/deepseek_mtp_main_random", speculative_model="luccafong/deepseek_mtp_draft_random", # noqa: E501 trust_remote_code=True), + "EagleDeepSeekMTPModel": _HfExamplesInfo("eagle618/deepseek-v3-random", + speculative_model="eagle618/eagle-deepseek-v3-random", # noqa: E501 + trust_remote_code=True), "EagleLlamaForCausalLM": 
_HfExamplesInfo("yuhuili/EAGLE-LLaMA3-Instruct-8B", trust_remote_code=True, speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B", diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 7b3f45831279..bd0fa6b80781 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -144,6 +144,8 @@ def test_ngram_correctness( "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), True, marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), + (("eagle", "eagle618/deepseek-v3-random", + "eagle618/eagle-deepseek-v3-random", 1), False), ], ids=[ # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501 @@ -151,7 +153,8 @@ def test_ngram_correctness( "llama3_eagle", "llama3_eagle3", "llama4_eagle", - "llama4_eagle_mm" + "llama4_eagle_mm", + "deepseek_eagle" ]) @pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform()) @@ -177,6 +180,7 @@ def test_eagle_correctness( ''' with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") + m.setenv("VLLM_MLA_DISABLE", "1") m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) if (attn_backend == "TRITON_ATTN_VLLM_V1" diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py new file mode 100644 index 000000000000..0c9c83cf6100 --- /dev/null +++ b/vllm/model_executor/models/deepseek_eagle.py @@ -0,0 +1,246 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Iterable +from typing import Optional + +import torch +import torch.nn as nn + +from vllm.compilation.decorators import support_torch_compile +from vllm.config import VllmConfig +from vllm.distributed.parallel_state import get_pp_group +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.models.deepseek_v2 import (DeepseekV2DecoderLayer, + DeepseekV3ForCausalLM) +from vllm.model_executor.sampling_metadata import SamplingMetadata + +from .utils import AutoWeightsLoader, maybe_prefix + + +@support_torch_compile +class DeepseekV2Model(nn.Module): + + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + start_layer_id: int = 0, + ) -> None: + super().__init__() + self.config = vllm_config. 
\ + speculative_config.draft_model_config.hf_config + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.vocab_size = self.config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + self.config.vocab_size, + self.config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "embed_tokens"), + ) + + self.layers = nn.ModuleList([ + DeepseekV2DecoderLayer( + self.config, + prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"), + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + ) for i in range(self.config.num_hidden_layers) + ]) + + self.fc = nn.Linear( + self.config.model.hidden_size * 2, + self.config.model.hidden_size, + bias=False, + ) + + self.enorm = RMSNorm(self.config.hidden_size, + eps=self.config.rms_norm_eps) + self.hnorm = RMSNorm(self.config.hidden_size, + eps=self.config.rms_norm_eps) + self.norm = RMSNorm(self.config.hidden_size, + eps=self.config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + input_embeds = self.embed_tokens(input_ids) + + inputs = torch.cat( + [self.enorm(input_embeds), + self.hnorm(hidden_states)], dim=-1) + hidden_states = self.fc(inputs) + residual = None + for layer in self.layers: + hidden_states, residual = layer( + positions, + hidden_states, + residual, + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states, hidden_states + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ("fused_qkv_a_proj", "q_a_proj", 0), + ("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.n_routed_experts) + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + for param_name, weight_name, shard_id in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if ("mlp.experts." in name) and name not in params_dict: + continue + name_mapped = name.replace(weight_name, param_name) + + # QKV fusion is optional, fall back to normal + # weight loading if it's not enabled + # if go with fusion option, then update name + if ((param_name == "fused_qkv_a_proj") + and name_mapped not in params_dict): + continue + else: + name = name_mapped + + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader( + param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id, + ) + break + else: + # if PP disabled then draft will share embed with target + if get_pp_group().world_size == 1 and \ + "embed_tokens." in name: + continue + + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class EagleDeepseekV3ForCausalLM(DeepseekV3ForCausalLM): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + nn.Module.__init__(self) + self.config = vllm_config. \ + speculative_config.draft_model_config.hf_config + quant_config = vllm_config.quant_config + target_layer_num = vllm_config.model_config.get_num_layers( + vllm_config.parallel_config) + self.model = DeepseekV2Model(vllm_config=vllm_config, + prefix="model", + start_layer_id=target_layer_num) + + self.lm_head = ParallelLMHead(self.config.vocab_size, + self.config.hidden_size, + quant_config=quant_config) + + logit_scale = getattr(self.config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.config.vocab_size, + scale=logit_scale) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + if inputs_embeds is not None: + raise NotImplementedError( + f"{type(self).__name__} does not support multimodal inputs yet." + ) + return self.model(input_ids, positions, hidden_states) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader( + self, + skip_prefixes=None, + ) + + model_weights = {} + for name, loaded_weight in weights: + if "lm_head" not in name: + name = "model." 
+ name + model_weights[name] = loaded_weight + loader.load_weights(model_weights.items()) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 8728684d8e68..a94231b0f846 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -264,6 +264,7 @@ "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501 # "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"), + "EagleDeepSeekMTPModel": ("deepseek_eagle", "EagleDeepseekV3ForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), "Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"), "MedusaModel": ("medusa", "Medusa"), From bd65d52494360392f0d60b266943365b7376ec37 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 20 Aug 2025 19:09:18 +0800 Subject: [PATCH 178/231] [Bugfix] Ensure correctness of Cohere2Vision processing (#23245) Signed-off-by: DarkLight1337 Signed-off-by: Duncan Moss --- .../multimodal/processing/test_common.py | 1 + vllm/model_executor/models/aya_vision.py | 3 +- vllm/model_executor/models/cohere2_vision.py | 71 ++++++++++++++----- 3 files changed, 56 insertions(+), 19 deletions(-) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 8aa0dc7e8e34..d5b1de834a61 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -268,6 +268,7 @@ def _test_processing_correctness_one( "CohereForAI/aya-vision-8b", "Salesforce/blip2-opt-2.7b", "facebook/chameleon-7b", + "CohereLabs/command-a-vision-07-2025", "deepseek-ai/deepseek-vl2-tiny", "microsoft/Florence-2-base", "adept/fuyu-8b", diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index b02a973d942c..687c82ded9d0 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -250,8 +250,7 @@ def _get_prompt_updates( image_processor = hf_processor.image_processor def get_replacement(item_idx: int): - images: ImageProcessorItems = mm_items.get("image", - ImageProcessorItems) + images = mm_items.get_items("image", ImageProcessorItems) image_size: ImageSize = images.get_image_size(item_idx) num_patches = self.info.get_num_patches( image_width=image_size.width, diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index bc526fd661b6..4682a8a428a0 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -10,6 +10,8 @@ from torch import nn from transformers import BatchFeature, PretrainedConfig from transformers.models.cohere2_vision import Cohere2VisionConfig +from transformers.models.cohere2_vision.image_processing_cohere2_vision_fast import ( # noqa: E501 + get_optimal_tiled_canvas) from transformers.models.cohere2_vision.processing_cohere2_vision import ( Cohere2VisionProcessor) @@ -150,14 +152,46 @@ def get_image_size_with_most_features(self) -> ImageSize: max_patches = image_processor.max_patches return ImageSize(height=height * max_patches, width=width) - def get_num_patches(self, image_width: int, image_height: int) -> int: + def get_num_patches( + self, + *, + image_width: int, + image_height: int, + processor: Optional[Cohere2VisionProcessor], + ) -> int: """ Calculate the number of image patches for a given image. 
Uses the HF processor to determine the actual number of patches. """ - return self.get_hf_processor( - ).image_processor.get_number_of_image_patches(image_height, - image_width, {}) + if processor is None: + processor = self.get_hf_processor() + + image_processor = processor.image_processor + + # The current implementation of get_number_of_image_patches + # is incorrect, so we patch it here. + # return image_processor.get_number_of_image_patches(image_height, + # image_width, {}) + + min_patches = image_processor.min_patches + max_patches = image_processor.max_patches + patch_size = image_processor.size + crop_to_patches = image_processor.crop_to_patches + + if not crop_to_patches: + return 1 + + num_columns, num_rows = get_optimal_tiled_canvas( + (image_height, image_width), + (patch_size["height"], patch_size["width"]), + min_patches, + max_patches, + ) + num_patches = num_columns * num_rows + if num_patches > 1: + num_patches += 1 # Thumbnail image + + return num_patches class Cohere2VisionDummyInputsBuilder( @@ -208,6 +242,8 @@ def _call_hf_processor( # Ensure num_patches is available for proper tensor splitting if "num_patches" not in processed_outputs and ( images := mm_data.get("images")) is not None: + hf_processor = self.info.get_hf_processor(**mm_kwargs) + # Fallback calculation if HF processor didn't provide num_patches parsed_images = self._get_data_parser().parse_mm_data({ "image": @@ -217,8 +253,9 @@ def _call_hf_processor( num_patches = [ self.info.get_num_patches( image_width=parsed_images.get_image_size(i).width, - image_height=parsed_images.get_image_size(i).height) - for i in range(len(parsed_images)) + image_height=parsed_images.get_image_size(i).height, + processor=hf_processor, + ) for i in range(len(parsed_images)) ] processed_outputs["num_patches"] = torch.tensor(num_patches) @@ -245,25 +282,25 @@ def _get_prompt_updates( ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_token = hf_processor.image_token + img_tokens_per_tile = int(hf_processor.patch_size**2) img_line_break_token = hf_processor.img_line_break_token boi_token = hf_processor.boi_token eoi_token = hf_processor.eoi_token def get_replacement(item_idx: int): - images: ImageProcessorItems = mm_items.get("image", - ImageProcessorItems) + images = mm_items.get_items("image", ImageProcessorItems) image_size: ImageSize = images.get_image_size(item_idx) - num_patches = self.info.get_num_patches(image_size.height, - image_size.width) - img_tokens_per_tile = int(hf_processor.patch_size**2) - single_tile_tokens = image_token * img_tokens_per_tile + \ - img_line_break_token - img_string = f"{boi_token}\ - {single_tile_tokens * num_patches}\ - {eoi_token}" + num_patches = self.info.get_num_patches( + image_width=image_size.width, + image_height=image_size.height, + processor=hf_processor, + ) + patch_tokens = (image_token * img_tokens_per_tile + + img_line_break_token) + repl = f"{boi_token}{patch_tokens * num_patches}{eoi_token}" - return PromptUpdateDetails.select_text(img_string, image_token) + return PromptUpdateDetails.select_text(repl, image_token) return [ PromptReplacement( From ee0dd04db34cb39d8727f8be6f00514ac880ced7 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 20 Aug 2025 08:05:54 -0400 Subject: [PATCH 179/231] Update to flashinfer-python==0.2.12 and disable AOT compile for non-release image (#23129) Signed-off-by: mgoin Signed-off-by: Duncan Moss --- .buildkite/release-pipeline.yaml | 2 +- docker/Dockerfile | 52 
++++++++++++++++++++------------ setup.py | 2 +- 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 85d3e5638742..e20ce54ca795 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -68,7 +68,7 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - label: "Annotate release workflow" diff --git a/docker/Dockerfile b/docker/Dockerfile index 74938917781a..cfaa59868215 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -372,31 +372,45 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist # Install FlashInfer from source ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" -# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt -# We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel. -ARG FLASHINFER_GIT_REF="v0.2.11" +# Keep this in sync with "flashinfer" extra in setup.py +ARG FLASHINFER_GIT_REF="v0.2.12" +# Flag to control whether to compile FlashInfer AOT kernels +# Set to "true" to enable AOT compilation: +# docker build --build-arg FLASHINFER_AOT_COMPILE=true ... +ARG FLASHINFER_AOT_COMPILE=false RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' . /etc/environment git clone --depth 1 --recursive --shallow-submodules \ --branch ${FLASHINFER_GIT_REF} \ ${FLASHINFER_GIT_REPO} flashinfer - # Exclude CUDA arches for older versions (11.x and 12.0-12.7) - # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg. - if [[ "${CUDA_VERSION}" == 11.* ]]; then - FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9" - elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then - FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a" - else - # CUDA 12.8+ supports 10.0a and 12.0 - FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0" - fi - echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}" - # Needed to build AOT kernels pushd flashinfer - TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ - python3 -m flashinfer.aot - TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ - uv pip install --system --no-build-isolation --force-reinstall --no-deps . + if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then + # Exclude CUDA arches for older versions (11.x and 12.0-12.7) + # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg. 
+ if [[ "${CUDA_VERSION}" == 11.* ]]; then + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9" + elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a" + else + # CUDA 12.8+ supports 10.0a and 12.0 + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0" + fi + echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}" + # Build AOT kernels + TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ + python3 -m flashinfer.aot + # Install with no-build-isolation since we already built AOT kernels + TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ + uv pip install --system --no-build-isolation . \ + --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') + # Download pre-compiled cubins + TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ + python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins." + else + echo "🏗️ Installing FlashInfer without AOT compilation in JIT mode" + uv pip install --system . \ + --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') + fi popd rm -rf flashinfer BASH diff --git a/setup.py b/setup.py index cc3037ebb72c..6a3013de7937 100644 --- a/setup.py +++ b/setup.py @@ -685,7 +685,7 @@ def _read_requirements(filename: str) -> list[str]: "mistral_common[audio]"], # Required for audio processing "video": [], # Kept for backwards compatibility # FlashInfer should be updated together with the Dockerfile - "flashinfer": ["flashinfer-python==0.2.11"], + "flashinfer": ["flashinfer-python==0.2.12"], }, cmdclass=cmdclass, package_data=package_data, From 843e77bd9e49c862d1e918cad529e337948f3892 Mon Sep 17 00:00:00 2001 From: xyxinyang <43821961+xyxinyang@users.noreply.github.com> Date: Wed, 20 Aug 2025 20:41:55 +0800 Subject: [PATCH 180/231] [Model][V1] Support Ernie MTP (#22169) Signed-off-by: zhouchong Co-authored-by: zhouchong Signed-off-by: Duncan Moss --- tests/models/registry.py | 3 + vllm/config/__init__.py | 31 ++- vllm/model_executor/models/ernie_mtp.py | 287 ++++++++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + vllm/v1/spec_decode/eagle.py | 2 +- vllm/worker/worker.py | 3 +- 6 files changed, 320 insertions(+), 7 deletions(-) create mode 100644 vllm/model_executor/models/ernie_mtp.py diff --git a/tests/models/registry.py b/tests/models/registry.py index 739d96227971..6e6acfb8cd22 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -556,6 +556,9 @@ def check_available_online( is_available_online=False, speculative_model="openbmb/MiniCPM-2B-sft-bf16", tokenizer="openbmb/MiniCPM-2B-sft-bf16"), + "ErnieMTPModel": _HfExamplesInfo("baidu/ERNIE-4.5-21B-A3B-PT", + trust_remote_code=True, + speculative_model="baidu/ERNIE-4.5-21B-A3B-PT"), "Glm4MoeMTPModel": _HfExamplesInfo("zai-org/GLM-4.5", speculative_model="zai-org/GLM-4.5", min_transformers_version="4.54", diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 56a749789b6a..801fa97fe5da 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1463,7 +1463,8 @@ def get_layers_start_end_indices( from vllm.distributed.utils import get_pp_indices if (self.hf_text_config.model_type == "deepseek_mtp" or self.hf_config.model_type == "mimo_mtp" - or self.hf_config.model_type == "glm4_moe_mtp"): + or self.hf_config.model_type == "glm4_moe_mtp" + or self.hf_config.model_type == "ernie_mtp"): total_num_hidden_layers = getattr(self.hf_text_config, "num_nextn_predict_layers", 0) else: @@ 
-1911,7 +1912,8 @@ def __post_init__(self): SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa", - "mlp_speculator", "draft_model", "deepseek_mtp"] + "mlp_speculator", "draft_model", "deepseek_mtp", + "ernie_mtp"] @config @@ -2044,6 +2046,16 @@ def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig: "architectures": ["Glm4MoeMTPModel"] }) + if hf_config.model_type == "ernie4_5_moe": + hf_config.model_type = "ernie_mtp" + if hf_config.model_type == "ernie_mtp": + n_predict = getattr(hf_config, "num_nextn_predict_layers", None) + hf_config.update({ + "n_predict": n_predict, + "architectures": ["ErnieMTPModel"] + }) + return hf_config + return hf_config def __post_init__(self): @@ -2062,8 +2074,8 @@ def __post_init__(self): if self.target_model_config and \ (self.target_model_config.hf_text_config.model_type \ == "deepseek_v3" or - self.target_model_config.hf_text_config.model_type \ - == "mimo"): + self.target_model_config.hf_text_config.model_type in + ("mimo","ernie4_5_moe")): # use the draft model from the same model: self.model = self.target_model_config.model elif self.method in ("ngram", "[ngram]"): @@ -2161,6 +2173,15 @@ def __post_init__(self): "one layer. Might need some code changes " \ "to support multiple layers." ) + elif (self.draft_model_config.hf_config.model_type == + "ernie_mtp"): + self.method = "ernie_mtp" + if self.num_speculative_tokens > 1: + logger.warning( + "All Ernie MTP models only have " \ + "one layer. Might need some code changes " \ + "to support multiple layers." + ) else: self.method = "draft_model" raise NotImplementedError( @@ -2376,7 +2397,7 @@ def num_lookahead_slots(self) -> int: return self.num_speculative_tokens def use_eagle(self) -> bool: - return self.method in ("eagle", "eagle3", "deepseek_mtp") + return self.method in ("eagle", "eagle3", "deepseek_mtp", "ernie_mtp") def __repr__(self) -> str: method = self.method diff --git a/vllm/model_executor/models/ernie_mtp.py b/vllm/model_executor/models/ernie_mtp.py new file mode 100644 index 000000000000..90a1267b28f0 --- /dev/null +++ b/vllm/model_executor/models/ernie_mtp.py @@ -0,0 +1,287 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The Baidu team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only Ernie-MTP model.""" +from collections.abc import Iterable +from typing import Optional + +import torch +import torch.nn as nn +from transformers import PretrainedConfig + +from vllm.config import CacheConfig, ModelConfig, VllmConfig +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from .interfaces import SupportsPP +from .llama import LlamaDecoderLayer +from .utils import is_pp_missing_parameter, maybe_prefix + + +class ErnieMultiTokenPredictorLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + prefix: str, + model_config: ModelConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + + self.mtp_emb_norm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.mtp_hidden_norm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.mtp_linear_proj = nn.Linear(config.hidden_size * 2, + config.hidden_size, + bias=False) + self.mtp_block = LlamaDecoderLayer(config, cache_config, quant_config, + prefix) + + def forward( + self, + inputs_embeds: torch.Tensor, + positions: torch.Tensor, + previous_hidden_states: torch.Tensor, + spec_step_index: int = 0, + ) -> torch.Tensor: + assert inputs_embeds is not None + # masking inputs at position 0, as not needed by MTP + inputs_embeds[positions == 0] = 0 + + inputs_embeds = self.mtp_emb_norm(inputs_embeds) + previous_hidden_states = self.mtp_hidden_norm(previous_hidden_states) + + hidden_states = self.mtp_linear_proj( + torch.cat([inputs_embeds, previous_hidden_states], dim=-1)) + + hidden_states, residual = self.mtp_block(positions=positions, + hidden_states=hidden_states, + residual=None) + hidden_states = residual + hidden_states + + return hidden_states + + +class ErnieMultiTokenPredictor(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + self.mtp_start_layer_idx = config.num_hidden_layers + self.num_mtp_layers = config.num_nextn_predict_layers + # to map the exact layer index from weights + self.layers = torch.nn.ModuleDict({ + str(idx): + ErnieMultiTokenPredictorLayer( + config, + f"{prefix}.layers.{idx}", + model_config=vllm_config.model_config, + cache_config=vllm_config.cache_config, + ) + for idx in range(self.mtp_start_layer_idx, + self.mtp_start_layer_idx + self.num_mtp_layers) + }) + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.logits_processor = LogitsProcessor(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + previous_hidden_states: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, + spec_step_idx: int = 0, + ) -> torch.Tensor: + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + return self.layers[str(self.mtp_start_layer_idx + spec_step_idx)]( + inputs_embeds, + positions, + previous_hidden_states, + spec_step_idx, + ) + + def 
compute_logits( + self, + hidden_states: torch.Tensor, + lm_head: ParallelLMHead, + sampling_metadata: SamplingMetadata, + spec_step_idx: int = 0, + ) -> torch.Tensor: + self.layers[str(self.mtp_start_layer_idx + spec_step_idx)] + logits = self.logits_processor(lm_head, hidden_states, + sampling_metadata) + return logits + + +class ErnieMTP(nn.Module, SupportsPP): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + self.config = vllm_config.model_config.hf_config + self.model = ErnieMultiTokenPredictor(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "model")) + self.lm_head = ParallelLMHead(self.config.vocab_size, + self.config.hidden_size) + self.sampler = get_sampler() + + if self.config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + spec_step_idx: int = 0, + ) -> torch.Tensor: + assert spec_step_idx == 0, "ernie_mtp only support predict one token" + hidden_states = self.model(input_ids, positions, hidden_states, + inputs_embeds, spec_step_idx) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + spec_step_idx: int = 0, + ) -> Optional[torch.Tensor]: + return self.model.compute_logits(hidden_states, self.lm_head, + sampling_metadata, spec_step_idx) + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + + if self.config.tie_word_embeddings and name.endswith( + "lm_head.weight"): + continue + if "rotary_emb.inv_freq" in name: + continue + if "mtp" in name: + name = self._rewrite_spec_layer_name(self.config, name) + + for (param_name, weight_name, shard_id) in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + if "mtp" not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if (("mlp.experts." in name) and name not in params_dict): + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. 
+ if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + + # According to DeepSeek-V3 Technical Report, MTP modules + # shares embedding layer. We only load the first weights. + if "mtp_" not in name and ("embed_tokens" not in name + and "lm_head" not in name): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + def _rewrite_spec_layer_name(self, config: PretrainedConfig, + name: str) -> str: + """ + Rewrite the weight name to match the format of the original model. + """ + spec_layer_weight_names = [ + "embed_tokens", "mtp_emb_norm", "mtp_hidden_norm", + "mtp_linear_proj" + ] + layer_idx = config.num_hidden_layers + for weight_name in spec_layer_weight_names: + if weight_name in name: + name = name.replace( + f"model.{weight_name}.0.", + f"model.layers.{layer_idx}.{weight_name}.") + return name + name = name.replace("model.mtp_block.0.", + f"model.layers.{layer_idx}.mtp_block.") + return name diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index a94231b0f846..78ef270598b8 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -266,6 +266,7 @@ # "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "EagleDeepSeekMTPModel": ("deepseek_eagle", "EagleDeepseekV3ForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), + "ErnieMTPModel": ("ernie_mtp", "ErnieMTP"), "Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"), "MedusaModel": ("medusa", "Medusa"), # Temporarily disabled. diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index a8a160a0f995..8cd2ad12cfa3 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -194,7 +194,7 @@ def propose( hidden_states=self.hidden_states[:num_input_tokens], inputs_embeds=inputs_embeds, ) - if self.method == "deepseek_mtp": + if self.method in ("deepseek_mtp", "ernie_mtp"): last_hidden_states = ret_hidden_states else: last_hidden_states, hidden_states = ret_hidden_states diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 9dfea947568d..7a01e585ba6d 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -77,7 +77,8 @@ def __init__( "eagle", "deepseek_mtp", "glm4_moe_mtp", - "mimo_mtp")) \ + "mimo_mtp", + "ernie_mtp")) \ else {"return_hidden_states": True} ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner From b96ca949d75d50bdd3b6cd936059ff2b327f0656 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 20 Aug 2025 20:47:05 +0800 Subject: [PATCH 181/231] [Model] Improve olmo and olmo2 (#23228) Signed-off-by: Jee Jee Li Signed-off-by: Duncan Moss --- docs/models/supported_models.md | 4 ++-- vllm/model_executor/models/olmo.py | 22 +++++++++++++++++++--- vllm/model_executor/models/olmo2.py | 17 +++++++++++++++-- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 7908e4238710..7308d0010690 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -384,8 +384,8 @@ th { | `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. 
| | ✅︎ | ✅︎ | | `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | ✅︎ | | `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | | ✅︎ | ✅︎ | -| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | | ✅︎ | ✅︎ | +| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | ✅︎ | | `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | | ✅︎ | ✅︎ | | `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | ✅︎ | diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 1dc4df85c1bc..01639d398126 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -47,7 +47,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsPP +from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -91,6 +91,7 @@ def __init__( self.total_num_heads, bias=config.attention_bias, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) # Rotary embeddings. @@ -114,6 +115,7 @@ def __init__( self.hidden_size, bias=config.attention_bias, quant_config=quant_config, + prefix=f"{prefix}.o_proj", ) def forward( @@ -142,6 +144,7 @@ def __init__( self, config: OlmoConfig, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.config = config @@ -154,6 +157,7 @@ def __init__( [self.intermediate_size] * 2, bias=False, quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", ) # Activation function. @@ -165,6 +169,7 @@ def __init__( self.hidden_size, bias=False, quant_config=quant_config, + prefix=f"{prefix}.down_proj", ) def forward( @@ -197,7 +202,7 @@ def __init__(self, prefix=f"{prefix}.self_attn") # MLP block. - self.mlp = OlmoMLP(config, quant_config) + self.mlp = OlmoMLP(config, quant_config, prefix=f"{prefix}.mlp") # LayerNorm self.input_layernorm = nn.LayerNorm(config.hidden_size, @@ -326,10 +331,21 @@ def load_weights(self, weights: Iterable[tuple[str, return loaded_params -class OlmoForCausalLM(nn.Module, SupportsPP): +class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA): """ Extremely barebones HF model wrapper. 
""" + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 499e6d30ed6b..66a0f9115585 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -33,6 +33,7 @@ from transformers import Olmo2Config from vllm.attention import Attention +from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.distributed.communication_op import tensor_model_parallel_all_gather @@ -48,7 +49,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.interfaces import SupportsPP +from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP from vllm.model_executor.models.utils import ( AutoWeightsLoader, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -253,6 +254,7 @@ def forward( return hidden_states +@support_torch_compile class Olmo2Model(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -354,10 +356,21 @@ def load_weights(self, weights: Iterable[tuple[str, return loaded_params -class Olmo2ForCausalLM(nn.Module, SupportsPP): +class Olmo2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): """ Extremely barebones HF model wrapper. """ + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() From 29f58a0740d9f2fc1d630df650f80b81775d1ec4 Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Wed, 20 Aug 2025 21:34:49 +0800 Subject: [PATCH 182/231] [Fix] fix offline env use local mode path (#22526) Signed-off-by: rongfu.leng Signed-off-by: Duncan Moss --- .../offline_mode/test_offline_mode.py | 35 +++++++++++++++++++ vllm/engine/arg_utils.py | 10 +++++- vllm/transformers_utils/config.py | 23 ++++++++++-- 3 files changed, 65 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py index a606eeab5887..dd8d63ad319a 100644 --- a/tests/entrypoints/offline_mode/test_offline_mode.py +++ b/tests/entrypoints/offline_mode/test_offline_mode.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for HF_HUB_OFFLINE mode""" +import dataclasses import importlib import sys @@ -9,6 +10,7 @@ from vllm import LLM from vllm.distributed import cleanup_dist_env_and_memory +from vllm.engine.arg_utils import EngineArgs MODEL_CONFIGS = [ { @@ -108,3 +110,36 @@ def _re_import_modules(): # Error this test if reloading a module failed if reload_exception is not None: raise reload_exception + + +@pytest.mark.skip_global_cleanup +@pytest.mark.usefixtures("cache_models") +def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch): + # Set HF to offline mode and ensure we can still construct an LLM + with monkeypatch.context() as m: + try: + m.setenv("HF_HUB_OFFLINE", "1") + m.setenv("VLLM_NO_USAGE_STATS", "1") + + def 
disable_connect(*args, **kwargs): + raise RuntimeError("No http calls allowed") + + m.setattr( + urllib3.connection.HTTPConnection, + "connect", + disable_connect, + ) + m.setattr( + urllib3.connection.HTTPSConnection, + "connect", + disable_connect, + ) + # Need to re-import huggingface_hub + # and friends to setup offline mode + _re_import_modules() + engine_args = EngineArgs(model="facebook/opt-125m") + LLM(**dataclasses.asdict(engine_args)) + finally: + # Reset the environment after the test + # NB: Assuming tests are run in online mode + _re_import_modules() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 679905aed9ec..48d9cd08af03 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -15,6 +15,7 @@ Literal, Optional, Type, TypeVar, Union, cast, get_args, get_origin) +import huggingface_hub import regex as re import torch from pydantic import TypeAdapter, ValidationError @@ -39,7 +40,7 @@ from vllm.ray.lazy_utils import is_ray_initialized from vllm.reasoning import ReasoningParserManager from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 -from vllm.transformers_utils.config import is_interleaved +from vllm.transformers_utils.config import get_model_path, is_interleaved from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser, GiB_bytes, get_ip, is_in_ray_actor) @@ -457,6 +458,13 @@ def __post_init__(self): # Setup plugins from vllm.plugins import load_general_plugins load_general_plugins() + # when use hf offline,replace model id to local model path + if huggingface_hub.constants.HF_HUB_OFFLINE: + model_id = self.model + self.model = get_model_path(self.model, self.revision) + logger.info( + "HF_HUB_OFFLINE is True, replace model_id [%s] " \ + "to model_path [%s]",model_id, self.model) @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index d8c964fb2a4a..fe345bd8f0a2 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -14,7 +14,7 @@ from huggingface_hub import list_repo_files as hf_list_repo_files from huggingface_hub import try_to_load_from_cache from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError, - HFValidationError, LocalEntryNotFoundError, + LocalEntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError) from transformers import GenerationConfig, PretrainedConfig @@ -335,6 +335,7 @@ def maybe_override_with_speculators_target_model( gguf_model_repo = Path(model).parent else: gguf_model_repo = None + kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE config_dict, _ = PretrainedConfig.get_config_dict( model if gguf_model_repo is None else gguf_model_repo, revision=revision, @@ -400,6 +401,7 @@ def get_config( raise ValueError(error_message) from e if config_format == ConfigFormat.HF: + kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE config_dict, _ = PretrainedConfig.get_config_dict( model, revision=revision, @@ -532,7 +534,7 @@ def try_get_local_file(model: Union[str, Path], revision=revision) if isinstance(cached_filepath, str): return Path(cached_filepath) - except HFValidationError: + except ValueError: ... 
return None @@ -908,3 +910,20 @@ def _maybe_retrieve_max_pos_from_hf(model, revision, **kwargs) -> int: exc_info=e) return max_position_embeddings + + +def get_model_path(model: Union[str, Path], revision: Optional[str] = None): + if os.path.exists(model): + return model + assert huggingface_hub.constants.HF_HUB_OFFLINE + common_kwargs = { + "local_files_only": huggingface_hub.constants.HF_HUB_OFFLINE, + "revision": revision, + } + + if envs.VLLM_USE_MODELSCOPE: + from modelscope.hub.snapshot_download import snapshot_download + return snapshot_download(model_id=model, **common_kwargs) + + from huggingface_hub import snapshot_download + return snapshot_download(repo_id=model, **common_kwargs) From 7bc51c27f664c2ee056212237dacab816d7955c4 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 20 Aug 2025 22:19:30 +0800 Subject: [PATCH 183/231] [Bugfix] Ensure correctness of HCXVision processing (#23254) Signed-off-by: DarkLight1337 Signed-off-by: Duncan Moss --- .../multimodal/processing/test_common.py | 2 +- .../models/hyperclovax_vision.py | 118 ++++++++---------- 2 files changed, 56 insertions(+), 64 deletions(-) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index d5b1de834a61..02aecfad8281 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -102,7 +102,7 @@ def _test_processing_correctness( partial(random_video, rng, min_frames=2, - max_frames=8, + max_frames=16, min_wh=128, max_wh=256), "audio": diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index d3ddc47ea932..f8b30d8d98e5 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -53,6 +53,21 @@ VIDEO_TOKEN: str = "<|_unuse_missing_100270|>" +# Based on combine_frames_into_images in +# https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B/blob/main/processing_hyperclovax.py +def get_num_combined_frames( + num_frames: int, + max_grid_shape: tuple[int, int] = (3, 3), +) -> int: + max_num_grids = max_grid_shape[0] * max_grid_shape[1] + + # Calculate the number of canvases needed. 
+ num_canvases = num_frames // max_num_grids + leftover_frames = num_frames % max_num_grids + + return num_canvases + (leftover_frames > 0) + + class HCXVisionMultimodalPixelInputs(TypedDict): type: Literal["pixel_values"] pixel_values_images: list[torch.Tensor] @@ -172,23 +187,20 @@ def _call_hf_processor( def replace_multimodal_token( token_ids: torch.Tensor, target_token: int, - repeats: list, + repeats: list[int], ): - output = list() + output = list[int]() _repeats_idx = 0 for token_id in token_ids: if token_id == target_token: - output += [ - token_id.item(), - ] * repeats[_repeats_idx] + output += [token_id.item()] * repeats[_repeats_idx] _repeats_idx += 1 else: - output += [ - token_id.item(), - ] + output += [token_id.item()] + return torch.tensor(output, device=token_ids.device) - for video_idx, video_arr in enumerate(mm_data.get("videos", list())): + for video_idx, video_arr in enumerate(mm_data.get("videos", [])): if video_arr.dtype == np.uint8: continue mm_data["videos"][video_idx] = video_arr.astype(np.uint8) @@ -205,88 +217,68 @@ def replace_multimodal_token( if len(mm_data) > 0: # batchify input as a single item images = mm_data.get("images", None) - num_images = 0 - if images is not None: - num_images = len(images) - images = [ - images, - ] # batchify - - videos = mm_data.get("videos", - None) # list of video in single conversation - num_videos = 0 - if videos is not None: - num_videos = len(videos) - videos = [ - videos, - ] # batchify + batched_images = None if images is None else [images] + + # list of video in single conversation + videos = mm_data.get("videos", None) + batched_videos = None if videos is None else [videos] _processed_outputs = self.info.ctx.call_hf_processor( hf_processor=self.info.get_hf_processor(**mm_kwargs), data=dict( text=None, - images=images, - videos=videos, + images=batched_images, + videos=batched_videos, ), ) # mm-only for k, v in _processed_outputs.items(): - if len(v) < 1: - continue - elif k.endswith("_images"): - # list of list of 4D tensor -> list of 4D tensor + if isinstance(v, list) and len(v) > 0: + assert len(v) == 1 _processed_outputs[k] = v[0] - elif k.endswith("_videos"): - # list of list of 4D tensor -> list of 4D tensor - v = v[0] - if k == "pixel_values_videos": - v = torch.cat(v, dim=0) - _c, _w, _h = v.shape[-3:] - v = v.reshape(num_videos, -1, _c, _w, _h) - v = list(torch.unbind(v, dim=0)) - _processed_outputs[k] = v - - if num_images > 0: + + if images: tokenizer = self.info.get_tokenizer() + image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN) processed_outputs["input_ids"] = torch.stack([ replace_multimodal_token( token_ids=_input_ids, - target_token=tokenizer.convert_tokens_to_ids( - IMAGE_TOKEN), + target_token=image_token_id, repeats=_processed_outputs[ "vision_query_lengths_images"], ) for _input_ids in processed_outputs["input_ids"] ], dim=0) - if num_videos > 0: + if videos: + _num_per_videos = [ + get_num_combined_frames(len(video)) for video in videos + ] + _processed_outputs["pixel_values_videos"] = [ + _processed_outputs["pixel_values_videos"] + [sum(_num_per_videos[:_i]):sum(_num_per_videos[:_i + 1])] + for _i in range(len(videos)) + ] + _processed_outputs["vision_query_lengths_videos"] = [ + _processed_outputs["vision_query_lengths_videos"] + [sum(_num_per_videos[:_i]):sum(_num_per_videos[:_i + 1])] + for _i in range(len(videos)) + ] + tokenizer = self.info.get_tokenizer() + video_token_id = tokenizer.convert_tokens_to_ids(VIDEO_TOKEN) processed_outputs["input_ids"] = torch.stack([ 
replace_multimodal_token( token_ids=_input_ids, - target_token=tokenizer.convert_tokens_to_ids( - VIDEO_TOKEN), - repeats=_processed_outputs[ - "vision_query_lengths_videos"], + target_token=video_token_id, + repeats=[ + sum(lens) for lens in + _processed_outputs["vision_query_lengths_videos"] + ], ) for _input_ids in processed_outputs["input_ids"] ], dim=0) - _ratios = [ - len(_pixel_values) for _pixel_values in - _processed_outputs["pixel_values_videos"] - ] - _num_per_videos = [ - int(_e / sum(_ratios) * - len(_processed_outputs["vision_query_lengths_videos"])) - for _e in _ratios - ] - _processed_outputs["vision_query_lengths_videos"] = [ - _processed_outputs["vision_query_lengths_videos"] - [sum(_num_per_videos[:_i]):sum(_num_per_videos[:_i + 1])] - for _i in range(0, num_videos) - ] - processed_outputs.update(_processed_outputs) return processed_outputs From b15629b16c370cce29bc0e938d2971bbbde76243 Mon Sep 17 00:00:00 2001 From: shixianc <49539556+shixianc@users.noreply.github.com> Date: Wed, 20 Aug 2025 07:35:26 -0700 Subject: [PATCH 184/231] [Kernel] CUTLASS MoE FP8: Integrate cuda moe permute/unpermute (#23045) Signed-off-by: Shixian Cui Signed-off-by: Duncan Moss --- .../kernels/benchmark_grouped_gemm_cutlass.py | 35 +++- csrc/moe/moe_permute_unpermute_op.cu | 33 ++-- csrc/ops.h | 5 + .../cutlass_w8a8/moe/get_group_starts.cuh | 6 +- .../quantization/cutlass_w8a8/moe/moe_data.cu | 65 +++++-- .../cutlass_w8a8/scaled_mm_entry.cu | 24 +++ csrc/torch_bindings.cpp | 13 ++ tests/kernels/moe/test_cutlass_moe.py | 18 +- .../kernels/moe/test_moe_permute_unpermute.py | 6 +- tests/kernels/moe/test_pplx_cutlass_moe.py | 22 ++- .../quantization/test_cutlass_scaled_mm.py | 2 +- vllm/_custom_ops.py | 22 +++ .../layers/fused_moe/cutlass_moe.py | 179 +++++++++++------- .../layers/fused_moe/moe_permute_unpermute.py | 29 ++- .../compressed_tensors_moe.py | 31 +++ 15 files changed, 369 insertions(+), 121 deletions(-) diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index 1d4e730f99ae..a6b42406b5cb 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -80,6 +80,11 @@ def bench_run( a, score, topk, renormalize=False ) + ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) + ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64) + c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64) + c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) + def run_triton_moe( a: torch.Tensor, w1: torch.Tensor, @@ -111,6 +116,10 @@ def run_cutlass_moe( w2: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, per_act_token: bool, @@ -125,6 +134,10 @@ def run_cutlass_moe( topk_ids, w1_scale, w2_scale, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, per_act_token, a1_scale=None, ) @@ -136,6 +149,10 @@ def run_cutlass_from_graph( w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, ): @@ -150,6 +167,10 @@ def run_cutlass_from_graph( topk_ids, w1_scale, w2_scale, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, 
per_act_token, a1_scale=None, ) @@ -194,6 +215,10 @@ def replay_graph(graph, num_repeats): w2_q, w1_scale, w2_scale, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, topk_weights, topk_ids, ) @@ -231,6 +256,10 @@ def replay_graph(graph, num_repeats): "w1_scale": w1_scale, "w2_scale": w2_scale, "per_act_token": per_act_token, + "ab_strides1": ab_strides1, + "ab_strides2": ab_strides2, + "c_strides1": c_strides1, + "c_strides2": c_strides2, # cuda graph params "cutlass_graph": cutlass_graph, "triton_graph": triton_graph, @@ -289,6 +318,10 @@ def replay_graph(graph, num_repeats): w2_q, w1_scale, w2_scale, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, topk_weights, topk_ids, per_act_token, @@ -297,7 +330,7 @@ def replay_graph(graph, num_repeats): results.append( benchmark.Timer( - stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501 + stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, diff --git a/csrc/moe/moe_permute_unpermute_op.cu b/csrc/moe/moe_permute_unpermute_op.cu index 2922352a3f7c..ca0c873f49d9 100644 --- a/csrc/moe/moe_permute_unpermute_op.cu +++ b/csrc/moe/moe_permute_unpermute_op.cu @@ -45,8 +45,6 @@ void moe_permute( auto copy_topk_ids = topk_ids.clone(); // copy topk_ids for preprocess auto permuted_experts_id = torch::empty_like(topk_ids); auto sorted_row_idx = torch::empty_like(inv_permuted_idx); - auto align_expert_first_token_offset = - torch::zeros_like(expert_first_token_offset); CubKeyValueSorter sorter{}; int64_t* valid_num_ptr = nullptr; @@ -85,12 +83,14 @@ void moe_permute( }); // get m_indices and update expert_first_token_offset with align block - getMIndices(get_ptr(expert_first_token_offset), - get_ptr(align_expert_first_token_offset), - get_ptr(m_indices), n_local_expert, align_block_size_value, - stream); + // this is only required for DeepGemm and not required for CUTLASS group gemm if (align_block_size.has_value()) { - // update align_expert_first_token_offset + auto align_expert_first_token_offset = + torch::zeros_like(expert_first_token_offset); + getMIndices(get_ptr(expert_first_token_offset), + get_ptr(align_expert_first_token_offset), + get_ptr(m_indices), n_local_expert, align_block_size_value, + stream); expert_first_token_offset.copy_(align_expert_first_token_offset); } } @@ -195,19 +195,14 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights, torch::Tensor& expert_first_token_offset, torch::Tensor& src_row_id2dst_row_id_map, torch::Tensor& m_indices) { - TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0"); + TORCH_CHECK(false, "moe_permute is not supported on CUDA < 12.0"); } -void moe_unpermute(const torch::Tensor& input, - const torch::Tensor& topk_weights, torch::Tensor& topk_ids, - const torch::Tensor& token_expert_indices, - const std::optional& expert_map, - int64_t n_expert, int64_t n_local_expert, int64_t topk, - const std::optional& align_block_size, - torch::Tensor& permuted_input, - torch::Tensor& expert_first_token_offset, - torch::Tensor& src_row_id2dst_row_id_map, - torch::Tensor& m_indices) { +void moe_unpermute( + const torch::Tensor& permuted_hidden_states, + const torch::Tensor& topk_weights, const torch::Tensor& inv_permuted_idx, + const std::optional& expert_first_token_offset, int64_t topk, + torch::Tensor& 
hidden_states) { TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0"); } @@ -224,4 +219,4 @@ bool moe_permute_unpermute_supported() { TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { m.impl("moe_permute", &moe_permute); m.impl("moe_unpermute", &moe_unpermute); -} +} \ No newline at end of file diff --git a/csrc/ops.h b/csrc/ops.h index 64bcec6ca152..86fe848e2fd5 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -229,6 +229,11 @@ void get_cutlass_moe_mm_data( const int64_t num_experts, const int64_t n, const int64_t k, const std::optional& blockscale_offsets); +void get_cutlass_moe_mm_problem_sizes( + const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, + const int64_t k, const std::optional& blockscale_offsets); + void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, diff --git a/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh b/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh index 6c6e89790847..15bb2c300543 100644 --- a/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh +++ b/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh @@ -10,7 +10,7 @@ template __global__ void get_group_gemm_starts( - int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets, + int64_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets, ElementC** out_offsets, ElementAccumulator** a_scales_offsets, ElementAccumulator** b_scales_offsets, ElementAB* a_base_as_int, ElementAB* b_base_as_int, ElementC* out_base_as_int, @@ -34,7 +34,7 @@ __global__ void get_group_gemm_starts( else if (out_tensors.dtype() == TENSOR_C_TYPE) { \ get_group_gemm_starts \ <<<1, num_experts, 0, stream>>>( \ - static_cast(expert_offsets.data_ptr()), \ + static_cast(expert_offsets.data_ptr()), \ static_cast(a_ptrs.data_ptr()), \ static_cast(b_ptrs.data_ptr()), \ static_cast(out_ptrs.data_ptr()), \ @@ -61,6 +61,8 @@ void run_get_group_gemm_starts( TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn); TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); + // expect int64_t to avoid overflow during offset calculations + TORCH_CHECK(expert_offsets.dtype() == torch::kInt64); int num_experts = static_cast(expert_offsets.size(0)); bool per_act_token = a_scales.numel() != 1; diff --git a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu index 100f48508444..49cafcc32adc 100644 --- a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu +++ b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu @@ -104,6 +104,53 @@ __global__ void compute_arg_sorts(const int32_t* __restrict__ topk_ids, } } +namespace { +inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids, + torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, + torch::Tensor& atomic_buffer, + int64_t num_experts, int64_t n, + int64_t k, cudaStream_t stream, + const bool swap_ab) { + int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel()); + + const int32_t* topk_ptr = static_cast(topk_ids.data_ptr()); + int32_t* ps1_ptr = static_cast(problem_sizes1.data_ptr()); + int32_t* ps2_ptr = static_cast(problem_sizes2.data_ptr()); + int32_t* atomic_ptr = static_cast(atomic_buffer.data_ptr()); + + if (swap_ab) { + compute_problem_sizes<<>>( + topk_ptr, ps1_ptr, ps2_ptr, atomic_ptr, + static_cast(topk_ids.numel()), static_cast(n), + static_cast(k)); + } else { + 
compute_problem_sizes<<>>( + topk_ptr, ps1_ptr, ps2_ptr, atomic_ptr, + static_cast(topk_ids.numel()), static_cast(n), + static_cast(k)); + } +} +} // namespace + +void get_cutlass_moe_mm_problem_sizes_caller( + const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, + const int64_t k, const std::optional& blockscale_offsets) { + auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index()); + auto options_int32 = + torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device()); + torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32); + + // Swap-AB should be disabled for FP4 path + bool may_swap_ab = (!blockscale_offsets.has_value()) && + (topk_ids.numel() <= SWAP_AB_THRESHOLD); + + launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2, + atomic_buffer, num_experts, n, k, stream, + may_swap_ab); +} + void get_cutlass_moe_mm_data_caller( const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, @@ -121,21 +168,9 @@ void get_cutlass_moe_mm_data_caller( bool may_swap_ab = (!blockscale_offsets.has_value()) && (topk_ids.numel() <= SWAP_AB_THRESHOLD); - if (may_swap_ab) { - compute_problem_sizes<<>>( - static_cast(topk_ids.data_ptr()), - static_cast(problem_sizes1.data_ptr()), - static_cast(problem_sizes2.data_ptr()), - static_cast(atomic_buffer.data_ptr()), topk_ids.numel(), n, - k); - } else { - compute_problem_sizes<<>>( - static_cast(topk_ids.data_ptr()), - static_cast(problem_sizes1.data_ptr()), - static_cast(problem_sizes2.data_ptr()), - static_cast(atomic_buffer.data_ptr()), topk_ids.numel(), n, - k); - } + launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2, + atomic_buffer, num_experts, n, k, stream, + may_swap_ab); if (blockscale_offsets.has_value()) { // fp4 path diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index 106bacb4883c..84843ee6e094 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -76,6 +76,11 @@ void get_cutlass_moe_mm_data_caller( const int64_t num_experts, const int64_t n, const int64_t k, const std::optional& blockscale_offsets); +void get_cutlass_moe_mm_problem_sizes_caller( + const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, + const int64_t k, const std::optional& blockscale_offsets); + void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, @@ -293,6 +298,25 @@ void get_cutlass_moe_mm_data( version_num, ". 
Required capability: 90 or 100"); } +void get_cutlass_moe_mm_problem_sizes( + const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, + const int64_t k, const std::optional& blockscale_offsets) { + int32_t version_num = get_sm_version_num(); +#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ + (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) + get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1, + problem_sizes2, num_experts, n, k, + blockscale_offsets); + return; +#endif + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm " + "kernel for CUDA device capability: ", + version_num, ". Required capability: 90 or 100"); +} + void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 7079671c2eb1..3a0ff6eaa790 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -440,6 +440,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { {stride_tag}); ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data); + // A function that computes problem sizes for each expert's multiplication + // used by the two mms called from fused MoE operation. It takes topk_ids as + // an input, and computes problem_sizes1 and problem_sizes2 only. + ops.def( + "get_cutlass_moe_mm_problem_sizes(Tensor topk_ids, " + " Tensor! problem_sizes1, " + " Tensor! problem_sizes2, " + " int num_experts, int n, int k, " + " Tensor? blockscale_offsets) -> ()", + {stride_tag}); + ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA, + &get_cutlass_moe_mm_problem_sizes); + // A function that computes data required to run fused MoE with w8a8 grouped // GEMM and PPLX. 
It takes expert_num_tokens and non_zero_expert_idxs // as an input, and computes expert_offsets (token start indices of each diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index 81fb3ec1de18..c84f66383b90 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -207,6 +207,10 @@ def run_8_bit(moe_tensors: MOETensors8Bit, 'topk_ids': topk_ids, 'w1_scale': moe_tensors.w1_scale, 'w2_scale': moe_tensors.w2_scale, + 'ab_strides1': moe_tensors.ab_strides1, + 'ab_strides2': moe_tensors.ab_strides2, + 'c_strides1': moe_tensors.c_strides1, + 'c_strides2': moe_tensors.c_strides2, 'per_act_token': per_act_token, 'a1_scale': None #moe_tensors.a_scale } @@ -424,8 +428,8 @@ def test_run_cutlass_moe_fp8( topk_ids[0][1] = 1 workspace13_shape = (m * topk, max(2 * n, k)) - workspace2_shape = (m * topk, n) - output_shape = (m * topk, k) + workspace2_shape = (m * topk, max(n, k)) + output_shape = (m, k) workspace13 = torch.empty(prod(workspace13_shape), device="cuda", @@ -440,6 +444,11 @@ def test_run_cutlass_moe_fp8( expert_map[start:end] = list(range(num_local_experts)) expert_map = torch.tensor(expert_map, dtype=torch.int32, device="cuda") + ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64) + c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64) + c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + activation = lambda o, i: torch.ops._C.silu_and_mul(o, i) a1q, a1q_scale = moe_kernel_quantize_input(mt.a, mt.a_scale, torch.float8_e4m3fn, @@ -448,8 +457,9 @@ def test_run_cutlass_moe_fp8( func = lambda output: run_cutlass_moe_fp8( output, a1q, mt.w1_q, mt.w2_q, topk_ids, activation, global_num_experts, expert_map, mt.w1_scale, mt.w2_scale, - a1q_scale, None, workspace13, workspace2, None, mt.a.dtype, - per_act_token, per_out_channel, False) + a1q_scale, None, ab_strides1, ab_strides2, c_strides1, c_strides2, + workspace13, workspace2, None, mt.a.dtype, per_act_token, + per_out_channel, False, topk_weights) workspace13.random_() output_random_workspace = torch.empty(output_shape, diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py index 6ca01f9271bb..d71664d94b9c 100644 --- a/tests/kernels/moe/test_moe_permute_unpermute.py +++ b/tests/kernels/moe/test_moe_permute_unpermute.py @@ -238,7 +238,11 @@ def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int, atol=0, rtol=0) # check mindice - torch.testing.assert_close(gold_m_indices, m_indices, atol=0, rtol=0) + # current kernel usage assumes deepgemm requires align_block_size + # when it's not provided then we don't compute m_indices (for cutlass) + if align_block_size is not None: + torch.testing.assert_close(gold_m_indices, m_indices, atol=0, rtol=0) + # check permuted_hidden_states, only valid token torch.testing.assert_close(gold_permuted_hidden_states[valid_row_idx], permuted_hidden_states[valid_row_idx], diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index f98937ee6c52..98908f271470 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -76,6 +76,7 @@ def pplx_cutlass_moe( assert torch.cuda.current_device() == pgi.local_rank num_tokens, hidden_dim = a.shape + intermediate_dim = w2.shape[2] num_experts = w1.shape[0] block_size = hidden_dim # TODO support more cases device = 
pgi.device @@ -124,8 +125,27 @@ def pplx_cutlass_moe( num_local_experts=num_local_experts, num_dispatchers=num_dispatchers) + ab_strides1 = torch.full((num_local_experts, ), + hidden_dim, + device="cuda", + dtype=torch.int64) + ab_strides2 = torch.full((num_local_experts, ), + intermediate_dim, + device="cuda", + dtype=torch.int64) + c_strides1 = torch.full((num_local_experts, ), + 2 * intermediate_dim, + device="cuda", + dtype=torch.int64) + c_strides2 = torch.full((num_local_experts, ), + hidden_dim, + device="cuda", + dtype=torch.int64) + experts = CutlassBatchedExpertsFp8(num_local_experts, num_dispatchers, - out_dtype, per_act_token, per_out_ch) + out_dtype, per_act_token, per_out_ch, + ab_strides1, ab_strides2, c_strides1, + c_strides2) fused_cutlass_experts = FusedMoEModularKernel( prepare_finalize, diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py index 8730eeaaa761..a15decdf6f82 100644 --- a/tests/kernels/quantization/test_cutlass_scaled_mm.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -535,7 +535,7 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool, expert_offsets = torch.zeros((num_experts + 1), device=device, - dtype=torch.int32) + dtype=torch.int64) problem_sizes = torch.zeros((num_experts, 3), device=device, diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 0d556053f898..39da08847b2e 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -844,6 +844,28 @@ def get_cutlass_moe_mm_data(topk_ids: torch.Tensor, blockscale_offsets) +def get_cutlass_moe_mm_problem_sizes( + topk_ids: torch.Tensor, + problem_sizes1: torch.Tensor, + problem_sizes2: torch.Tensor, + num_experts: int, + n: int, + k: int, + blockscale_offsets: Optional[torch.Tensor] = None): + """ + Compute only the per-expert problem sizes needed by the two grouped matrix + multiplications used in CUTLASS-based fused MoE. + + The function takes in topk_ids (token→expert mapping) and computes: + - problem_sizes1, problem_sizes2: M×N×K sizes of each expert's + multiplication for the two grouped MMs + used in the fused MoE operation. + """ + return torch.ops._C.get_cutlass_moe_mm_problem_sizes( + topk_ids, problem_sizes1, problem_sizes2, num_experts, n, k, + blockscale_offsets) + + def shuffle_rows(input_tensor: torch.Tensor, dst2src_map: torch.Tensor): """ Shuffle and expand the input tensor according to the dst2src_map and store the result in output_tensor. 
diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 0a02b558d09e..95d23ec0346c 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -9,12 +9,13 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig +from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( + moe_permute, moe_unpermute) from vllm.model_executor.layers.fused_moe.prepare_finalize import ( MoEPrepareAndFinalizeNoEP) from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate, TopKWeightAndReduceNoOP) -from vllm.model_executor.layers.fused_moe.utils import (_fp8_perm, - _fp8_quantize, +from vllm.model_executor.layers.fused_moe.utils import (_fp8_quantize, _resize_cache) from vllm.scalar_type import scalar_types @@ -34,6 +35,10 @@ def run_cutlass_moe_fp8( w2_scale: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], a2_scale: Optional[torch.Tensor], + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, workspace13: torch.Tensor, workspace2: torch.Tensor, expert_num_tokens: Optional[torch.Tensor], @@ -41,6 +46,7 @@ def run_cutlass_moe_fp8( per_act_token: bool, per_out_ch: bool, use_batched_format: bool, + topk_weights: Optional[torch.Tensor], ): a1q = hidden_states @@ -99,6 +105,22 @@ def run_cutlass_moe_fp8( topk = local_topk_ids.size(1) local_E = w1.size(0) + if use_batched_format: + mm1_out = _resize_cache(workspace13, (local_E * padded_M, N * 2)) + act_out = _resize_cache(workspace2, (local_E * padded_M, N)) + quant_out = _resize_cache(workspace13.view(dtype=torch.float8_e4m3fn), + (local_E * padded_M, N)) + mm2_out = _resize_cache(workspace2, (local_E * padded_M, K)) + else: + a1q_perm = _resize_cache(workspace2.view(dtype=torch.float8_e4m3fn), + (M * topk, K)) + mm1_out = _resize_cache(workspace13, (M * topk, N * 2)) + act_out = _resize_cache(workspace2, (M * topk, N)) + # original workspace are based on input hidden_states dtype (bf16) + quant_out = _resize_cache(workspace13.view(dtype=torch.float8_e4m3fn), + (M * topk, N)) + mm2_out = _resize_cache(workspace2, (M * topk, K)) + if use_batched_format: assert expert_num_tokens is not None @@ -120,11 +142,10 @@ def run_cutlass_moe_fp8( w2_scale = w2_scale.reshape(w2_scale.size(0), -1) a1q = a1q.reshape(-1, a1q.size(2)) a1q_scale = a1q_scale.reshape(-1, a1q_scale.size(2)).contiguous() - + # c3x get_group_gemm_starts expects int64 to avoid overflow + # during offset calculations + expert_offsets = expert_offsets.to(torch.int64) else: - expert_offsets = torch.empty((global_num_experts + 1), - dtype=torch.int32, - device=device) problem_sizes1 = torch.empty((global_num_experts, 3), dtype=torch.int32, device=device) @@ -132,84 +153,57 @@ def run_cutlass_moe_fp8( dtype=torch.int32, device=device) - # With expert_map each Rank processes only a subset of experts. As - # a result not all of a_map and c2 tensors are filled. We fill it - # zeros for correctness. 
- if expert_map is not None: - a_map = torch.zeros((local_topk_ids.numel()), - dtype=torch.int32, - device=device) - else: - a_map = torch.empty((local_topk_ids.numel()), - dtype=torch.int32, - device=device) - - c_map = torch.empty((local_topk_ids.numel()), - dtype=torch.int32, - device=device) - - ops.get_cutlass_moe_mm_data(local_topk_ids, expert_offsets, - problem_sizes1, problem_sizes2, a_map, - c_map, global_num_experts, N, K) - - a1q = _fp8_perm(a1q, a_map) - a1q_scale = a1q_scale[a_map] if per_act_token else a1q_scale + num_expert = global_num_experts if expert_map is None \ + else expert_map.size(0) + # permuted a1q reuses workspace2 + a1q, a1q_scale, expert_offsets, inv_perm, _ = moe_permute( + a1q, + a1q_scale, + topk_ids, + num_expert, + local_E, + expert_map, + permuted_hidden_states=a1q_perm) expert_offsets = expert_offsets[:-1] - ab_strides1 = torch.full((w1.size(0), ), - K, - device=device, - dtype=torch.int64) - c_strides1 = torch.full((w1.size(0), ), - 2 * N, - device=device, - dtype=torch.int64) - ab_strides2 = torch.full((w1.size(0), ), - N, - device=device, - dtype=torch.int64) - c_strides2 = torch.full((w1.size(0), ), - K, - device=device, - dtype=torch.int64) - - if use_batched_format: - c1 = _resize_cache(workspace13, (local_E * padded_M, N * 2)) - c2 = _resize_cache(workspace2, (local_E * padded_M, N)) - c3 = _resize_cache(workspace13, (local_E * padded_M, K)) - else: - c1 = _resize_cache(workspace13, (M * topk, N * 2)) - c2 = _resize_cache(workspace2, (M * topk, N)) - c3 = _resize_cache(workspace13, (M * topk, K)) + ops.get_cutlass_moe_mm_problem_sizes(local_topk_ids, problem_sizes1, + problem_sizes2, + global_num_experts, N, K) if not per_act_token and (expert_map is not None or use_batched_format): # this is necessary to avoid imprecise scale calculation caused by # random data in the unused workspace. The workspace is unused when # this rank handles only partial tokens, or when it is batched . - c1.fill_(0) + mm1_out.fill_(0) - ops.cutlass_moe_mm(c1, a1q, w1, a1q_scale, w1_scale, expert_offsets, + ops.cutlass_moe_mm(mm1_out, a1q, w1, a1q_scale, w1_scale, expert_offsets, problem_sizes1, ab_strides1, ab_strides1, c_strides1, per_act_token, per_out_ch) - activation_callable(c2, c1) + activation_callable(act_out, mm1_out) a2q, a2q_scale = ops.scaled_fp8_quant( - c2, a2_scale, use_per_token_if_dynamic=per_act_token) + act_out, + a2_scale, + use_per_token_if_dynamic=per_act_token, + output=quant_out) if expert_map is not None: - c3.fill_(0) + mm2_out.fill_(0) - ops.cutlass_moe_mm(c3, a2q, w2, a2q_scale, w2_scale, expert_offsets, + ops.cutlass_moe_mm(mm2_out, a2q, w2, a2q_scale, w2_scale, expert_offsets, problem_sizes2, ab_strides2, ab_strides2, c_strides2, per_act_token, per_out_ch) if use_batched_format: - output.copy_(c3.reshape(local_E, padded_M, K), non_blocking=True) + output.copy_(mm2_out.reshape(local_E, padded_M, K), non_blocking=True) else: - # We can't do this inplace because output may point to the same tensor - # as c3. - output.copy_(c3[c_map].view(M * topk, K), non_blocking=True) + # for non-chunking mode the output is resized from workspace13 + # so we need to make sure mm2_out uses workspace2. 
+ moe_unpermute(out=output, + permuted_hidden_states=mm2_out, + topk_weights=topk_weights, + inv_permuted_idx=inv_perm) class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): @@ -219,6 +213,10 @@ def __init__( out_dtype: Optional[torch.dtype], per_act_token_quant: bool, per_out_ch_quant: bool, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, block_shape: Optional[list[int]] = None, ): super().__init__( @@ -229,6 +227,10 @@ def __init__( block_shape=block_shape, )) self.out_dtype = out_dtype + self.ab_strides1 = ab_strides1 + self.ab_strides2 = ab_strides2 + self.c_strides1 = c_strides1 + self.c_strides2 = c_strides2 def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: # Let PrepareAndFinalize::finalize() decide the impl. @@ -272,10 +274,11 @@ def apply( run_cutlass_moe_fp8( output, hidden_states, w1, w2, topk_ids, activation_callable, global_num_experts, expert_map, w1_scale, w2_scale, a1q_scale, - a2_scale, workspace13, workspace2, expert_num_tokens, + a2_scale, self.ab_strides1, self.ab_strides2, self.c_strides1, + self.c_strides2, workspace13, workspace2, expert_num_tokens, self.out_dtype if self.out_dtype is not None else in_dtype, self.per_act_token_quant, self.per_out_ch_quant, - use_batched_format) + use_batched_format, topk_weights) class CutlassExpertsFp8(CutlassExpertsFp8Base): @@ -285,12 +288,20 @@ def __init__( out_dtype: Optional[torch.dtype], per_act_token_quant: bool, per_out_ch_quant: bool, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, block_shape: Optional[list[int]] = None, ): super().__init__( out_dtype, per_act_token_quant, per_out_ch_quant, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, block_shape, ) @@ -307,6 +318,10 @@ def supports_chunking(self) -> bool: def supports_expert_map(self) -> bool: return True + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + # topk weights and reduction are fused in moe_unpermute cuda kernel + return TopKWeightAndReduceNoOP() + def workspace_shapes( self, a: torch.Tensor, @@ -320,8 +335,8 @@ def workspace_shapes( expert_tokens_meta: Optional[mk.ExpertTokensMetadata], ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: workspace1 = (M * topk, max(N, K)) - workspace2 = (M * topk, N // 2) - output = (M * topk, K) + workspace2 = (M * topk, max(N // 2, K)) + output = (M, K) return (workspace1, workspace2, output, self.out_dtype if self.out_dtype is not None else a.dtype) @@ -335,12 +350,20 @@ def __init__( out_dtype: Optional[torch.dtype], per_act_token_quant: bool, per_out_ch_quant: bool, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, block_shape: Optional[list[int]] = None, ): super().__init__( out_dtype, per_act_token_quant, per_out_ch_quant, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, block_shape, ) assert max_experts_per_worker > 0 @@ -378,7 +401,8 @@ def workspace_shapes( assert num_dp is not None workspace1 = (self.max_experts_per_worker, padded_M * num_dp, max(N, K)) - workspace2 = (self.max_experts_per_worker, padded_M * num_dp, (N // 2)) + workspace2 = (self.max_experts_per_worker, padded_M * num_dp, + max(N // 2, K)) output = (self.max_experts_per_worker, padded_M, K) return (workspace1, workspace2, output, self.out_dtype if self.out_dtype is not None else a.dtype) @@ -392,6 +416,10 @@ def cutlass_moe_fp8( topk_ids: torch.Tensor, 
w1_scale: torch.Tensor, w2_scale: torch.Tensor, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, per_act_token: Optional[bool] = None, activation: str = "silu", a1_scale: Optional[torch.Tensor] = None, @@ -419,6 +447,17 @@ def cutlass_moe_fp8( Shape: [num_experts] or [num_experts, 2N] - w2_scale (torch.Tensor): The fp32 scale to dequantize w2_q. Shape: [num_experts] or [num_experts, K] + - ab_strides1 (torch.Tensor): The input/weight strides for the first gemm. + Shape: [num_experts] + - ab_strides2 (torch.Tensor): The input/weight strides for the second gemm. + Shape: [num_experts] + - c_strides1 (torch.Tensor): The output strides for the first gemm. + Shape: [num_experts] + - c_strides2 (torch.Tensor): The output strides for the second gemm. + Shape: [num_experts] + - per_act_token (Optional[bool]): Whether the scale is per-token or + per-tensor. + - activation (str): The activation function to use. - a1_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize a. Shape: scalar or [M] - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to @@ -450,6 +489,10 @@ def cutlass_moe_fp8( out_dtype=a.dtype, per_act_token_quant=per_act_token, per_out_ch_quant=per_out_ch, + ab_strides1=ab_strides1, + ab_strides2=ab_strides2, + c_strides1=c_strides1, + c_strides2=c_strides2, ), ) diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py index d9059f50b445..16a155e71847 100644 --- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py @@ -82,7 +82,8 @@ def moe_permute( n_local_expert: int = -1, expert_map: Optional[torch.Tensor] = None, align_block_size: Optional[int] = None, - fill_invalid_expert: int = -1 + fill_invalid_expert: int = -1, + permuted_hidden_states: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor, torch.Tensor]: """ @@ -95,14 +96,17 @@ def moe_permute( - n_expert (int): The number of expert. - n_local_expert (int): The number of expert in current EP rank. - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices - from the global expert space to the local expert space of the expert + from the global expert space to the local expert space of the expert parallel shard. - align_block_size (Optional[int]): align group gemm block size for deepgemm - fill_invalid_expert(int): fill expert id in m_indices for invalid expert to workaround DeepGemm unsupported -1 in m_indices + - permuted_hidden_states (Optional[torch.Tensor]): Optional output tensor. + If None, the output tensor will be created in this function. Returns: - permuted_hidden_states (torch.Tensor): permuted activation. - - a1q_scale (Optional[torch.Tensor]): quant scale for hidden_states + - a1q_scale (Optional[torch.Tensor]): permuted quant scale for hidden_states + if original scale not per-tensor scaling - expert_first_token_offset (torch.Tensor): offset of the first token of each expert for standard grouped gemm. if enable 'align_block_size' expert_first_token_offset will align up to 'align_block_size'. 
@@ -122,11 +126,16 @@ def moe_permute( 1) // align_block_size * align_block_size if n_local_expert == -1: n_local_expert = n_expert - permuted_hidden_states = torch.empty( - (permuted_row_size, n_hidden), - dtype=hidden_states.dtype, - device=hidden_states.device, - ) + if permuted_hidden_states is None: + permuted_hidden_states = torch.empty( + (permuted_row_size, n_hidden), + dtype=hidden_states.dtype, + device=hidden_states.device, + ) + assert permuted_hidden_states.size() == (permuted_row_size, n_hidden), ( + f"Expected permuted hidden states to be {(permuted_row_size, n_hidden)}" + f" but got {permuted_hidden_states.size()}") + token_expert_indices = torch.arange(0, n_token * topk, dtype=torch.int32, @@ -153,7 +162,8 @@ def moe_permute( align_block_size, permuted_hidden_states, expert_first_token_offset, inv_permuted_idx, permuted_idx, m_indices) - if a1q_scale is not None: + + if a1q_scale is not None and a1q_scale.dim() > 1: a1q_scale = a1q_scale[permuted_idx.clamp(max=n_token * topk - 1) // topk] return (permuted_hidden_states, a1q_scale, expert_first_token_offset, @@ -185,6 +195,7 @@ def moe_unpermute( n_hidden = permuted_hidden_states.size(-1) assert (n_hidden * permuted_hidden_states.element_size() ) % 16 == 0, "unpermue kernel need hidden dim align to 16B" + torch.ops._moe_C.moe_unpermute(permuted_hidden_states, topk_weights, inv_permuted_idx, expert_first_token_offset, topk, out) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 8ca8249e694e..7bc35cd81ac3 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -669,6 +669,25 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: from vllm.model_executor.layers.fused_moe import fused_experts self.fused_experts_func = fused_experts + if self.use_cutlass: + device = layer.w13_weight.device + # ab_strides1 and c_strides2 are the same + self.ab_strides1_c_strides2 = torch.full( + (layer.local_num_experts, ), + layer.hidden_size, + device=device, + dtype=torch.int64) + self.ab_strides2 = torch.full( + (layer.local_num_experts, ), + layer.intermediate_size_per_partition, + device=device, + dtype=torch.int64) + self.c_strides1 = torch.full( + (layer.local_num_experts, ), + 2 * layer.intermediate_size_per_partition, + device=device, + dtype=torch.int64) + def select_gemm_impl( self, prepare_finalize: FusedMoEPrepareAndFinalize, @@ -693,6 +712,10 @@ def select_gemm_impl( moe.in_dtype, self.input_quant.strategy == QuantizationStrategy.TOKEN, self.weight_quant.strategy == QuantizationStrategy.CHANNEL, + ab_strides1=self.ab_strides1_c_strides2, + ab_strides2=self.ab_strides2, + c_strides1=self.c_strides1, + c_strides2=self.ab_strides1_c_strides2, ) else: logger.debug("CutlassExpertsFp8(%s)", self.__class__.__name__) @@ -700,6 +723,10 @@ def select_gemm_impl( moe.in_dtype, self.input_quant.strategy == QuantizationStrategy.TOKEN, self.weight_quant.strategy == QuantizationStrategy.CHANNEL, + ab_strides1=self.ab_strides1_c_strides2, + ab_strides2=self.ab_strides2, + c_strides1=self.c_strides1, + c_strides2=self.ab_strides1_c_strides2, ) self.disable_expert_map = (num_dispatchers > 1 @@ -822,6 +849,10 @@ def apply( expert_map=None if self.disable_expert_map else expert_map, w1_scale=layer.w13_weight_scale, w2_scale=layer.w2_weight_scale, + 
ab_strides1=self.ab_strides1_c_strides2,
+            ab_strides2=self.ab_strides2,
+            c_strides1=self.c_strides1,
+            c_strides2=self.ab_strides1_c_strides2,
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
         )

From 2de3c7bca25eca21ddd9b93486a5870eaafeb8b0 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Wed, 20 Aug 2025 23:42:28 +0800
Subject: [PATCH 185/231] [CLI][Doc] Formalize `--mm-encoder-tp-mode` (#23190)

Signed-off-by: DarkLight1337
Signed-off-by: Duncan Moss
---
 docs/configuration/optimization.md       | 45 ++++++++++++++++++++++++
 vllm/config/__init__.py                  | 34 +++++++++++++++++-
 vllm/config/parallel.py                  |  4 ---
 vllm/engine/arg_utils.py                 | 35 +++++++++++-------
 vllm/model_executor/models/mllama4.py    |  4 +--
 vllm/model_executor/models/qwen2_5_vl.py |  3 +-
 vllm/model_executor/models/step3_vl.py   |  3 +-
 7 files changed, 104 insertions(+), 24 deletions(-)

diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md
index c7f50497d6ff..db9dfb313fb8 100644
--- a/docs/configuration/optimization.md
+++ b/docs/configuration/optimization.md
@@ -129,6 +129,51 @@ Data parallelism replicates the entire model across multiple GPU sets and proces
 Data parallelism can be combined with the other parallelism strategies and is set by `data_parallel_size=N`.
 Note that MoE layers will be sharded according to the product of the tensor parallel size and data parallel size.
 
+### Batch-level DP for Multi-Modal Encoders
+
+By default, TP is used to shard the weights of multi-modal encoders just like for language decoders,
+in order to reduce the memory and compute load on each GPU.
+
+However, since the size of multi-modal encoders is very small compared to language decoders,
+there is relatively little gain from TP. On the other hand, TP incurs significant communication
+overhead because of all-reduce being performed after every layer.
+
+Given this, it may be advantageous to instead shard the batched input data using TP, essentially
+performing batch-level DP. This has been shown to improve the throughput by around 10% for
+`tensor_parallel_size=8`. For vision encoders that use hardware-unoptimized Conv3D operations,
+batch-level DP can provide another 40% increase in throughput compared to regular TP.
+
+Nevertheless, since the weights of the multi-modal encoder are replicated across each TP rank,
+there will be a minor increase in memory consumption, which may cause OOM if you can barely fit the model already.
+
+You can enable batch-level DP by setting `mm_encoder_tp_mode="data"`, for example:
+
+```python
+from vllm import LLM
+
+llm = LLM(
+    model="Qwen/Qwen2.5-VL-72B-Instruct",
+    # Create two EngineCore instances, one per DP rank
+    data_parallel_size=2,
+    # Within each EngineCore instance:
+    # The vision encoder uses TP=4 (not DP=2) to shard the input data
+    # The language decoder uses TP=4 to shard the weights as usual
+    tensor_parallel_size=4,
+    mm_encoder_tp_mode="data",
+)
+```
+
+!!! important
+    Batch-level DP is not to be confused with API request-level DP
+    (which is instead controlled by `data_parallel_size`).
+
+The availability of batch-level DP depends on the model implementation.
+Currently, the following models support `mm_encoder_tp_mode="data"`: + +- Llama4 () +- Qwen2.5-VL () +- Step3 () + ## Input Processing ### Parallel Processing diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 801fa97fe5da..5b5d477ef066 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -258,6 +258,7 @@ def is_init_field(cls: ConfigType, name: str) -> bool: ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] LogprobsMode = Literal["raw_logprobs", "raw_logits", "processed_logprobs", "processed_logits"] +MMEncoderTPMode = Literal["weights", "data"] @config @@ -438,6 +439,19 @@ class ModelConfig: `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. Set to `0` to disable this cache completely (not recommended).""" + mm_encoder_tp_mode: MMEncoderTPMode = "weights" + """Indicates how to optimize multi-modal encoder inference using + tensor parallelism (TP). + + - `"weights"`: Within the same vLLM engine, split the weights of + each layer across TP ranks. (default TP behavior) + - `"data"`: Within the same vLLM engine, split the batched input data + across TP ranks to process the data in parallel, while hosting + the full weights on each TP rank. + This batch-level DP is not to be confused with API request-level + DP (which is controlled by `--data-parallel-size`). + This is only supported on a per-model basis and falls back to + `"weights"` if the encoder does not support DP.""" override_neuron_config: dict[str, Any] = field(default_factory=dict) """Initialize non-default neuron config or override default neuron config that are specific to Neuron devices, this argument will be used to @@ -856,8 +870,10 @@ def _init_multimodal_config(self) -> Optional["MultiModalConfig"]: media_io_kwargs=self.media_io_kwargs, mm_processor_kwargs=self.mm_processor_kwargs, mm_processor_cache_gb=self.mm_processor_cache_gb, + mm_encoder_tp_mode=self.mm_encoder_tp_mode, interleave_mm_strings=self.interleave_mm_strings, - skip_mm_profiling=self.skip_mm_profiling) + skip_mm_profiling=self.skip_mm_profiling, + ) return None @@ -2547,6 +2563,22 @@ class MultiModalConfig: Set to `0` to disable this cache completely (not recommended). """ + mm_encoder_tp_mode: MMEncoderTPMode = "weights" + """ + Indicates how to optimize multi-modal encoder inference using + tensor parallelism (TP). + + - `"weights"`: Within the same vLLM engine, split the weights of + each layer across TP ranks. (default TP behavior) + - `"data"`: Within the same vLLM engine, split the batched input data + across TP ranks to process the data in parallel, while hosting + the full weights on each TP rank. + This batch-level DP is not to be confused with API request-level + DP (which is controlled by `--data-parallel-size`). + This is only supported on a per-model basis and falls back to + `"weights"` if the encoder does not support DP. + """ + interleave_mm_strings: bool = False """ Enable fully interleaved support for multimodal prompts. diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index bac1e63800d7..7a9e68f0ea33 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -137,10 +137,6 @@ class is dynamically inherited by the worker class. This is used to inject rank: int = 0 """Global rank in distributed setup.""" - enable_multimodal_encoder_data_parallel: bool = False - """ Use data parallelism instead of tensor parallelism for vision encoder. 
- Only support LLama4 for now""" - @property def world_size_across_dp(self) -> int: """world_size_across_dp is TPxPPxDP, it is the size of the world diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 48d9cd08af03..6869c3f23f31 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -28,12 +28,12 @@ DeviceConfig, DistributedExecutorBackend, GuidedDecodingBackend, HfOverrides, KVEventsConfig, KVTransferConfig, LoadConfig, LogprobsMode, - LoRAConfig, MambaDType, ModelConfig, ModelDType, - ModelImpl, MultiModalConfig, ObservabilityConfig, - ParallelConfig, PoolerConfig, PrefixCachingHashAlgo, - RunnerOption, SchedulerConfig, SchedulerPolicy, - SpeculativeConfig, TaskOption, TokenizerMode, - VllmConfig, get_attr_docs, get_field) + LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig, + ModelDType, ModelImpl, MultiModalConfig, + ObservabilityConfig, ParallelConfig, PoolerConfig, + PrefixCachingHashAlgo, RunnerOption, SchedulerConfig, + SchedulerPolicy, SpeculativeConfig, TaskOption, + TokenizerMode, VllmConfig, get_attr_docs, get_field) from vllm.logger import init_logger from vllm.platforms import CpuArchEnum, current_platform from vllm.plugins import load_general_plugins @@ -352,6 +352,7 @@ class EngineArgs: MultiModalConfig.mm_processor_kwargs disable_mm_preprocessor_cache: bool = False # DEPRECATED mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb + mm_encoder_tp_mode: MMEncoderTPMode = MultiModalConfig.mm_encoder_tp_mode skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling # LoRA fields enable_lora: bool = False @@ -434,16 +435,14 @@ class EngineArgs: use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load pt_load_map_location: str = LoadConfig.pt_load_map_location - enable_multimodal_encoder_data_parallel: bool = \ - ParallelConfig.enable_multimodal_encoder_data_parallel + # DEPRECATED + enable_multimodal_encoder_data_parallel: bool = False logits_processors: Optional[list[Union[ str, type[LogitsProcessor]]]] = ModelConfig.logits_processors """Custom logitproc types""" async_scheduling: bool = SchedulerConfig.async_scheduling - # DEPRECATED - enable_prompt_adapter: bool = False kv_sharing_fast_prefill: bool = \ CacheConfig.kv_sharing_fast_prefill @@ -685,7 +684,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: **parallel_kwargs["worker_extension_cls"]) parallel_group.add_argument( "--enable-multimodal-encoder-data-parallel", - **parallel_kwargs["enable_multimodal_encoder_data_parallel"]) + action="store_true", + deprecated=True) # KV cache arguments cache_kwargs = get_kwargs(CacheConfig) @@ -735,6 +735,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: multimodal_group.add_argument("--disable-mm-preprocessor-cache", action="store_true", deprecated=True) + multimodal_group.add_argument( + "--mm-encoder-tp-mode", **multimodal_kwargs["mm_encoder_tp_mode"]) multimodal_group.add_argument( "--interleave-mm-strings", **multimodal_kwargs["interleave_mm_strings"]) @@ -909,6 +911,14 @@ def create_model_config(self) -> ModelConfig: self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB + if self.enable_multimodal_encoder_data_parallel: + logger.warning( + "--enable-multimodal-encoder-data-parallel` is deprecated " + "and will be removed in v0.13. 
" + "Please use `--mm-encoder-tp-mode data` instead.") + + self.mm_encoder_tp_mode = "data" + return ModelConfig( model=self.model, hf_config_path=self.hf_config_path, @@ -947,6 +957,7 @@ def create_model_config(self) -> ModelConfig: config_format=self.config_format, mm_processor_kwargs=self.mm_processor_kwargs, mm_processor_cache_gb=self.mm_processor_cache_gb, + mm_encoder_tp_mode=self.mm_encoder_tp_mode, override_neuron_config=self.override_neuron_config, override_pooler_config=self.override_pooler_config, logits_processor_pattern=self.logits_processor_pattern, @@ -1258,8 +1269,6 @@ def create_engine_config( distributed_executor_backend=self.distributed_executor_backend, worker_cls=self.worker_cls, worker_extension_cls=self.worker_extension_cls, - enable_multimodal_encoder_data_parallel=self. - enable_multimodal_encoder_data_parallel, ) if model_config.is_multimodal_model: diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 35103eac8fb5..595bdd17cf2c 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -728,8 +728,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config - self.use_data_parallel = (vllm_config.parallel_config. - enable_multimodal_encoder_data_parallel) + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" + self.config = config self.quant_config = quant_config self.multimodal_config = multimodal_config diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 34eec10296b5..811ecffcc1e4 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -877,8 +877,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config multimodal_config = vllm_config.model_config.multimodal_config - self.use_data_parallel = (vllm_config.parallel_config. - enable_multimodal_encoder_data_parallel) + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" self.config = config self.multimodal_config = multimodal_config diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index 5d41a9e569f5..f8877b584b19 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -882,8 +882,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.config = config self.multimodal_config = multimodal_config - self.use_data_parallel = (vllm_config.parallel_config. 
- enable_multimodal_encoder_data_parallel) + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" if multimodal_config.get_limit_per_prompt("image"): self.vision_model = Step3VisionTransformer( From 240e099d3bd5519e38b7c27ab596e5f61121aeac Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 20 Aug 2025 09:05:29 -0700 Subject: [PATCH 186/231] [Misc] Add max_seq_len to CommonAttentionMetadata (#23216) Signed-off-by: Woosuk Kwon Signed-off-by: Duncan Moss --- tests/v1/attention/utils.py | 2 ++ tests/v1/spec_decode/test_tree_attention.py | 2 ++ vllm/v1/attention/backends/flash_attn.py | 2 +- vllm/v1/attention/backends/flashinfer.py | 2 +- vllm/v1/attention/backends/flex_attention.py | 2 +- vllm/v1/attention/backends/rocm_aiter_fa.py | 2 +- vllm/v1/attention/backends/tree_attn.py | 2 +- vllm/v1/attention/backends/triton_attn.py | 2 +- vllm/v1/attention/backends/utils.py | 6 ++++++ vllm/v1/attention/backends/xformers.py | 2 +- vllm/v1/spec_decode/eagle.py | 1 + vllm/v1/worker/gpu_model_runner.py | 4 ++++ 12 files changed, 22 insertions(+), 7 deletions(-) diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index a4e38eb32f6a..e547e71e0cdb 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -58,6 +58,7 @@ def create_common_attn_metadata( dtype=torch.int32, device=device) seq_lens_cpu = seq_lens.cpu() + max_seq_len = int(seq_lens_cpu.max()) # Create computed tokens (context length for each sequence) context_lens = [ @@ -101,6 +102,7 @@ def create_common_attn_metadata( num_reqs=batch_spec.batch_size, num_actual_tokens=num_tokens, max_query_len=max_query_len, + max_seq_len=max_seq_len, block_table_tensor=block_table_tensor, slot_mapping=slot_mapping, causal=True, diff --git a/tests/v1/spec_decode/test_tree_attention.py b/tests/v1/spec_decode/test_tree_attention.py index 456ce712d36e..631781740866 100644 --- a/tests/v1/spec_decode/test_tree_attention.py +++ b/tests/v1/spec_decode/test_tree_attention.py @@ -50,6 +50,7 @@ def forward_attention( dtype=torch.int32, ) context_lens = seq_lens - query_lens + max_seq_len = int(seq_lens.max()) max_query_len = q_len num_actual_tokens = query_start_loc[-1] @@ -81,6 +82,7 @@ def forward_attention( num_reqs=batch_size, num_actual_tokens=num_actual_tokens, max_query_len=max_query_len, + max_seq_len=max_seq_len, block_table_tensor=block_table, slot_mapping=slot_mapping, ) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index ab7a71a399b3..eed3cba9a2ca 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -233,7 +233,7 @@ def build(self, num_reqs = common_attn_metadata.num_reqs num_actual_tokens = common_attn_metadata.num_actual_tokens max_query_len = common_attn_metadata.max_query_len - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens seq_lens_cpu = common_attn_metadata.seq_lens_cpu diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 53fafbc4af91..8a25088848a4 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -463,7 +463,7 @@ def build(self, page_size = self.page_size max_q_len = common_attn_metadata.max_query_len - max_seq_len = common_attn_metadata.seq_lens_cpu.max().item() + max_seq_len = common_attn_metadata.max_seq_len seq_lens = 
common_attn_metadata.seq_lens seq_lens_cpu = common_attn_metadata.seq_lens_cpu block_table_tensor = common_attn_metadata.block_table_tensor diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index e599411b2d7e..abca981035d9 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -305,7 +305,7 @@ def build(self, num_actual_tokens = common_attn_metadata.num_actual_tokens max_query_len = common_attn_metadata.max_query_len - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens block_table_tensor = common_attn_metadata.block_table_tensor diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 36b5853bfdcb..b9ff113573a1 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -270,7 +270,7 @@ def build(self, num_actual_tokens = common_attn_metadata.num_actual_tokens max_query_len = common_attn_metadata.max_query_len - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens block_table_tensor = common_attn_metadata.block_table_tensor diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py index 5d10e9e26082..2a0c52377cc7 100644 --- a/vllm/v1/attention/backends/tree_attn.py +++ b/vllm/v1/attention/backends/tree_attn.py @@ -205,7 +205,7 @@ def build( q_start_loc = common_attn_metadata.query_start_loc max_query_len = common_attn_metadata.max_query_len kv_seqlens = common_attn_metadata.seq_lens - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len block_table = common_attn_metadata.block_table_tensor slot_mapping = common_attn_metadata.slot_mapping diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 48a9af3decac..c69dd8415f92 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -90,7 +90,7 @@ def build(self, num_actual_tokens = common_attn_metadata.num_actual_tokens max_query_len = common_attn_metadata.max_query_len - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens block_table_tensor = common_attn_metadata.block_table_tensor diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 94dd3d2629eb..57c4d436c5b6 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -58,6 +58,8 @@ class CommonAttentionMetadata: """Total number of tokens in batch""" max_query_len: int """Longest query in batch""" + max_seq_len: int + """Longest context length in batch""" block_table_tensor: torch.Tensor slot_mapping: torch.Tensor @@ -107,6 +109,7 @@ def _make_metadata_with_slice( seq_lens = attn_metadata.seq_lens[request_slice] seq_lens_cpu = attn_metadata.seq_lens_cpu[request_slice] + max_seq_len = int(seq_lens_cpu.max()) num_computed_tokens_cpu = attn_metadata.num_computed_tokens_cpu[ request_slice] @@ -128,6 +131,7 @@ def _make_metadata_with_slice( num_reqs=num_requests, 
num_actual_tokens=num_actual_tokens, max_query_len=max_query_len, + max_seq_len=max_seq_len, block_table_tensor=block_table_tensor, slot_mapping=slot_mapping, ) @@ -520,6 +524,7 @@ def make_local_attention_virtual_batches( query_start_loc_cpu = torch.from_numpy(cu_seqlens_q_local) seq_lens_cpu = torch.from_numpy(seqlens_k_local) + max_seq_len = int(seq_lens_cpu.max()) return CommonAttentionMetadata( query_start_loc_cpu=query_start_loc_cpu, @@ -531,6 +536,7 @@ def make_local_attention_virtual_batches( num_reqs=len(seq_lens_cpu), num_actual_tokens=common_attn_metadata.num_actual_tokens, max_query_len=seqlens_q_local.max(), + max_seq_len=max_seq_len, block_table_tensor=block_table_local, slot_mapping=common_attn_metadata.slot_mapping, causal=True, diff --git a/vllm/v1/attention/backends/xformers.py b/vllm/v1/attention/backends/xformers.py index fe732c601770..b305bc153908 100644 --- a/vllm/v1/attention/backends/xformers.py +++ b/vllm/v1/attention/backends/xformers.py @@ -231,7 +231,7 @@ def build( q_seqlens = torch.diff(q_start_loc) max_query_len = common_attn_metadata.max_query_len kv_seqlens = common_attn_metadata.seq_lens - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len block_table = common_attn_metadata.block_table_tensor slot_mapping = common_attn_metadata.slot_mapping diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 8cd2ad12cfa3..cc2b2a139d5e 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -582,6 +582,7 @@ def prepare_inputs( num_reqs=common_attn_metadata.num_reqs, num_actual_tokens=total_num_tokens, max_query_len=new_query_len_per_req.max().item(), + max_seq_len=new_seq_lens_cpu.max().item(), block_table_tensor=common_attn_metadata.block_table_tensor, slot_mapping=common_attn_metadata.slot_mapping[token_indices], causal=True, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e0bab3367caf..d9770226b14e 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -774,6 +774,7 @@ def _prepare_inputs( self.seq_lens_np[num_reqs:].fill(0) self.seq_lens.copy_(self.seq_lens_cpu, non_blocking=True) seq_lens = self.seq_lens[:num_reqs] + max_seq_len = self.seq_lens_np[:num_reqs].max().item() # Copy the tensors to the GPU. self.input_ids[:total_num_scheduled_tokens].copy_( @@ -886,6 +887,7 @@ def _prepare_inputs( num_reqs=num_reqs, num_actual_tokens=total_num_scheduled_tokens, max_query_len=max_num_scheduled_tokens, + max_seq_len=max_seq_len, block_table_tensor=blk_table_tensor, slot_mapping=slot_mapping, causal=True, @@ -2338,6 +2340,7 @@ def _dummy_run( num_reqs=num_reqs, num_actual_tokens=num_tokens, max_query_len=max_query_len, + max_seq_len=self.max_model_len, block_table_tensor=self.input_batch.block_table[ kv_cache_group_id].get_device_tensor()[:num_reqs], slot_mapping=self.input_batch. 
@@ -3343,6 +3346,7 @@ def _build_encoder_only_attn_metadata( num_reqs=num_reqs, num_actual_tokens=total_num_scheduled_tokens, max_query_len=max_num_scheduled_tokens, + max_seq_len=self.seq_lens_cpu[:num_reqs].max().item(), block_table_tensor=dummy_block_table, slot_mapping=dummy_slot_mapping, causal=False, From 35b1c745cd3bfcc03af5e2d8db0c96a693790221 Mon Sep 17 00:00:00 2001 From: JartX Date: Wed, 20 Aug 2025 18:08:29 +0200 Subject: [PATCH 187/231] [FIXBUG ] Allow disabling rocm_aiter_fa backend for ROCm GPUs not compatible with AITER (#22795) Signed-off-by: JartX Signed-off-by: tjtanaa Co-authored-by: tjtanaa Signed-off-by: Duncan Moss --- vllm/v1/spec_decode/eagle.py | 80 ++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index cc2b2a139d5e..0a0e9fed725c 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -2,7 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import ast from dataclasses import replace -from typing import Optional +from importlib.util import find_spec +from typing import Optional, Protocol import numpy as np import torch @@ -20,8 +21,6 @@ from vllm.platforms import current_platform from vllm.utils import is_pin_memory_available from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata -from vllm.v1.attention.backends.rocm_aiter_fa import ( - AiterFlashAttentionMetadata) from vllm.v1.attention.backends.tree_attn import (TreeAttentionMetadata, TreeAttentionMetadataBuilder) from vllm.v1.attention.backends.triton_attn import TritonAttentionMetadata @@ -34,6 +33,17 @@ PADDING_SLOT_ID = -1 +class EagleAttentionMetadata(Protocol): + # Required attributes + num_actual_tokens: int + max_query_len: int + query_start_loc: torch.Tensor + max_seq_len: int + seq_lens: torch.Tensor + block_table: torch.Tensor + slot_mapping: torch.Tensor + + class EagleProposer: def __init__( @@ -97,6 +107,20 @@ def __init__( dtype=self.dtype, device=device) + # Determine allowed attention backends once during initialization. + self.allowed_attn_types: tuple[type[EagleAttentionMetadata], ...] + if current_platform.is_rocm(): + rocm_types = [TritonAttentionMetadata, FlashAttentionMetadata] + # vllm.v1.attention.backends.rocm_aiter_fa is an optional backend + if find_spec("vllm.v1.attention.backends.rocm_aiter_fa"): + from vllm.v1.attention.backends.rocm_aiter_fa import ( + AiterFlashAttentionMetadata) + rocm_types.append(AiterFlashAttentionMetadata) + self.allowed_attn_types = tuple(rocm_types) + else: + self.allowed_attn_types = (FlashAttentionMetadata, + TreeAttentionMetadata) + # Parse the speculative token tree. spec_token_tree = self.speculative_config.speculative_token_tree self.tree_choices: list[tuple[int, @@ -165,7 +189,7 @@ def propose( for layer_name in self.attn_layer_names: per_layer_attn_metadata[layer_name] = attn_metadata if self.use_cuda_graph and \ - num_tokens <= self.cudagraph_batch_sizes[-1]: + num_tokens <= self.cudagraph_batch_sizes[-1]: num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens) else: num_input_tokens = num_tokens @@ -225,25 +249,13 @@ def propose( # TODO: Currently, MTP module released by deepseek only has # one layer. Adapt this code to support multiple layers once # there's a multi-layer MTP module. - - # On ROCm, both AiterFlashAttention and TritonAttention - # support multi-token eagle spec decode. 
- if current_platform.is_rocm(): - assert isinstance( - attn_metadata, - (TritonAttentionMetadata, AiterFlashAttentionMetadata, - FlashAttentionMetadata)) - else: - # Currently, only FlashAttention supports multi-token eagle spec - # decode. This is because the code below makes assumptions about - # attn_metadata attributes available. - assert isinstance(attn_metadata, FlashAttentionMetadata) + assert isinstance(attn_metadata, self.allowed_attn_types) # Generate the remaining draft tokens. draft_token_ids_list = [draft_token_ids] if self.use_cuda_graph and \ - batch_size <= self.cudagraph_batch_sizes[-1]: + batch_size <= self.cudagraph_batch_sizes[-1]: input_batch_size = self.vllm_config.pad_for_cudagraph(batch_size) else: input_batch_size = batch_size @@ -449,7 +461,7 @@ def propose_tree( num_tokens, -1) if self.use_cuda_graph and \ - num_tokens <= self.cudagraph_batch_sizes[-1]: + num_tokens <= self.cudagraph_batch_sizes[-1]: num_input_tokens = self.vllm_config.pad_for_cudagraph( num_tokens) else: @@ -508,19 +520,19 @@ def prepare_inputs( """ # E.g. # common_attn_metadata.query_start_loc{_cpu}: - # [0, q1, q1 + q2, q1 + q2 + q3] + # [0, q1, q1 + q2, q1 + q2 + q3] # common_attn_metadata.seq_lens{_cpu}: [s1, s2, s3] # num_rejected_tokens: [n1, n2, n3] # This function computes the intermediate values: # num_tokens_per_req: [q1 - n1, q2 - n2, q3 - n3] # And returns: # common_attn_metadata.query_start_loc{_cpu}: - # [0, q1 - n1, q1 + q2 - n1 - n2, q1 + q2 + q3 - n1 - n2 - n3] + # [0, q1 - n1, q1 + q2 - n1 - n2, q1 + q2 + q3 - n1 - n2 - n3] # common_attn_metadata.seq_lens{_cpu}: - # [s1 - n1 + 1, s2 - n2 + 1, s3 - n3 + 1] + # [s1 - n1 + 1, s2 - n2 + 1, s3 - n3 + 1] # token_indices: [0, 1, ..., q1 - n1 - 1, - # q1, q1 + 1, ..., q1 + q2 - n2 - 1, - # q1 + q2, q1 + q2 + 1, ..., q1 + q2 + q3 - n3 - 1] + # q1, q1 + 1, ..., q1 + q2 - n2 - 1, + # q1 + q2, q1 + q2 + 1, ..., q1 + q2 + q3 - n3 - 1] device = common_attn_metadata.query_start_loc.device query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu @@ -564,9 +576,9 @@ def prepare_inputs( old_query_start_locs_expanded = np.repeat( query_start_loc_cpu[:-1].numpy(), new_num_tokens_per_req_np) # Final token indices are: - # [0, 1, // req 1 - # q1 + 0, q1 + 1, q1 + 2, q1 + 3, // req 2 - # q1 + q2 + 0, q1 + q2 + 1, q1 + q2 + 2] // req 3 + # [0, 1, // req 1 + # q1 + 0, q1 + 1, q1 + 2, q1 + 3, // req 2 + # q1 + q2 + 0, q1 + q2 + 1, q1 + q2 + 2] // req 3 token_indices_np = token_offests + old_query_start_locs_expanded token_indices = torch.from_numpy(token_indices_np).to( device, non_blocking=True) @@ -616,20 +628,18 @@ def load_model(self, target_model: nn.Module) -> None: target_language_model = target_model # share embed_tokens with the target model if needed if get_pp_group().world_size == 1 \ - and self.model.model.embed_tokens.weight.shape \ - == target_language_model.model.embed_tokens.weight.shape: + and self.model.model.embed_tokens.weight.shape \ + == target_language_model.model.embed_tokens.weight.shape: logger.info( - "Assuming the EAGLE head shares the same vocab embedding" \ - " with the target model." - ) + "Assuming the EAGLE head shares the same vocab embedding" + " with the target model.") del self.model.model.embed_tokens self.model.model.embed_tokens = ( target_language_model.model.embed_tokens) else: logger.info( - "The EAGLE head's vocab embedding will be loaded separately" \ - " from the target model." 
- ) + "The EAGLE head's vocab embedding will be loaded separately" + " from the target model.") # share lm_head with the target model if needed # some model definition do not define lm_head explicitly From 285cd2b446a695cafcac55d083931f7c015cab37 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Wed, 20 Aug 2025 09:52:59 -0700 Subject: [PATCH 188/231] [torch.compile] Support conditional torch.compile per module (#22269) Signed-off-by: Yong Hoon Shin Signed-off-by: Duncan Moss --- .buildkite/test-pipeline.yaml | 2 + .../compile/piecewise/test_multiple_graphs.py | 137 +++------- tests/compile/test_decorator.py | 251 ++++++++++++++++++ vllm/compilation/decorators.py | 21 +- 4 files changed, 308 insertions(+), 103 deletions(-) create mode 100644 tests/compile/test_decorator.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 2f7f1db75bfb..745420664010 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -328,6 +328,7 @@ steps: - pytest -v -s compile/test_sequence_parallelism.py - pytest -v -s compile/test_async_tp.py - pytest -v -s compile/test_fusion_all_reduce.py + - pytest -v -s compile/test_decorator.py - label: PyTorch Fullgraph Smoke Test # 9min mirror_hardwares: [amdexperimental] @@ -341,6 +342,7 @@ steps: - pytest -v -s compile/piecewise/test_simple.py - pytest -v -s compile/piecewise/test_toy_llama.py - pytest -v -s compile/piecewise/test_full_cudagraph.py + - pytest -v -s compile/piecewise/test_multiple_graphs.py - label: PyTorch Fullgraph Test # 18min mirror_hardwares: [amdexperimental] diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py index e460d7095178..f5e2d9ddb752 100644 --- a/tests/compile/piecewise/test_multiple_graphs.py +++ b/tests/compile/piecewise/test_multiple_graphs.py @@ -12,10 +12,9 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import (ignore_torch_compile, support_torch_compile) -from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, - set_current_vllm_config) -from vllm.envs import VLLM_USE_V1 -from vllm.forward_context import set_forward_context +from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode, + VllmConfig, set_current_vllm_config) +from vllm.forward_context import BatchDescriptor, set_forward_context from vllm.utils import direct_register_custom_op # create a library to hold the custom op @@ -164,104 +163,34 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -def test_ignore_torch_compile_decorator(): - assert VLLM_USE_V1 - - # piecewise - vllm_config = VllmConfig(compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, - use_cudagraph=True, - splitting_ops=["silly.attention"], - cudagraph_capture_sizes=[1, 2], - )) - - @support_torch_compile - class A(nn.Module): - - def __init__(self, - *, - vllm_config: VllmConfig, - prefix: str = '', - **kwargs) -> None: - super().__init__() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = x + x - attn_output = torch.empty_like(x) - torch.ops.silly.attention(x, x, x, attn_output) - x = attn_output - x = x * 3 - return x - - @ignore_torch_compile - class B(A): - ... - - @support_torch_compile - class C(B): - ... 
- - with set_current_vllm_config(vllm_config): - mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda() - - # A has support_torch_compile - with compilation_counter.expect( - num_graphs_seen=1, - num_piecewise_graphs_seen=3, - num_piecewise_capturable_graphs_seen=2, - num_backend_compilations=2, - num_cudagraph_captured=4, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen - ), set_forward_context({}, vllm_config=vllm_config): - # first run is for compile - mod_A(torch.randn(BATCH_SIZE, MLP_SIZE).cuda()) - # run cudagraph captured sizes - mod_A(torch.randn(2, MLP_SIZE).cuda()) - mod_A(torch.randn(1, MLP_SIZE).cuda()) - - with set_current_vllm_config(vllm_config): - mod_B = B(vllm_config=vllm_config, prefix='').eval().cuda() - - # B's ignore_torch_compile should override A's support_torch_compile - with compilation_counter.expect( - num_graphs_seen=0, - num_piecewise_graphs_seen=0, - num_piecewise_capturable_graphs_seen=0, - num_backend_compilations=0, - num_cudagraph_captured=0, - ), set_forward_context({}, vllm_config=vllm_config): - mod_B(torch.randn(BATCH_SIZE, MLP_SIZE).cuda()) - mod_B(torch.randn(2, MLP_SIZE).cuda()) - mod_B(torch.randn(1, MLP_SIZE).cuda()) - - with set_current_vllm_config(vllm_config): - mod_C = C(vllm_config=vllm_config, prefix='').eval().cuda() - - # C's support_torch_compile should override B's ignore_torch_compile - with compilation_counter.expect( - num_graphs_seen=1, - num_piecewise_graphs_seen=3, - num_piecewise_capturable_graphs_seen=2, - num_backend_compilations=2, - num_cudagraph_captured=4, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen - ), set_forward_context({}, vllm_config=vllm_config): - mod_C(torch.randn(BATCH_SIZE, MLP_SIZE).cuda()) - mod_C(torch.randn(2, MLP_SIZE).cuda()) - mod_C(torch.randn(1, MLP_SIZE).cuda()) - - @torch.inference_mode -def run_model(vllm_config, model: nn.Module, inputs: torch.Tensor): +def run_model(vllm_config: VllmConfig, model: nn.Module, inputs: torch.Tensor, + cudagraph_runtime_mode: CUDAGraphMode): with set_forward_context({}, vllm_config=vllm_config): - # First run is for compile + # warmup for the model with cudagraph_mode NONE model(inputs) - # Run CUDAGraph captured sizes - model(inputs[:2]) - model(inputs[:1]) - - output = model(inputs[:2]) + # simulate cudagraphs capturing + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=2, )): + model(inputs[:2]) + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=1, )): + model(inputs[:1]) + + # simulate cudagraphs replay + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=2, )): + output = model(inputs[:2]) output = output.cpu() return output.cpu() @@ -277,6 +206,7 @@ def test_multi_graph_piecewise_compile_outputs_equal(): splitting_ops=["silly.attention"], cudagraph_capture_sizes=[1, 2], )) + cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE with set_current_vllm_config(vllm_config): model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE, @@ -299,11 +229,13 @@ def test_multi_graph_piecewise_compile_outputs_equal(): num_cudagraph_captured=8, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ): - outputs.append(run_model(vllm_config, model, inputs)) + outputs.append( + run_model(vllm_config, model, inputs, cudagraph_runtime_mode)) # 
no compile or cudagraph vllm_config = VllmConfig(compilation_config=CompilationConfig( level=CompilationLevel.NO_COMPILATION, )) + cudagraph_runtime_mode = CUDAGraphMode.NONE with set_current_vllm_config(vllm_config): model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE, @@ -318,7 +250,8 @@ def test_multi_graph_piecewise_compile_outputs_equal(): num_backend_compilations=0, num_cudagraph_captured=0, ): - outputs.append(run_model(vllm_config, model, inputs)) + outputs.append( + run_model(vllm_config, model, inputs, cudagraph_runtime_mode)) # piecewise compile without CUDA graph vllm_config = VllmConfig(compilation_config=CompilationConfig( @@ -326,6 +259,7 @@ def test_multi_graph_piecewise_compile_outputs_equal(): use_cudagraph=False, splitting_ops=["silly.attention"], )) + cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE with set_current_vllm_config(vllm_config): model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE, @@ -340,7 +274,8 @@ def test_multi_graph_piecewise_compile_outputs_equal(): num_backend_compilations=4, num_cudagraph_captured=0, # no cudagraph captured ): - outputs.append(run_model(vllm_config, model, inputs)) + outputs.append( + run_model(vllm_config, model, inputs, cudagraph_runtime_mode)) # Generally don't expect outputs with and without inductor # to be bitwise equivalent diff --git a/tests/compile/test_decorator.py b/tests/compile/test_decorator.py new file mode 100644 index 000000000000..51f8ddd566d5 --- /dev/null +++ b/tests/compile/test_decorator.py @@ -0,0 +1,251 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import torch +from torch import nn +from torch.library import Library + +from vllm.compilation.counter import compilation_counter +from vllm.compilation.decorators import (ignore_torch_compile, + support_torch_compile) +from vllm.config import (CacheConfig, CompilationConfig, CompilationLevel, + CUDAGraphMode, VllmConfig, set_current_vllm_config) +from vllm.forward_context import BatchDescriptor, set_forward_context +from vllm.utils import direct_register_custom_op + +# create a library to hold the custom op +silly_lib = Library("silly", "FRAGMENT") # noqa + +BATCH_SIZE = 32 +MLP_SIZE = 128 + + +def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + out.copy_(q) + out += k + out += v + + +def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + return + + +direct_register_custom_op( + op_name="attention", + op_func=silly_attention, + mutates_args=["out"], + fake_impl=silly_attention_fake, + target_lib=silly_lib, +) + + +@torch.inference_mode +def run_model(vllm_config: VllmConfig, model: nn.Module, + cudagraph_runtime_mode: CUDAGraphMode): + with set_forward_context({}, vllm_config=vllm_config): + # warmup for the model with cudagraph_mode NONE + model(torch.randn(BATCH_SIZE, MLP_SIZE).cuda()) + + # simulate cudagraphs capturing + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=2, )): + model(torch.randn(2, MLP_SIZE).cuda()) + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=1, )): + model(torch.randn(1, MLP_SIZE).cuda()) + + # simulate cudagraphs replay + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + 
num_tokens=2, )): + output = model(torch.randn(2, MLP_SIZE).cuda()) + + output = output.cpu() + return output.cpu() + + +def test_ignore_torch_compile_decorator(): + # piecewise + vllm_config = VllmConfig(compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=True, + splitting_ops=["silly.attention"], + cudagraph_capture_sizes=[1, 2], + )) + cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE + + @support_torch_compile + class A(nn.Module): + + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = '', + **kwargs) -> None: + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + x + attn_output = torch.empty_like(x) + torch.ops.silly.attention(x, x, x, attn_output) + x = attn_output + x = x * 3 + return x + + @ignore_torch_compile + class B(A): + ... + + @support_torch_compile + class C(B): + ... + + with set_current_vllm_config(vllm_config): + mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda() + + # A has support_torch_compile + with compilation_counter.expect( + num_graphs_seen=1, + num_piecewise_graphs_seen=3, + num_piecewise_capturable_graphs_seen=2, + num_backend_compilations=2, + num_cudagraph_captured=4, + # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + run_model(vllm_config, mod_A, cudagraph_runtime_mode) + + with set_current_vllm_config(vllm_config): + mod_B = B(vllm_config=vllm_config, prefix='').eval().cuda() + + # B's ignore_torch_compile should override A's support_torch_compile + with compilation_counter.expect( + num_graphs_seen=0, + num_piecewise_graphs_seen=0, + num_piecewise_capturable_graphs_seen=0, + num_backend_compilations=0, + num_cudagraph_captured=0, + ): + run_model(vllm_config, mod_B, cudagraph_runtime_mode) + + with set_current_vllm_config(vllm_config): + mod_C = C(vllm_config=vllm_config, prefix='').eval().cuda() + + # C's support_torch_compile should override B's ignore_torch_compile + with compilation_counter.expect( + num_graphs_seen=1, + num_piecewise_graphs_seen=3, + num_piecewise_capturable_graphs_seen=2, + num_backend_compilations=2, + num_cudagraph_captured=4, + # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + run_model(vllm_config, mod_C, cudagraph_runtime_mode) + + +# Only enable torch.compile if +# vllm_config.cache_config.kv_sharing_fast_prefill=True +@support_torch_compile(enable_if=lambda vllm_config: vllm_config.cache_config. + kv_sharing_fast_prefill) +class B(nn.Module): + + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = '', + **kwargs) -> None: + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + x + attn_output = torch.empty_like(x) + torch.ops.silly.attention(x, x, x, attn_output) + x = attn_output + x = x + x + return x + + +# Only enable torch.compile if +# vllm_config.cache_config.kv_sharing_fast_prefill=False +@support_torch_compile(enable_if=lambda vllm_config: not vllm_config. 
+ cache_config.kv_sharing_fast_prefill) +class A(nn.Module): + + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = '', + **kwargs) -> None: + super().__init__() + self.mod1 = B(vllm_config=vllm_config, prefix=prefix, **kwargs) + self.mod2 = B(vllm_config=vllm_config, prefix=prefix, **kwargs) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.mod1(x) + attn_output = torch.empty_like(x) + torch.ops.silly.attention(x, x, x, attn_output) + x = attn_output + x = self.mod2(x) + return x + + +def test_conditional_compile_enable_if(): + vllm_config = VllmConfig(cache_config=CacheConfig( + kv_sharing_fast_prefill=True, ), + compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=True, + splitting_ops=["silly.attention"], + cudagraph_capture_sizes=[1, 2], + )) + cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE + + with set_current_vllm_config(vllm_config): + mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda() + + # A has support_torch_compile but enable_if fn returns False + # enalbe_if will be True for B, so we expect mod1 and mod2 + # to be compiled + with compilation_counter.expect( + num_graphs_seen=2, + num_piecewise_graphs_seen=6, + # 3 piecewise graphs per instance of B() + num_piecewise_capturable_graphs_seen=4, + num_backend_compilations=4, + num_cudagraph_captured=8, + # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + run_model(vllm_config, mod_A, cudagraph_runtime_mode) + + # Set kv_sharing_fast_prefill=False + # which will cause A to be compiled and B to not be compiled + vllm_config = VllmConfig(cache_config=CacheConfig( + kv_sharing_fast_prefill=False, ), + compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=True, + splitting_ops=["silly.attention"], + cudagraph_capture_sizes=[1, 2], + )) + + with set_current_vllm_config(vllm_config): + mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda() + + with compilation_counter.expect( + num_graphs_seen=1, + num_piecewise_graphs_seen=7, + # 3 attn ops and 4 non-attn ops + num_piecewise_capturable_graphs_seen=4, + num_backend_compilations=4, + num_cudagraph_captured=8, + # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + run_model(vllm_config, mod_A, cudagraph_runtime_mode) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 58f70ef9ef0a..41d9fcb824b0 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -52,6 +52,14 @@ def _should_ignore_torch_compile(cls) -> bool: return getattr(cls, IGNORE_COMPILE_KEY, False) +@overload +def support_torch_compile( + *, + enable_if: Optional[Callable[[VllmConfig], bool]] = None, +) -> Callable[[_T], _T]: + ... + + @overload def support_torch_compile( *, @@ -69,6 +77,7 @@ def support_torch_compile( cls: Optional[_T] = None, *, dynamic_arg_dims: Optional[dict[str, Union[int, list[int]]]] = None, + enable_if: Optional[Callable[[VllmConfig], bool]] = None, ) -> Union[Callable[[_T], _T], _T]: """ A decorator to add support for compiling the forward method of a class. @@ -118,6 +127,11 @@ def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): NOTE: if an argument is `None`, it should always be passed as `None` during the lifetime of the model, otherwise, it cannot be captured as a single computation graph. + + `enable_if` is a function that takes a `VllmConfig` object as input and + returns a boolean value indicating whether to compile the model or not. 
+ This is useful if you want to compile the model only when certain + conditions are met. """ def cls_decorator_helper(cls: _T) -> _T: @@ -149,7 +163,8 @@ def cls_decorator_helper(cls: _T) -> _T: if k not in sig.parameters: raise ValueError( f"Argument {k} not found in the forward method of {cls}") - return _support_torch_compile(cls, inferred_dynamic_arg_dims) + return _support_torch_compile(cls, inferred_dynamic_arg_dims, + enable_if) if cls is not None: # use `support_torch_compile` as a decorator without arguments @@ -162,6 +177,7 @@ def cls_decorator_helper(cls: _T) -> _T: def _support_torch_compile( cls: _T, dynamic_arg_dims: dict[str, Union[int, list[int]]], + enable_if: Optional[Callable[[VllmConfig], bool]] = None, ) -> _T: """ A decorator to add support for compiling the forward method of a class. @@ -182,13 +198,14 @@ def _support_torch_compile( def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs): old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) self.vllm_config = vllm_config + enable_compile = enable_if is None or enable_if(vllm_config) # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner # will handle the compilation, so we don't need to do anything here. self.do_not_compile = \ vllm_config.compilation_config.level in [ CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS ] or not supports_dynamo() or _should_ignore_torch_compile( - self.__class__) + self.__class__) or not enable_compile if self.do_not_compile: return From c826d11170211ec8c3efa2b29d4e6e4ff6f826b5 Mon Sep 17 00:00:00 2001 From: Benji Beck Date: Wed, 20 Aug 2025 10:37:29 -0700 Subject: [PATCH 189/231] Migrate Mistral3ImagePixelInputs to TensorSchema (#21945) Signed-off-by: Benji Beck Co-authored-by: Cyrus Leung Signed-off-by: Duncan Moss --- vllm/model_executor/models/mistral3.py | 38 ++++++++++++-------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index a647292d3a68..438513433d3b 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -3,7 +3,7 @@ from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, +from typing import (Annotated, Final, Literal, Optional, Protocol, TypeVar, Union) import torch @@ -32,6 +32,7 @@ PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) @@ -42,16 +43,24 @@ from .vision import get_vision_encoder_info -class Mistral3ImagePixelInputs(TypedDict): - type: Literal["pixel_values_pixtral"] - pixel_values: Union[torch.Tensor, list[torch.Tensor]] +class Mistral3ImagePixelInputs(TensorSchema): """ - Shape: `(batch_size * num_images, num_channels, height, width)` - - Note that `height` or `width` may be different per batch and image, - in which case the data is passed as a list instead of a batched tensor. 
+ Dimensions: + - bn: Batch size * number of images + - c: Number of channels (3) + - h: Height of each image + - w: Width of each image """ + type: Literal["pixel_values_pixtral"] = "pixel_values_pixtral" + + # Note that `height` or `width` may be different per batch and image, + # in which case the data is passed as a list instead of a batched tensor. + pixel_values: Annotated[ + Union[torch.Tensor, list[torch.Tensor]], + TensorShape("bn", 3, "h", "w", dynamic_dims={"h", "w"}), + ] + class Mistral3PatchMerger(nn.Module): """ @@ -456,19 +465,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: - h = w = self.config.vision_config.image_size - expected_dims = (3, h, w) - actual_dims = tuple(data.shape[1:]) - - if actual_dims != expected_dims: - expected_expr = ("batch_size", *map(str, expected_dims)) - raise ValueError( - f"The expected shape of pixel values is {expected_expr}. " - f"You supplied {tuple(data.shape)}.") - - return data - def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[Mistral3ImagePixelInputs]: pixel_values = kwargs.pop("pixel_values", None) From 58afbd2a4c08225c8875ef97fb6b406e540806c5 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 20 Aug 2025 13:57:37 -0400 Subject: [PATCH 190/231] Limit HTTP header count and size (#23267) Signed-off-by: Taneem Ibrahim Signed-off-by: Russell Bryant Co-authored-by: Taneem Ibrahim Signed-off-by: Duncan Moss --- vllm/entrypoints/constants.py | 10 ++++++++++ vllm/entrypoints/launcher.py | 21 +++++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 2 ++ vllm/entrypoints/openai/cli_args.py | 8 ++++++++ 4 files changed, 41 insertions(+) create mode 100644 vllm/entrypoints/constants.py diff --git a/vllm/entrypoints/constants.py b/vllm/entrypoints/constants.py new file mode 100644 index 000000000000..b5bcccc35d6c --- /dev/null +++ b/vllm/entrypoints/constants.py @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Shared constants for vLLM entrypoints. +""" + +# HTTP header limits for h11 parser +# These constants help mitigate header abuse attacks +H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT = 4194304 # 4 MB +H11_MAX_HEADER_COUNT_DEFAULT = 256 diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index 9f4dc19fb4ab..4e852ba59493 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -14,6 +14,8 @@ from vllm.engine.async_llm_engine import AsyncEngineDeadError from vllm.engine.multiprocessing import MQEngineDeadError from vllm.engine.protocol import EngineClient +from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT, + H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT) from vllm.entrypoints.ssl import SSLCertRefresher from vllm.logger import init_logger from vllm.utils import find_process_using_port @@ -26,6 +28,11 @@ async def serve_http(app: FastAPI, sock: Optional[socket.socket], enable_ssl_refresh: bool = False, **uvicorn_kwargs: Any): + """ + Start a FastAPI app using Uvicorn, with support for custom Uvicorn config + options. Supports http header limits via h11_max_incomplete_event_size and + h11_max_header_count. 
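# [Editor's note] Illustrative sketch, not part of the patch. The two new h11 options are
# popped out of **uvicorn_kwargs by `serve_http` before the uvicorn.Config is built, so
# they can be supplied either via the auto-generated CLI flags (names assumed from the
# FrontendArgs fields added below), e.g.
#
#     vllm serve <model> --h11-max-header-count 128 --h11-max-incomplete-event-size 1048576
#
# or programmatically when embedding the API server; the concrete values here are arbitrary.
import asyncio

from fastapi import FastAPI

from vllm.entrypoints.launcher import serve_http

app = FastAPI()

async def main() -> None:
    await serve_http(
        app,
        sock=None,
        host="127.0.0.1",
        port=8000,
        h11_max_header_count=128,               # default is 256
        h11_max_incomplete_event_size=1 << 20,  # 1 MiB; default is 4 MiB
    )

# asyncio.run(main())  # uncomment to actually bind the port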
+ """ logger.info("Available routes are:") for route in app.routes: methods = getattr(route, "methods", None) @@ -36,7 +43,21 @@ async def serve_http(app: FastAPI, logger.info("Route: %s, Methods: %s", path, ', '.join(methods)) + # Extract header limit options if present + h11_max_incomplete_event_size = uvicorn_kwargs.pop( + "h11_max_incomplete_event_size", None) + h11_max_header_count = uvicorn_kwargs.pop("h11_max_header_count", None) + + # Set safe defaults if not provided + if h11_max_incomplete_event_size is None: + h11_max_incomplete_event_size = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT + if h11_max_header_count is None: + h11_max_header_count = H11_MAX_HEADER_COUNT_DEFAULT + config = uvicorn.Config(app, **uvicorn_kwargs) + # Set header limits + config.h11_max_incomplete_event_size = h11_max_incomplete_event_size + config.h11_max_header_count = h11_max_header_count config.load() server = uvicorn.Server(config) _add_shutdown_handlers(app, server) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 765327da3b30..24148bcef235 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1922,6 +1922,8 @@ async def run_server_worker(listen_address, ssl_certfile=args.ssl_certfile, ssl_ca_certs=args.ssl_ca_certs, ssl_cert_reqs=args.ssl_cert_reqs, + h11_max_incomplete_event_size=args.h11_max_incomplete_event_size, + h11_max_header_count=args.h11_max_header_count, **uvicorn_kwargs, ) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index e15f65b43082..6e4eff5c8024 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -20,6 +20,8 @@ from vllm.engine.arg_utils import AsyncEngineArgs, optional_type from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, validate_chat_template) +from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT, + H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT) from vllm.entrypoints.openai.serving_models import LoRAModulePath from vllm.entrypoints.openai.tool_parsers import ToolParserManager from vllm.logger import init_logger @@ -172,6 +174,12 @@ class FrontendArgs: enable_log_outputs: bool = False """If set to True, enable logging of model outputs (generations) in addition to the input logging that is enabled by default.""" + h11_max_incomplete_event_size: int = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT + """Maximum size (bytes) of an incomplete HTTP event (header or body) for + h11 parser. Helps mitigate header abuse. Default: 4194304 (4 MB).""" + h11_max_header_count: int = H11_MAX_HEADER_COUNT_DEFAULT + """Maximum number of HTTP headers allowed in a request for h11 parser. + Helps mitigate header abuse. 
Default: 256.""" @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: From 582c727dec168b6eef11f627634f05101e5cbf58 Mon Sep 17 00:00:00 2001 From: dongluw <108290936+dongluw@users.noreply.github.com> Date: Wed, 20 Aug 2025 14:15:18 -0400 Subject: [PATCH 191/231] Small fix for Command-A-Vision (#23268) Signed-off-by: donglu Signed-off-by: Duncan Moss --- vllm/model_executor/models/cohere2_vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index 4682a8a428a0..fca1aee835b8 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -348,7 +348,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): vllm_config=vllm_config, hf_config=config.text_config, prefix=maybe_prefix(prefix, "language_model"), - architectures=["Cohere2ForCausalLM"]) + architectures=config.text_config.architectures) @property def dtype(self): From c0eb3d7b340214afaa8bf69a8187c8cb751a6f9a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 20 Aug 2025 15:13:36 -0400 Subject: [PATCH 192/231] [Kernel/Quant] Remove the original marlin format and qqq (#23204) Signed-off-by: mgoin Signed-off-by: Duncan Moss --- .../configs/Meta-Llama-3-8B-QQQ.yaml | 12 - .../lm-eval-harness/configs/models-large.txt | 1 - CMakeLists.txt | 2 - benchmarks/kernels/benchmark_machete.py | 23 +- csrc/quantization/machete/generate.py | 145 +- csrc/quantization/marlin/dense/LICENSE | 209 --- csrc/quantization/marlin/dense/common/base.h | 32 - csrc/quantization/marlin/dense/common/mem.h | 89 -- .../marlin/dense/marlin_cuda_kernel.cu | 1073 -------------- .../marlin/qqq/marlin_qqq_gemm_kernel.cu | 1248 ----------------- csrc/torch_bindings.cpp | 17 - tests/compile/test_full_graph.py | 6 - tests/kernels/quantization/test_machete_mm.py | 34 +- .../kernels/quantization/test_marlin_gemm.py | 83 -- tests/quantization/test_configs.py | 10 - tests/quantization/test_lm_head.py | 6 +- tests/weight_loading/models.txt | 4 - vllm/_custom_ops.py | 36 - vllm/config/__init__.py | 7 +- vllm/lora/layers.py | 3 - vllm/model_executor/layers/linear.py | 1 - .../layers/quantization/__init__.py | 6 - .../layers/quantization/marlin.py | 263 ---- .../model_executor/layers/quantization/qqq.py | 275 ---- .../utils/marlin_utils_test_qqq.py | 126 -- .../layers/quantization/utils/quant_utils.py | 85 -- 26 files changed, 95 insertions(+), 3701 deletions(-) delete mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml delete mode 100644 csrc/quantization/marlin/dense/LICENSE delete mode 100644 csrc/quantization/marlin/dense/common/base.h delete mode 100644 csrc/quantization/marlin/dense/common/mem.h delete mode 100644 csrc/quantization/marlin/dense/marlin_cuda_kernel.cu delete mode 100644 csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu delete mode 100644 vllm/model_executor/layers/quantization/marlin.py delete mode 100644 vllm/model_executor/layers/quantization/qqq.py delete mode 100644 vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml deleted file mode 100644 index 56ec933c9cc0..000000000000 --- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# For vllm script, with -t option (tensor parallel size). 
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1 -model_name: "HandH1998/QQQ-Llama-3-8b-g128" -tasks: -- name: "gsm8k" - metrics: - - name: "exact_match,strict-match" - value: 0.419 - - name: "exact_match,flexible-extract" - value: 0.416 -limit: 1000 -num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-large.txt b/.buildkite/lm-eval-harness/configs/models-large.txt index 27a1a9a82bd3..37eeac85c933 100644 --- a/.buildkite/lm-eval-harness/configs/models-large.txt +++ b/.buildkite/lm-eval-harness/configs/models-large.txt @@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml Mixtral-8x7B-Instruct-v0.1.yaml Qwen2-57B-A14-Instruct.yaml DeepSeek-V2-Lite-Chat.yaml -Meta-Llama-3-8B-QQQ.yaml diff --git a/CMakeLists.txt b/CMakeLists.txt index bcbd1b52a06c..a1deefb07f09 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -357,9 +357,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC}) set(MARLIN_SRCS - "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" - "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu" "csrc/quantization/gptq_marlin/gptq_marlin.cu" "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" "csrc/quantization/gptq_marlin/awq_marlin_repack.cu") diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index 975d10f2e92e..a9c4d30d9b18 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -253,28 +253,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable: else: assert bt.a.dtype == torch.int8 assert bt.wtype == scalar_types.uint4b8 - - if bt.w_ch_s is not None: - s_ch = bt.w_ch_s.to(torch.float32) - else: - s_ch = torch.ones(bt.w_ref.shape[1], dtype=torch.float32, device=device) - - if bt.w_tok_s is not None: - s_tok = bt.w_tok_s.to(torch.float32) - else: - s_tok = torch.ones(bt.a.shape[0], dtype=torch.float32, device=device) - - fn = lambda: ops.marlin_qqq_gemm( - a=bt.a, - b_q_weight=w_q, - s_group=w_s, - s_tok=s_tok, - s_ch=s_ch, - workspace=workspace.scratch, - size_m=bt.a.shape[0], - size_n=bt.w_ref.shape[1], - size_k=bt.w_ref.shape[0], - ) + raise NotImplementedError("QQQ is not supported anymore") return fn diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index 88b3f9c734a3..0d14ba15937c 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -571,78 +571,79 @@ def get_unique_schedules(heuristic: dict[str, ScheduleConfig]): itertools.repeat(default_heuristic)) ] - # Stored as "condition": ((tile_shape_mn), (cluster_shape_mnk)) - # TODO (LucasWilkinson): Further tuning required - qqq_tile_heuristic_config = { - #### M = 257+ - # ((128, 256), (2, 1, 1)) Broken for QQQ types - # TODO (LucasWilkinson): Investigate further - # "M > 256 && K <= 16384 && N <= 4096": ((128, 128), (2, 1, 1)), - # "M > 256": ((128, 256), (2, 1, 1)), - "M > 256": ((128, 128), (2, 1, 1)), - #### M = 129-256 - "M > 128 && K <= 4096 && N <= 4096": ((128, 64), (2, 1, 1)), - "M > 128 && K <= 8192 && N <= 8192": ((128, 128), (2, 1, 1)), - # ((128, 256), (2, 1, 1)) Broken for QQQ types - # TODO (LucasWilkinson): Investigate further - # "M > 128": ((128, 256), (2, 1, 1)), - "M > 128": ((128, 128), (2, 1, 1)), - #### M = 65-128 - "M > 64 && K <= 4069 && N <= 4069": ((128, 32), (2, 1, 1)), - "M > 64 && K <= 4069 && N <= 8192": ((128, 64), (2, 1, 1)), 
- "M > 64 && K >= 8192 && N >= 12288": ((256, 128), (2, 1, 1)), - "M > 64": ((128, 128), (2, 1, 1)), - #### M = 33-64 - "M > 32 && K <= 6144 && N <= 6144": ((128, 16), (1, 1, 1)), - # Broken for QQQ types - # TODO (LucasWilkinson): Investigate further - #"M > 32 && K >= 16384 && N >= 12288": ((256, 64), (2, 1, 1)), - "M > 32": ((128, 64), (2, 1, 1)), - #### M = 17-32 - "M > 16 && K <= 12288 && N <= 8192": ((128, 32), (2, 1, 1)), - "M > 16": ((256, 32), (2, 1, 1)), - #### M = 1-16 - "N >= 26624": ((256, 16), (1, 1, 1)), - None: ((128, 16), (1, 1, 1)), - } - - # For now we use the same heuristic for all types - # Heuristic is currently tuned for H100s - qqq_heuristic = [ - (cond, ScheduleConfig(*tile_config, - **sch_common_params)) # type: ignore - for cond, tile_config in qqq_tile_heuristic_config.items() - ] - - QQQ_kernel_types = [ - *(TypeConfig( - a=DataType.s8, - b=VLLMDataType.u4b8, - b_group_scale=b_group_scale, - b_group_zeropoint=DataType.void, - b_channel_scale=DataType.f32, - a_token_scale=DataType.f32, - out=DataType.f16, - accumulator=DataType.s32, - ) for b_group_scale in (DataType.f16, DataType.void)), - *(TypeConfig( - a=DataType.e4m3, - b=VLLMDataType.u4b8, - b_group_scale=b_group_scale, - b_group_zeropoint=DataType.void, - b_channel_scale=DataType.f32, - a_token_scale=DataType.f32, - out=DataType.f16, - accumulator=DataType.f32, - ) for b_group_scale in (DataType.f16, DataType.void)), - ] - - impl_configs += [ - ImplConfig(x[0], x[1], x[2]) - for x in zip(QQQ_kernel_types, - itertools.repeat(get_unique_schedules(qqq_heuristic)), - itertools.repeat(qqq_heuristic)) - ] + # TODO: Support W4A8 when ready + # # Stored as "condition": ((tile_shape_mn), (cluster_shape_mnk)) + # # TODO (LucasWilkinson): Further tuning required + # qqq_tile_heuristic_config = { + # #### M = 257+ + # # ((128, 256), (2, 1, 1)) Broken for QQQ types + # # TODO (LucasWilkinson): Investigate further + # # "M > 256 && K <= 16384 && N <= 4096": ((128, 128), (2, 1, 1)), + # # "M > 256": ((128, 256), (2, 1, 1)), + # "M > 256": ((128, 128), (2, 1, 1)), + # #### M = 129-256 + # "M > 128 && K <= 4096 && N <= 4096": ((128, 64), (2, 1, 1)), + # "M > 128 && K <= 8192 && N <= 8192": ((128, 128), (2, 1, 1)), + # # ((128, 256), (2, 1, 1)) Broken for QQQ types + # # TODO (LucasWilkinson): Investigate further + # # "M > 128": ((128, 256), (2, 1, 1)), + # "M > 128": ((128, 128), (2, 1, 1)), + # #### M = 65-128 + # "M > 64 && K <= 4069 && N <= 4069": ((128, 32), (2, 1, 1)), + # "M > 64 && K <= 4069 && N <= 8192": ((128, 64), (2, 1, 1)), + # "M > 64 && K >= 8192 && N >= 12288": ((256, 128), (2, 1, 1)), + # "M > 64": ((128, 128), (2, 1, 1)), + # #### M = 33-64 + # "M > 32 && K <= 6144 && N <= 6144": ((128, 16), (1, 1, 1)), + # # Broken for QQQ types + # # TODO (LucasWilkinson): Investigate further + # #"M > 32 && K >= 16384 && N >= 12288": ((256, 64), (2, 1, 1)), + # "M > 32": ((128, 64), (2, 1, 1)), + # #### M = 17-32 + # "M > 16 && K <= 12288 && N <= 8192": ((128, 32), (2, 1, 1)), + # "M > 16": ((256, 32), (2, 1, 1)), + # #### M = 1-16 + # "N >= 26624": ((256, 16), (1, 1, 1)), + # None: ((128, 16), (1, 1, 1)), + # } + + # # For now we use the same heuristic for all types + # # Heuristic is currently tuned for H100s + # qqq_heuristic = [ + # (cond, ScheduleConfig(*tile_config, + # **sch_common_params)) # type: ignore + # for cond, tile_config in qqq_tile_heuristic_config.items() + # ] + + # QQQ_kernel_types = [ + # *(TypeConfig( + # a=DataType.s8, + # b=VLLMDataType.u4b8, + # b_group_scale=b_group_scale, + # 
b_group_zeropoint=DataType.void, + # b_channel_scale=DataType.f32, + # a_token_scale=DataType.f32, + # out=DataType.f16, + # accumulator=DataType.s32, + # ) for b_group_scale in (DataType.f16, DataType.void)), + # *(TypeConfig( + # a=DataType.e4m3, + # b=VLLMDataType.u4b8, + # b_group_scale=b_group_scale, + # b_group_zeropoint=DataType.void, + # b_channel_scale=DataType.f32, + # a_token_scale=DataType.f32, + # out=DataType.f16, + # accumulator=DataType.f32, + # ) for b_group_scale in (DataType.f16, DataType.void)), + # ] + + # impl_configs += [ + # ImplConfig(x[0], x[1], x[2]) + # for x in zip(QQQ_kernel_types, + # itertools.repeat(get_unique_schedules(qqq_heuristic)), + # itertools.repeat(qqq_heuristic)) + # ] output_dir = os.path.join(SCRIPT_DIR, "generated") diff --git a/csrc/quantization/marlin/dense/LICENSE b/csrc/quantization/marlin/dense/LICENSE deleted file mode 100644 index 1d1e4cf9c823..000000000000 --- a/csrc/quantization/marlin/dense/LICENSE +++ /dev/null @@ -1,209 +0,0 @@ -Contains code from https://github.com/IST-DASLab/marlin - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "{}" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright {yyyy} {name of copyright owner} - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - ------------------------------------------------------------------------------------- - -This product bundles various third-party components under other open source licenses. -This section summarizes those components and their licenses. See licenses/ -for text of these licenses. diff --git a/csrc/quantization/marlin/dense/common/base.h b/csrc/quantization/marlin/dense/common/base.h deleted file mode 100644 index 68c83d5478cf..000000000000 --- a/csrc/quantization/marlin/dense/common/base.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Modified by HandH1998 - * Modified by Neural Magic - * Copyright (C) Marlin.2024 Elias Frantar - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; } - -// Instances of `Vec` are used to organize groups of >>registers<<, as needed -// for instance as inputs to tensor core operations. Consequently, all -// corresponding index accesses must be compile-time constants, which is why we -// extensively use `#pragma unroll` throughout the kernel code to guarantee -// this. -template -struct Vec { - T elems[n]; - __device__ T& operator[](int i) { return elems[i]; } -}; diff --git a/csrc/quantization/marlin/dense/common/mem.h b/csrc/quantization/marlin/dense/common/mem.h deleted file mode 100644 index 64f9c393d77c..000000000000 --- a/csrc/quantization/marlin/dense/common/mem.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Modified by HandH1998 - * Modified by Neural Magic - * Copyright (C) Marlin.2024 Elias Frantar - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -// Predicated asynchronous global->shared copy; used for inputs A where we apply -// predication to handle batchsizes that are not multiples of 16. -__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, - bool pred = true) { - const int BYTES = 16; - uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile( - "{\n" - " .reg .pred p;\n" - " setp.ne.b32 p, %0, 0;\n" - " @p cp.async.cg.shared.global [%1], [%2], %3;\n" - "}\n" ::"r"((int)pred), - "r"(smem), "l"(glob_ptr), "n"(BYTES)); -} - -// Asynchronous global->shared copy -__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) { - const int BYTES = 16; - uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile( - "{\n" - " cp.async.cg.shared.global [%0], [%1], %2;\n" - "}\n" ::"r"(smem), - "l"(glob_ptr), "n"(BYTES)); -} - -// Async copy fence. -__device__ inline void cp_async_fence() { - asm volatile("cp.async.commit_group;\n" ::); -} - -// Wait until at most `n` async copy stages are still pending. -template -__device__ inline void cp_async_wait() { - asm volatile("cp.async.wait_group %0;\n" ::"n"(n)); -} - -// Wait until barrier reaches `count`, then lock for current threadblock. -__device__ inline void barrier_acquire(int* lock, int count) { - if (threadIdx.x == 0) { - int state = -1; - do - // Guarantee that subsequent writes by this threadblock will be visible - // globally. - asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" - : "=r"(state) - : "l"(lock)); - while (state != count); - } - __syncthreads(); -} - -// Release barrier and increment visitation count. 
-__device__ inline void barrier_release(int* lock, bool reset = false) { - __syncthreads(); - if (threadIdx.x == 0) { - if (reset) { - lock[0] = 0; - return; - } - int val = 1; - // Make sure that all writes since acquiring this barrier are visible - // globally, while releasing the barrier. - asm volatile("fence.acq_rel.gpu;\n"); - asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n" - : - : "l"(lock), "r"(val)); - } -} diff --git a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu deleted file mode 100644 index ea96326ed7e6..000000000000 --- a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu +++ /dev/null @@ -1,1073 +0,0 @@ -/* - * Modified by Neural Magic - * Copyright (C) Marlin.2024 Elias Frantar - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include - -#include - -#include "common/base.h" -#include "core/registration.h" - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 - #include "common/mem.h" -#endif - -template -inline std::string str(T x) { - return std::to_string(x); -} - -namespace marlin_dense { - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 - -using I4 = Vec; -// Matrix fragments for tensor core instructions; their precise layout is -// documented here: -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type -using FragA = Vec; -using FragB = Vec; -using FragC = Vec; -using FragS = Vec; // quantization scales - -// m16n8k16 tensor core mma instruction with fp16 inputs and fp32 -// output/accumulation. -__device__ inline void mma(const FragA& a_frag, const FragB& frag_b, - FragC& frag_c) { - const uint32_t* a = reinterpret_cast(&a_frag); - const uint32_t* b = reinterpret_cast(&frag_b); - float* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), - "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); -} - -// Instruction for loading a full 16x16 matrix fragment of operand A from shared -// memory, directly in tensor core layout. -__device__ inline void ldsm4(FragA& frag_a, const void* smem_ptr) { - uint32_t* a = reinterpret_cast(&frag_a); - uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n" - : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3]) - : "r"(smem)); -} - -// Lookup-table based 3-input logical operation; explicitly used for -// dequantization as the compiler does not seem to automatically recognize it in -// all cases. 
-template -__device__ inline int lop3(int a, int b, int c) { - int res; - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" - : "=r"(res) - : "r"(a), "r"(b), "r"(c), "n"(lut)); - return res; -} - -// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16 -// values. We mostly follow the strategy in the link below, with some small -// changes: -// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h -__device__ inline FragB dequant(int q) { - const int LO = 0x000f000f; - const int HI = 0x00f000f0; - const int EX = 0x64006400; - // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); - // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point - // directly into `SUB` and `ADD`. - const int SUB = 0x64086408; - const int MUL = 0x2c002c00; - const int ADD = 0xd480d480; - FragB frag_b; - frag_b[0] = __hsub2(*reinterpret_cast(&lo), - *reinterpret_cast(&SUB)); - frag_b[1] = __hfma2(*reinterpret_cast(&hi), - *reinterpret_cast(&MUL), - *reinterpret_cast(&ADD)); - return frag_b; -} - -// Multiply dequantized values by the corresponding quantization scale; used -// only for grouped quantization. -__device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) { - half2 s = __half2half2(reinterpret_cast<__half*>(&frag_s)[i]); - frag_b[0] = __hmul2(frag_b[0], s); - frag_b[1] = __hmul2(frag_b[1], s); -} - -template shared - // fetch pipeline - const int group_blocks = -1 // number of consecutive 16x16 blocks - // with a separate quantization scale - > -__global__ void Marlin( - const int4* __restrict__ A, // fp16 input matrix of shape mxk - const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn - int4* __restrict__ C, // fp16 output buffer of shape mxn - const int4* __restrict__ s, // fp16 quantization scales of shape - // (k/groupsize)xn - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage for barrier synchronization -) { - // Each threadblock processes one "stripe" of the B matrix with (roughly) the - // same size, which might involve multiple column "slices" (of width 16 * - // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM - // example: - // 0 1 3 - // 0 2 3 - // 1 2 4 - // While this kind of partitioning makes things somewhat more complicated, it - // ensures good utilization of all SMs for many kinds of shape and GPU - // configurations, while requiring as few slow global cross-threadblock - // reductions as possible. - - // For larger GEMMs we run multiple batchsize 64 versions in parallel for a - // better partitioning with less reductions - int parallel = 1; - if (prob_m > 16 * thread_m_blocks) { - parallel = prob_m / (16 * thread_m_blocks); - prob_m = 16 * thread_m_blocks; - } - - int k_tiles = prob_k / 16 / thread_k_blocks; - int n_tiles = prob_n / 16 / thread_n_blocks; - int iters = ceildiv(k_tiles * n_tiles * parallel, gridDim.x); - // Ensure that the number of tiles in each stripe is a multiple of the - // groupsize; this avoids an annoying special case where a stripe starts in - // the middle of group. 
- if (group_blocks != -1) - iters = (group_blocks / thread_k_blocks) * - ceildiv(iters, (group_blocks / thread_k_blocks)); - - int slice_row = (iters * blockIdx.x) % k_tiles; - int slice_col_par = (iters * blockIdx.x) / k_tiles; - int slice_col = slice_col_par; - int slice_iters; // number of threadblock tiles in the current slice - int slice_count = - 0; // total number of active threadblocks in the current slice - int slice_idx; // index of threadblock in current slice; numbered bottom to - // top - - // We can easily implement parallel problem execution by just remapping - // indices and advancing global pointers - if (slice_col_par >= n_tiles) { - A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8; - C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8; - locks += (slice_col_par / n_tiles) * n_tiles; - slice_col = slice_col_par % n_tiles; - } - - // Compute all information about the current slice which is required for - // synchronization. - auto init_slice = [&]() { - slice_iters = - iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); - if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0; - if (slice_iters == 0) return; - if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row; - slice_count = 1; - slice_idx = 0; - int col_first = iters * ceildiv(k_tiles * slice_col_par, iters); - if (col_first <= k_tiles * (slice_col_par + 1)) { - int col_off = col_first - k_tiles * slice_col_par; - slice_count = ceildiv(k_tiles - col_off, iters); - if (col_off > 0) slice_count++; - int delta_first = iters * blockIdx.x - col_first; - if (delta_first < 0 || (col_off == 0 && delta_first == 0)) - slice_idx = slice_count - 1; - else { - slice_idx = slice_count - 1 - delta_first / iters; - if (col_off > 0) slice_idx--; - } - } - if (slice_col == n_tiles) { - A += 16 * thread_m_blocks * prob_k / 8; - C += 16 * thread_m_blocks * prob_n / 8; - locks += n_tiles; - slice_col = 0; - } - }; - init_slice(); - - int a_gl_stride = prob_k / 8; // stride of the A matrix in global memory - // We typically use `constexpr` to indicate that this value is a compile-time - // constant - constexpr int a_sh_stride = - 16 * thread_k_blocks / 8; // stride of an A matrix tile in shared memory - constexpr int a_gl_rd_delta_o = - 16 * thread_k_blocks / - 8; // delta between subsequent A tiles in global memory - int a_gl_rd_delta_i = - a_gl_stride * - (threads / a_gl_rd_delta_o); // between subsequent accesses within a tile - constexpr int a_sh_wr_delta = - a_sh_stride * - (threads / a_gl_rd_delta_o); // between shared memory writes - constexpr int a_sh_rd_delta_o = - 2 * ((threads / 32) / - (thread_n_blocks / 4)); // between shared memory tile reads - constexpr int a_sh_rd_delta_i = - a_sh_stride * 16; // within a shared memory tile - constexpr int a_sh_stage = - a_sh_stride * (16 * thread_m_blocks); // overall size of a tile - constexpr int a_sh_wr_iters = - ceildiv(a_sh_stage, - a_sh_wr_delta); // number of shared write iterations for a tile - - int b_gl_stride = 16 * prob_n / 32; - constexpr int b_sh_stride = 32 * thread_n_blocks / 4; - int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks; - int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride); - constexpr int b_sh_wr_delta = threads; - constexpr int b_sh_rd_delta = threads; - constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; - constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; - - int s_gl_stride = prob_n / 8; - constexpr int s_sh_stride = 16 * thread_n_blocks / 8; - 
constexpr int s_sh_stage = s_sh_stride; - int s_gl_rd_delta = s_gl_stride; - - // Global A read index of current thread. - int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - a_gl_rd += a_gl_rd_delta_o * slice_row; - // Shared write index of current thread. - int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - // Shared read index. - int a_sh_rd = - a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16; - a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4)); - - int b_gl_rd = - b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride); - b_gl_rd += b_sh_stride * slice_col; - b_gl_rd += b_gl_rd_delta_o * slice_row; - auto b_sh_wr = threadIdx.x; - auto b_sh_rd = threadIdx.x; - - int s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + - s_sh_stride * slice_col + threadIdx.x; - auto s_sh_wr = threadIdx.x; - int s_sh_rd; - // We use a different scale layout for grouped and column-wise quantization as - // we scale a `half2` tile in column-major layout in the former and in - // row-major in the latter case. - if (group_blocks != -1) - s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - (threadIdx.x % 32) / 4; - else - s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - (threadIdx.x % 32) % 4; - - // Precompute which thread should not read memory in which iterations; this is - // needed if there are more threads than required for a certain tilesize or - // when the batchsize is not a multiple of 16. - bool a_sh_wr_pred[a_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) - a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m; - bool s_sh_wr_pred = threadIdx.x < s_sh_stride; - - // To ensure that writing and reading A tiles to/from shared memory, the - // latter in fragment format, is fully bank conflict free, we need to use a - // rather fancy XOR-based layout. The key here is that neither reads nor - // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the - // same shared memory banks. Further, it seems (based on NSight-Compute) that - // each warp must also write a consecutive memory segment? - auto transform_a = [&](int i) { - int row = i / a_gl_rd_delta_o; - return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row; - }; - // Since the computation of this remapping is non-trivial and, due to our main - // loop unrolls, all shared memory accesses are static, we simply precompute - // both transformed reads and writes. - int a_sh_wr_trans[a_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) - a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); - int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) { - #pragma unroll - for (int j = 0; j < thread_m_blocks; j++) - a_sh_rd_trans[i][j] = - transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); - } - - // Since B-accesses have non-constant stride they have to be computed at - // runtime; we break dependencies between subsequent accesses with a tile by - // maintining multiple pointers (we have enough registers), a tiny - // optimization. - const int4* B_ptr[b_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) - B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; - - extern __shared__ int4 sh[]; - // Shared memory storage for global fetch pipelines. 
- int4* sh_a = sh; - int4* sh_b = sh_a + (stages * a_sh_stage); - int4* sh_s = sh_b + (stages * b_sh_stage); - // Register storage for double buffer of shared memory reads. - FragA frag_a[2][thread_m_blocks]; - I4 frag_b_quant[2]; - FragC frag_c[thread_m_blocks][4][2]; - FragS frag_s[2][4]; - - // Zero accumulators. - auto zero_accums = [&]() { - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) - reinterpret_cast(frag_c)[i] = 0; - }; - - // Asynchronously fetch the next A, B and s tile from global to the next - // shared memory pipeline location. - auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { - if (pred) { - int4* sh_a_stage = sh_a + a_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) { - cp_async4_pred( - &sh_a_stage[a_sh_wr_trans[i]], - &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off], - a_sh_wr_pred[i]); - } - int4* sh_b_stage = sh_b + b_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) { - cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]); - B_ptr[i] += b_gl_rd_delta_o; - } - // Only fetch scales if this tile starts a new group - if constexpr (group_blocks != -1) { - // This assumes group_blocks >= thread_k_blocks - // and would need to be modified to support smaller groups. - static_assert(group_blocks >= thread_k_blocks); - if (pipe % (group_blocks / thread_k_blocks) == 0) { - int4* sh_s_stage = sh_s + s_sh_stage * pipe; - if (s_sh_wr_pred) cp_async4(&sh_s_stage[s_sh_wr], &s[s_gl_rd]); - s_gl_rd += s_gl_rd_delta; - } - } - } - // Insert a fence even when we are winding down the pipeline to ensure that - // waiting is also correct at this point. - cp_async_fence(); - }; - - // Wait until the next thread tile has been loaded to shared memory. - auto wait_for_stage = [&]() { - // We only have `stages - 2` active fetches since we are double buffering - // and can only issue the next fetch when it is guaranteed that the previous - // shared memory load is fully complete (as it may otherwise be - // overwritten). - cp_async_wait(); - __syncthreads(); - }; - - // Load the next sub-tile from the current location in the shared memory pipe - // into the current register buffer. - auto fetch_to_registers = [&](int k, int pipe) { - // It may seem inefficient that we reload the groups for every sub-tile; - // however, this does not seem to be a significant bottleneck, while some - // theoretically better attempts have lead to bad instruction ordering by - // the compiler and correspondingly a noticeable drop in performance. - if constexpr (group_blocks != -1) { - // This assumes group_blocks >= thread_k_blocks - // and would need to be modified to support smaller groups. - static_assert(group_blocks >= thread_k_blocks); - int4* sh_s_stage = - sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * - (pipe / (group_blocks / thread_k_blocks))); - reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; - } - int4* sh_a_stage = sh_a + a_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) - ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); - int4* sh_b_stage = sh_b + b_sh_stage * pipe; - frag_b_quant[k % 2] = *reinterpret_cast( - &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]); - }; - - // Execute the actual tensor core matmul of a sub-tile. - auto matmul = [&](int k) { - // We have the m dimension as the inner loop in order to encourage overlapping - // dequantization and matmul operations. 
- #pragma unroll - for (int j = 0; j < 4; j++) { - int b_quant = frag_b_quant[k % 2][j]; - int b_quant_shift = b_quant >> 8; - FragB frag_b0 = dequant(b_quant); - // If there are no groups, we can just scale the final output once and can - // avoid doing so for each weight. - if (group_blocks != -1) scale(frag_b0, frag_s[k % 2][j], 0); - FragB frag_b1 = dequant(b_quant_shift); - if (group_blocks != -1) scale(frag_b1, frag_s[k % 2][j], 1); - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) { - mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]); - mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]); - } - } - }; - - // Since we slice across the k dimension of a tile in order to increase the - // number of warps while keeping the n dimension of a tile reasonable, we have - // multiple warps that accumulate their partial sums of the same output - // location; which we have to reduce over in the end. We do in shared memory. - auto thread_block_reduce = [&]() { - constexpr int red_off = threads / b_sh_stride / 2; - if (red_off >= 1) { - auto red_idx = threadIdx.x / b_sh_stride; - constexpr int red_sh_stride = b_sh_stride * 4 * 2; - constexpr int red_sh_delta = b_sh_stride; - int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) + - (threadIdx.x % b_sh_stride); - - // Parallel logarithmic shared memory reduction. We make sure to avoid any - // unnecessary read or write iterations, e.g., for two warps we write only - // once by warp 1 and read only once by warp 0. - - #pragma unroll - for (int m_block = 0; m_block < thread_m_blocks; m_block++) { - #pragma unroll - for (int i = red_off; i > 0; i /= 2) { - if (i <= red_idx && red_idx < 2 * i) { - #pragma unroll - for (int j = 0; j < 4 * 2; j++) { - int red_sh_wr = - red_sh_delta * j + (red_sh_rd - red_sh_stride * i); - if (i < red_off) { - float* c_rd = - reinterpret_cast(&sh[red_sh_delta * j + red_sh_rd]); - float* c_wr = reinterpret_cast(&sh[red_sh_wr]); - #pragma unroll - for (int k = 0; k < 4; k++) - reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += - c_rd[k] + c_wr[k]; - } - sh[red_sh_wr] = - reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; - } - } - __syncthreads(); - } - if (red_idx == 0) { - #pragma unroll - for (int i = 0; i < 4 * 2; i++) { - float* c_rd = - reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); - #pragma unroll - for (int j = 0; j < 4; j++) - reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += - c_rd[j]; - } - } - __syncthreads(); - } - } - }; - - // Since multiple threadblocks may process parts of the same column slice, we - // finally have to globally reduce over the results. As the striped - // partitioning minimizes the number of such reductions and our outputs are - // usually rather small, we perform this reduction serially in L2 cache. - auto global_reduce = [&](bool first = false, bool last = false) { - // We are very careful here to reduce directly in the output buffer to - // maximize L2 cache utilization in this step. To do this, we write out - // results in FP16 (but still reduce with FP32 compute). 
- constexpr int active_threads = 32 * thread_n_blocks / 4; - if (threadIdx.x < active_threads) { - int c_gl_stride = prob_n / 8; - int c_gl_wr_delta_o = 8 * c_gl_stride; - int c_gl_wr_delta_i = 4 * (active_threads / 32); - int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + - 4 * (threadIdx.x / 32) + threadIdx.x % 4; - c_gl_wr += (2 * thread_n_blocks) * slice_col; - constexpr int c_sh_wr_delta = active_threads; - auto c_sh_wr = threadIdx.x; - - int row = (threadIdx.x % 32) / 4; - - if (!first) { - // Interestingly, doing direct global accesses here really seems to mess up - // the compiler and lead to slowdowns, hence we also use async-copies even - // though these fetches are not actually asynchronous. - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4; i++) { - cp_async4_pred( - &sh[c_sh_wr + c_sh_wr_delta * i], - &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + - c_gl_wr_delta_i * (i % 2)], - i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m); - } - cp_async_fence(); - cp_async_wait<0>(); - } - - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4; i++) { - if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) { - if (!first) { - int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta]; - #pragma unroll - for (int j = 0; j < 2 * 4; j++) { - reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] += - __half2float(reinterpret_cast<__half*>(&c_red)[j]); - } - } - if (!last) { - int4 c; - #pragma unroll - for (int j = 0; j < 2 * 4; j++) { - reinterpret_cast<__half*>(&c)[j] = - __float2half(reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]); - } - C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] = - c; - } - } - } - } - }; - - // Write out the reduce final result in the correct layout. We only actually - // reshuffle matrix fragments in this step, the reduction above is performed - // in fragment layout. 
- auto write_result = [&]() { - int c_gl_stride = prob_n / 8; - constexpr int c_sh_stride = 2 * thread_n_blocks + 1; - int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks)); - constexpr int c_sh_rd_delta = - c_sh_stride * (threads / (2 * thread_n_blocks)); - - int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + - (threadIdx.x % (2 * thread_n_blocks)); - c_gl_wr += (2 * thread_n_blocks) * slice_col; - int c_sh_wr = - (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4; - c_sh_wr += 32 * (threadIdx.x / 32); - int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + - (threadIdx.x % (2 * thread_n_blocks)); - - int c_gl_wr_end = c_gl_stride * prob_m; - - // We first reorder in shared memory to guarantee the most efficient final - // global write patterns - auto write = [&](int idx, float c0, float c1, FragS& s) { - half2 res = __halves2half2(__float2half(c0), __float2half(c1)); - if (group_blocks == - -1) // for per-column quantization we finally apply the scale here - res = __hmul2(res, s[0]); - ((half2*)sh)[idx] = res; - }; - if (threadIdx.x / 32 < thread_n_blocks / 4) { - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) { - #pragma unroll - for (int j = 0; j < 4; j++) { - int wr = c_sh_wr + 8 * j; - write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0], - frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]); - write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2], - frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]); - write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0], - frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]); - write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2], - frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]); - } - c_sh_wr += 16 * (4 * c_sh_stride); - } - } - __syncthreads(); - - #pragma unroll - for (int i = 0; - i < ceildiv(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); - i++) { - if (c_gl_wr < c_gl_wr_end) { - C[c_gl_wr] = sh[c_sh_rd]; - c_gl_wr += c_gl_wr_delta; - c_sh_rd += c_sh_rd_delta; - } - } - }; - - // Start global fetch and register load pipelines. - auto start_pipes = [&]() { - #pragma unroll - for (int i = 0; i < stages - 1; i++) fetch_to_shared(i, i, i < slice_iters); - zero_accums(); - wait_for_stage(); - fetch_to_registers(0, 0); - a_gl_rd += a_gl_rd_delta_o * (stages - 1); - }; - start_pipes(); - - // Main loop. - while (slice_iters) { - // We unroll over both the global fetch and the register load pipeline to - // ensure all shared memory accesses are static. Note that both pipelines have - // even length meaning that the next iteration will always start at index 0. - #pragma unroll - for (int pipe = 0; pipe < stages;) { - #pragma unroll - for (int k = 0; k < b_sh_wr_iters; k++) { - fetch_to_registers(k + 1, pipe % stages); - if (k == b_sh_wr_iters - 2) { - fetch_to_shared((pipe + stages - 1) % stages, pipe, - slice_iters >= stages); - pipe++; - wait_for_stage(); - } - matmul(k); - } - slice_iters--; - if (slice_iters == 0) break; - } - a_gl_rd += a_gl_rd_delta_o * stages; - - // Process results and, if necessary, proceed to the next column slice. - // While this pattern may not be the most readable, other ways of writing - // the loop seemed to noticeably worse performance after compilation. 
- if (slice_iters == 0) { - cp_async_wait<0>(); - bool last = slice_idx == slice_count - 1; - // For per-column scales, we only fetch them here in the final step before - // write-out - if (group_blocks == -1 && last) { - if (s_sh_wr_pred) cp_async4(&sh_s[s_sh_wr], &s[s_gl_rd]); - cp_async_fence(); - } - thread_block_reduce(); - if (group_blocks == -1 && last) { - cp_async_wait<0>(); - __syncthreads(); - if (threadIdx.x / 32 < thread_n_blocks / 4) { - reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; - reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; - } - } - if (slice_count > 1) { // only globally reduce if there is more than one - // block in a slice - barrier_acquire(&locks[slice_col], slice_idx); - global_reduce(slice_idx == 0, last); - barrier_release(&locks[slice_col], last); - } - if (last) // only the last block in a slice actually writes the result - write_result(); - slice_row = 0; - slice_col_par++; - slice_col++; - init_slice(); - if (slice_iters) { - a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) - B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; - if (slice_col == 0) { - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; - } - s_gl_rd = s_sh_stride * slice_col + threadIdx.x; - start_pipes(); - } - } - } -} - -#else - -template shared - // fetch pipeline - const int group_blocks = -1 // number of consecutive 16x16 blocks - // with a separate quantization scale - > -__global__ void Marlin( - const int4* __restrict__ A, // fp16 input matrix of shape mxk - const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn - int4* __restrict__ C, // fp16 output buffer of shape mxn - const int4* __restrict__ s, // fp16 quantization scales of shape - // (k/groupsize)xn - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage for barrier synchronization -) { - // Marlin is not implemented yet for SM < 8.0 - assert(false); - return; -} - -#endif - -// 8 warps are a good choice since every SM has 4 schedulers and having more -// than 1 warp per schedule allows some more latency hiding. At the same time, -// we want relatively few warps to have many registers per warp and small tiles. 
-const int USER_THREADS = - 256; // Note: This is only used with user-provided thread_k/n -const int STAGES = 4; // 4 pipeline stages fit into shared memory -const int SHARED_MEM = - 96 * 1024; // max shared memory on compute capability 8.6 (< 8.0) - -static constexpr int min_thread_n = 64; -static constexpr int min_thread_k = 64; - -static constexpr int tile_size = 16; -static constexpr int max_par = 16; - -static constexpr int pack_factor_4bit = - 8; // We have 8 4-bit vals inside a 32 bit - -#define __CALL_IF(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ - GROUP_BLOCKS, NUM_THREADS) \ - else if (thread_m_blocks == THREAD_M_BLOCKS && \ - thread_n_blocks == THREAD_N_BLOCKS && \ - thread_k_blocks == THREAD_K_BLOCKS && \ - group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) { \ - cudaFuncSetAttribute(Marlin, \ - cudaFuncAttributeMaxDynamicSharedMemorySize, \ - SHARED_MEM); \ - Marlin<<>>( \ - A_ptr, B_ptr, C_ptr, s_ptr, prob_m, prob_n, prob_k, locks); \ - } - -typedef struct { - int thread_k; - int thread_n; - int num_threads; -} thread_config_t; - -thread_config_t small_batch_thread_configs[] = { - // Ordered by priority - - // thread_k, thread_n, num_threads - {128, 128, 256}, // Default - {128, 64, 128}, // Reduce N 2X, same K - {64, 256, 256}, // Reduce K 2X, increase N 2X - {64, 128, 128}, // Reduce K 2X, same N -}; - -thread_config_t large_batch_thread_configs[] = { - // Ordered by priority - - // thread_k, thread_n, num_threads - {64, 256, 256}, // Default - {128, 128, 256}, // Reduce N 2X, increase K 2X - {64, 128, 128}, // Reduce N 2X, same K - {128, 64, 128}, // Reduce N 4X, increase K 2X -}; - -bool is_valid_config(thread_config_t const& th_config, int prob_m, int prob_n, - int prob_k) { - // Sanity - if (th_config.thread_k == -1 || th_config.thread_n == -1 || - th_config.num_threads == -1) { - return false; - } - - // Verify K/N are divisible by thread K/N - if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) { - return false; - } - - // thread_k can be only 128 or 64 (because it must be less than groupsize - // which is 128) - if (th_config.thread_k != 128 && th_config.thread_k != 64) { - return false; - } - - // Verify min for thread K/N - if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) { - return false; - } - - // num_threads must be at least 128 (= 4 warps) - if (th_config.num_threads < 128) { - return false; - } - - return true; -} - -thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) { - if (prob_m <= 16) { - for (auto th_config : small_batch_thread_configs) { - if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { - return th_config; - } - } - - } else { - for (auto th_config : large_batch_thread_configs) { - if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { - return th_config; - } - } - } - - return thread_config_t{-1, -1, -1}; -} - -#define CALL_IF(N_BLOCKS, K_BLOCKS, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(2, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(2, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(3, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(3, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(4, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(4, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) - -void marlin_cuda(const void* A, const void* B, void* C, void* 
s, int prob_m, - int prob_n, int prob_k, void* workspace, int groupsize = -1, - int dev = 0, cudaStream_t stream = 0, int thread_k = -1, - int thread_n = -1, int sms = -1, int max_par = 16) { - int tot_m = prob_m; - int tot_m_blocks = ceildiv(tot_m, 16); - int pad = 16 * tot_m_blocks - tot_m; - - if (sms == -1) - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev); - - // Set thread config - thread_config_t th_config; - if (thread_k != -1 && thread_n != -1) { - // User-defined config - th_config = thread_config_t{thread_k, thread_n, USER_THREADS}; - } else { - // Auto config - th_config = determine_thread_config(prob_m, prob_n, prob_k); - } - - if (!is_valid_config(th_config, prob_m, prob_n, prob_k)) { - throw std::runtime_error( - "Invalid thread config: thread_k = " + str(th_config.thread_k) + - ", thread_n = " + str(th_config.thread_n) + - ", num_threads = " + str(th_config.num_threads) + " for MKN = [" + - str(prob_m) + ", " + str(prob_k) + ", " + str(prob_n) + "]"); - } - - // Uncomment for debug - // std::cout << "Using thread_config: thread_k = " + str(th_config.thread_k) + - // ", thread_n = " + str(th_config.thread_n) + - // ", num_threads = " + str(th_config.num_threads) + " for - // MKN = [" + str(prob_m) + - // ", " + str(prob_k) + ", " + str(prob_n) + "]\n"; - - int num_threads = th_config.num_threads; - thread_k = th_config.thread_k; - thread_n = th_config.thread_n; - - int thread_k_blocks = thread_k / 16; - int thread_n_blocks = thread_n / 16; - int group_blocks = (groupsize == -1) ? -1 : groupsize / 16; - int blocks = sms; - - if (prob_m == 0 || prob_n == 0 || prob_k == 0) { - return; - } - - TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, - " is not divisible by thread_n = ", thread_n); - TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, - " is not divisible by thread_k = ", thread_k); - if (group_blocks != -1) { - TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, - " is not divisible by group_blocks = ", group_blocks); - } - - const int4* A_ptr = (const int4*)A; - const int4* B_ptr = (const int4*)B; - int4* C_ptr = (int4*)C; - const int4* s_ptr = (const int4*)s; - - int* locks = (int*)workspace; - - for (int i = 0; i < tot_m_blocks; i += 4) { - int thread_m_blocks = tot_m_blocks - i; - prob_m = tot_m - 16 * i; - int par = 1; - if (thread_m_blocks > 4) { - // Note that parallel > 1 currently only works for inputs without any - // padding - par = (16 * thread_m_blocks - pad) / 64; - if (par > max_par) par = max_par; - prob_m = 64 * par; - i += 4 * (par - 1); - thread_m_blocks = 4; - } - - // For compilation speed, we only define the kernel configurations that have - // seemed useful (in terms of performance) in our testing, however many more - // are, in principle, possible. 
- if (false) { - } - CALL_IF(8, 8, 256) - CALL_IF(16, 4, 256) - CALL_IF(8, 4, 128) - CALL_IF(4, 8, 128) - else { - throw std::runtime_error("Unsupported shapes: MKN = [" + str(prob_m) + - ", " + str(prob_k) + ", " + str(prob_n) + "]" + - ", groupsize = " + str(groupsize) + - ", thread_m_blocks = " + str(thread_m_blocks) + - ", thread_n_blocks = " + str(thread_n_blocks) + - ", thread_k_blocks = " + str(thread_k_blocks)); - } - - A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par; - C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par; - } -} - -} // namespace marlin_dense - -torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, - torch::Tensor& b_scales, torch::Tensor& workspace, - int64_t size_m, int64_t size_n, int64_t size_k) { - // Verify M - TORCH_CHECK(size_m == a.size(0), - "Shape mismatch: a.size(0) = " + str(a.size(0)) + - ", size_m = " + str(size_m)); - - // Verify K - TORCH_CHECK(size_k == a.size(1), - "Shape mismatch: a.size(1) = " + str(a.size(1)) + - ", size_k = " + str(size_k)); - TORCH_CHECK(size_k % marlin_dense::tile_size == 0, - "size_k = " + str(size_k) + " is not divisible by tile_size = " + - str(marlin_dense::tile_size)); - TORCH_CHECK((size_k / marlin_dense::tile_size) == b_q_weight.size(0), - "Shape mismatch: b_q_weight.size(0) = " + - str(b_q_weight.size(0)) + ", size_k = " + str(size_k) + - ", tile_size = " + str(marlin_dense::tile_size)); - - // Verify N - TORCH_CHECK(b_scales.size(1) == size_n, - "b_scales.size(1) = " + str(b_scales.size(1)) + - ", size_n = " + str(size_n)); - TORCH_CHECK( - b_q_weight.size(1) % marlin_dense::tile_size == 0, - "b_q_weight.size(1) = " + str(b_q_weight.size(1)) + - " is not divisible by tile_size = " + str(marlin_dense::tile_size)); - - int actual_size_n = (b_q_weight.size(1) / marlin_dense::tile_size) * - marlin_dense::pack_factor_4bit; - TORCH_CHECK( - size_n == actual_size_n, - "size_n = " + str(size_n) + ", actual_size_n = " + str(actual_size_n)); - - // Verify A device and strides - TORCH_CHECK(a.device().is_cuda(), "A is not on GPU"); - TORCH_CHECK(a.is_contiguous(), "A is not contiguous"); - - // Verify B device and strides - TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU"); - TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous"); - - // Verify scales device and strides - TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU"); - TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous"); - - // Alloc C matrix - const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); - auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device()); - torch::Tensor c = torch::empty({size_m, size_n}, options); - - // thread_k: `k` size of a thread_tile in `weights` (can usually be left as - // auto -1) - int thread_k = -1; - // thread_n: `n` size of a thread_tile in `weights` (can usually be left as - // auto -1) - int thread_n = -1; - // sms: number of SMs to use for the kernel (can usually be left as auto -1) - int sms = -1; - - // Detect groupsize - if (b_scales.size(0) != 1) { - TORCH_CHECK(size_k % b_scales.size(0) == 0, - "size_k = " + str(size_k) + - ", is not divisible by b_scales.size(0) = " + - str(b_scales.size(0))); - } - int groupsize = b_scales.size(0) == 1 ? 
-1 : size_k / b_scales.size(0); - - // Verify groupsize - TORCH_CHECK(groupsize == -1 || groupsize == 128, - "Unexpected groupsize = " + str(groupsize)); - - // Verify workspace size - TORCH_CHECK(size_n % marlin_dense::min_thread_n == 0, - "size_n = " + str(size_n) + - ", is not divisible by min_thread_n = " + - str(marlin_dense::min_thread_n)); - int min_workspace_size = - (size_n / marlin_dense::min_thread_n) * marlin_dense::max_par; - TORCH_CHECK(workspace.numel() >= min_workspace_size, - "workspace.numel = " + str(workspace.numel()) + - " is below min_workspace_size = " + str(min_workspace_size)); - - int dev = a.get_device(); - marlin_dense::marlin_cuda(a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), - b_scales.data_ptr(), size_m, size_n, size_k, - workspace.data_ptr(), groupsize, dev, - at::cuda::getCurrentCUDAStream(dev), thread_k, - thread_n, sms, marlin_dense::max_par); - - return c; -} - -TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { - m.impl("marlin_gemm", &marlin_gemm); -} diff --git a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu deleted file mode 100644 index c96d68d9b29a..000000000000 --- a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu +++ /dev/null @@ -1,1248 +0,0 @@ -/* - * Adapted from - * https://github.com/IST-DASLab/marlin/blob/master/marlin/marlin_cuda_kernel.cu - * https://github.com/IST-DASLab/marlin/blob/master/marlin/marlin_cuda.cpp - * Modified by HandH1998 - * Copyright (C) 2024 HandH1998 - * Copyright (C) Marlin.2024 Elias Frantar - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include -#include - -#include - -#include "../dense/common/base.h" -#include "core/registration.h" - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 - #include "../dense/common/mem.h" -#endif - -template -inline std::string str(T x) { - return std::to_string(x); -} - -namespace { - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 - -using I4 = Vec; -// Matrix fragments for tensor core instructions; their precise layout is -// documented here: -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-integer-type -using FragA = Vec; -using FragB = Vec; -using FragC = Vec; -using FragS_GROUP = Vec; // weight per-group quantization scales -using FragS_CHANNEL = - Vec; // weight per-channel quantization scales or activaton - // per-token quantization scales - -// NOTE(HandH1998): cp.async.cg only support BYTES = 16, however, -// cp.async.ca can support BYTES = 4, 8, 16; -// as s_tok's shape is equal to prob_m, we need set s_tok to float type, -// and cp_size = 1 float, i.e., 4 BYTES -// Asynchronous global->shared copy for activation quantizaton scales s_tok -__device__ inline void cp_async1(void* smem_ptr, const void* glob_ptr) { - const int BYTES = 4; - uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile( - "{\n" - " cp.async.ca.shared.global [%0], [%1], %2;\n" - "}\n" ::"r"(smem), - "l"(glob_ptr), "n"(BYTES)); -} - -// m16n8k16 tensor core mma instruction with int8 inputs and int32 -// output/accumulation. -__device__ inline void mma(const FragA& a_frag, const FragB& frag_b, - FragC& frag_c) { - const uint32_t* a = reinterpret_cast(&a_frag); - const uint32_t* b = reinterpret_cast(&frag_b); - int* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.satfinite.s32.s8.s8.s32 " - "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" - : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3]) - : "r"(a[0]), "r"(a[1]), "r"(b[0]), "r"(c[0]), "r"(c[1]), "r"(c[2]), - "r"(c[3])); -} - -// Instruction for loading a full 16x16 matrix fragment of operand A from shared -// memory, directly in int8 tensor core layout. -__device__ inline void ldsm4(FragA& frag_a, const void* smem_ptr) { - uint32_t* a = reinterpret_cast(&frag_a); - uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0,%1}, [%2];\n" - : "=r"(a[0]), "=r"(a[1]) - : "r"(smem)); -} - -inline __device__ half2 float2_to_half2(float2 f) { - uint32_t res; - // NOTE(HandH1998): h0,h1 should be uint16_t, not half - uint16_t h0, h1; - asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(h0) : "f"(f.x)); - asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(h1) : "f"(f.y)); - asm volatile("mov.b32 %0, {%1, %2};\n" : "=r"(res) : "h"(h0), "h"(h1)); - return reinterpret_cast(res); -} - -inline __device__ float int32_to_float(int h) { - float res; - asm volatile("cvt.rn.f32.s32 %0, %1;\n" : "=f"(res) : "r"(h)); - return res; -} - -// Lookup-table based 3-input logical operation; explicitly used for -// dequantization as the compiler does not seem to automatically recognize it in -// all cases. -template -__device__ inline int lop3(int a, int b, int c) { - int res; - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" - : "=r"(res) - : "r"(a), "r"(b), "r"(c), "n"(lut)); - return res; -} - -// Efficiently dequantize an int32 value into a full B-fragment of 4 int8 values -// for weight per channel dequant. 
-__device__ inline FragB dequant_per_channel(int q) { - static constexpr int MASK = 0xf0f0f0f0; - FragB frag_b; - frag_b[0] = (q & MASK); - return frag_b; -} - -// Efficiently dequantize an int32 value into a full B-fragment of 4 int8 values -// for weight per group dequant. -__device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) { - static constexpr uint32_t LO = 0x000f000f; - static constexpr uint32_t HI = 0x00f000f0; - static constexpr uint32_t EX = 0x64006400; - // Guarantee that the `(a & b) | c` operations are LOP3s. - uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); - // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point - // directly into `SUB` and `ADD`. - static constexpr uint32_t SUB = 0x64086408; - static constexpr uint32_t MUL = 0x2c002c00; - static constexpr uint32_t ADD = 0xd480d480; - *reinterpret_cast(&t0) = __hsub2( - *reinterpret_cast(&t0), *reinterpret_cast(&SUB)); - *reinterpret_cast(&t1) = __hfma2( - *reinterpret_cast(&t1), *reinterpret_cast(&MUL), - *reinterpret_cast(&ADD)); - - uint16_t s = reinterpret_cast(&frag_s)[i]; - uint32_t double_s; - // pack 2xfp16 to half2 - asm volatile("mov.b32 %0, {%1, %2};\n" : "=r"(double_s) : "h"(s), "h"(s)); - // dequant and convert 4 half to 4 uint8 (be placed at the low 8 bits of 4 - // half, respectively) - static constexpr uint32_t MAGIC_NUM = 0x64806480; - *reinterpret_cast(&t0) = __hfma2( - *reinterpret_cast(&t0), *reinterpret_cast(&double_s), - *reinterpret_cast(&MAGIC_NUM)); - *reinterpret_cast(&t1) = __hfma2( - *reinterpret_cast(&t1), *reinterpret_cast(&double_s), - *reinterpret_cast(&MAGIC_NUM)); - // take out the 4 uint8 from 4 half, then convert them to 4 int8 and pack 4 - // int8 into 1 uint32 - FragB frag_b; - uint32_t uint8s; - static constexpr uint32_t MASK_0246 = 0x6420; - static constexpr uint32_t UINT8s_TO_INT8s_MASK = 0x80808080; - asm volatile("prmt.b32 %0,%1,%2,%3;\n" - : "=r"(uint8s) - : "r"(t0), "r"(t1), "n"(MASK_0246)); - frag_b[0] = (uint8s ^ UINT8s_TO_INT8s_MASK); - return frag_b; -} - -template shared - // fetch pipeline - const int group_blocks = -1 // number of consecutive 16x16 blocks - // with a separate quantization scale - > -__global__ void Marlin( - const int4* __restrict__ A, // int8 input matrix of shape mxk - const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn - int4* __restrict__ C, // int32 global_reduce buffer of shape - // (max_par*16*4)xn, as int8 tensor core's output is - // int32 dtype - int4* __restrict__ D, // fp16 output buffer of shape mxn - const float* __restrict__ s_tok, // fp32 activation per-token quantization - // scales of shape mx1 - const int4* __restrict__ s_ch, // fp32 weight per-channel quantization - // scales of shape 1xn - const int4* __restrict__ s_group, // fp16 weight per-group quantization - // scales of shape (k/groupsize)xn, when - // group_blocks=-1, it should be nullptr - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage for barrier synchronization -) { - // Each threadblock processes one "stripe" of the B matrix with (roughly) the - // same size, which might involve multiple column "slices" (of width 16 * - // `thread_n_blocks`). 
Stripes are defined as shown in the 3x3 matrix 5 SM - // example: - // 0 1 3 - // 0 2 3 - // 1 2 4 - // While this kind of partitioning makes things somewhat more complicated, it - // ensures good utilization of all SMs for many kinds of shape and GPU - // configurations, while requiring as few slow global cross-threadblock - // reductions as possible. - - // For larger GEMMs we run multiple batchsize 64 versions in parallel for a - // better partitioning with less reductions - int parallel = 1; - if (prob_m > 16 * thread_m_blocks) { - parallel = prob_m / (16 * thread_m_blocks); - prob_m = 16 * thread_m_blocks; - } - - int k_tiles = prob_k / 16 / thread_k_blocks; - int n_tiles = prob_n / 16 / thread_n_blocks; - int iters = ceildiv(k_tiles * n_tiles * parallel, gridDim.x); - // Ensure that the number of tiles in each stripe is a multiple of the - // groupsize; this avoids an annoying special case where a stripe starts in - // the middle of group. - if constexpr (group_blocks != -1) - iters = (group_blocks / thread_k_blocks) * - ceildiv(iters, (group_blocks / thread_k_blocks)); - - int slice_row = (iters * blockIdx.x) % k_tiles; - int slice_col_par = (iters * blockIdx.x) / k_tiles; - int slice_col = slice_col_par; - int slice_iters; // number of threadblock tiles in the current slice - int slice_count = - 0; // total number of active threadblocks in the current slice - int slice_idx; // index of threadblock in current slice; numbered bottom to - // top - - // We can easily implement parallel problem execution by just remapping - // indices and advancing global pointers - if (slice_col_par >= n_tiles) { - A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 16; - C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 4; - D += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8; - s_tok += (slice_col_par / n_tiles) * 16 * thread_m_blocks; - locks += (slice_col_par / n_tiles) * n_tiles; - slice_col = slice_col_par % n_tiles; - } - - // Compute all information about the current slice which is required for - // synchronization. 
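Condensed to a host-side helper (a sketch that ignores the group-size rounding of `iters` applied above), the bookkeeping that `init_slice` maintains starts from a k-major ordering of the tile grid: each block's first tile index splits into a row offset inside a column slice and the slice number itself.

struct StripeStart {
  int slice_row;      // first tile's row offset inside its column slice
  int slice_col_par;  // column slice index across the `parallel` copies
};

inline int ceildiv_ref(int a, int b) { return (a + b - 1) / b; }

// Mirrors the slice_row / slice_col_par arithmetic above for one block.
inline StripeStart stripe_start(int block_idx, int grid_dim, int k_tiles,
                                int n_tiles, int parallel) {
  int iters = ceildiv_ref(k_tiles * n_tiles * parallel, grid_dim);
  int first_tile = iters * block_idx;  // k-major tile ordering
  return {first_tile % k_tiles, first_tile / k_tiles};
}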
- auto init_slice = [&]() { - slice_iters = - iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); - if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0; - if (slice_iters == 0) return; - if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row; - slice_count = 1; - slice_idx = 0; - int col_first = iters * ceildiv(k_tiles * slice_col_par, iters); - if (col_first <= k_tiles * (slice_col_par + 1)) { - int col_off = col_first - k_tiles * slice_col_par; - slice_count = ceildiv(k_tiles - col_off, iters); - if (col_off > 0) slice_count++; - int delta_first = iters * blockIdx.x - col_first; - if (delta_first < 0 || (col_off == 0 && delta_first == 0)) - slice_idx = slice_count - 1; - else { - slice_idx = slice_count - 1 - delta_first / iters; - if (col_off > 0) slice_idx--; - } - } - if (slice_col == n_tiles) { - A += 16 * thread_m_blocks * prob_k / 16; - C += 16 * thread_m_blocks * prob_n / 4; - D += 16 * thread_m_blocks * prob_n / 8; - s_tok += 16 * thread_m_blocks; - locks += n_tiles; - slice_col = 0; - } - }; - init_slice(); - - int a_gl_stride = prob_k / 16; // stride of the A matrix in global memory - // We typically use `constexpr` to indicate that this value is a compile-time - // constant - constexpr int a_sh_stride = - 16 * thread_k_blocks / 16; // stride of an A matrix tile in shared memory - constexpr int a_gl_rd_delta_o = - 16 * thread_k_blocks / - 16; // delta between subsequent A tiles in global memory - int a_gl_rd_delta_i = - a_gl_stride * - (threads / a_gl_rd_delta_o); // between subsequent accesses within a tile - constexpr int a_sh_wr_delta = - a_sh_stride * - (threads / a_gl_rd_delta_o); // between shared memory writes - constexpr int a_sh_rd_delta_o = - 1 * ((threads / 32) / - (thread_n_blocks / 4)); // between shared memory tile reads - constexpr int a_sh_rd_delta_i = - a_sh_stride * 16; // within a shared memory tile - constexpr int a_sh_stage = - a_sh_stride * (16 * thread_m_blocks); // overall size of a tile - constexpr int a_sh_wr_iters = - ceildiv(a_sh_stage, - a_sh_wr_delta); // number of shared write iterations for a tile - - int b_gl_stride = 16 * prob_n / 32; - constexpr int b_sh_stride = 32 * thread_n_blocks / 4; - int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks; - int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride); - constexpr int b_sh_wr_delta = threads; - constexpr int b_sh_rd_delta = threads; - constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; - constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; - - constexpr int s_tok_sh_stride = 16 * thread_m_blocks; - - constexpr int s_ch_sh_stride = 16 * thread_n_blocks / 4; - - int s_group_gl_stride = prob_n / 8; - constexpr int s_group_sh_stride = 16 * thread_n_blocks / 8; - constexpr int s_group_sh_stage = s_group_sh_stride; - int s_group_gl_rd_delta = s_group_gl_stride; - - // Global A read index of current thread. - int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - a_gl_rd += a_gl_rd_delta_o * slice_row; - // Shared write index of current thread. - int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - // Shared read index. 
- // NOTE(HandH1998): int8 input a only need 16 threads to load 16x16 matrix - int a_sh_rd = a_sh_stride * ((threadIdx.x % 32) % 16); - a_sh_rd += 1 * ((threadIdx.x / 32) / (thread_n_blocks / 4)); - - int b_gl_rd = - b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride); - b_gl_rd += b_sh_stride * slice_col; - b_gl_rd += b_gl_rd_delta_o * slice_row; - auto b_sh_wr = threadIdx.x; - auto b_sh_rd = threadIdx.x; - - auto s_tok_gl_rd = threadIdx.x; - // NOTE(HandH1998): activation scale s_tok need shuffle to [0, 8, 1, 9, 2, 10, - // 3, 11, 4, 12, 5, 13, 6, 14, 7, 15] for example, 0, 8 row scales serve for - // thread 0, 1, 2, 3. For more details, refer to mma operand A layout as - // s_tok's size is not fixed, we can not shuffle before inference we shuffle - // it when fetching s_tok from global memory to shared memory, that's why - // s_tok_sh_wr is like this - int s_tok_sh_wr = - (threadIdx.x / 16) * 16 + (threadIdx.x % 8) * 2 + (threadIdx.x % 16) / 8; - int s_tok_sh_rd = (threadIdx.x % 32) / 4; - bool s_tok_sh_wr_pred = threadIdx.x < prob_m; - - auto s_ch_gl_rd = s_ch_sh_stride * slice_col + threadIdx.x; - auto s_ch_sh_wr = threadIdx.x; - int s_ch_sh_rd = 16 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - 2 * ((threadIdx.x % 32) % 4); - bool s_ch_sh_wr_pred = threadIdx.x < s_ch_sh_stride; - - int s_group_gl_rd, s_group_sh_wr, s_group_sh_rd; - bool s_group_sh_wr_pred; - if constexpr (group_blocks != -1) { - s_group_gl_rd = - s_group_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + - s_group_sh_stride * slice_col + threadIdx.x; - s_group_sh_wr = threadIdx.x; - // NOTE(HandH1998): s_group_sh_rd is related to mma output C - s_group_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - (threadIdx.x % 32) / 4; - s_group_sh_wr_pred = threadIdx.x < s_group_sh_stride; - } - - // Precompute which thread should not read memory in which iterations; this is - // needed if there are more threads than required for a certain tilesize or - // when the batchsize is not a multiple of 16. - bool a_sh_wr_pred[a_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) - a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m; - - // To ensure that writing and reading A tiles to/from shared memory, the - // latter in fragment format, is fully bank conflict free, we need to use a - // rather fancy XOR-based layout. The key here is that neither reads nor - // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the - // same shared memory banks. Further, it seems (based on NSight-Compute) that - // each warp must also write a consecutive memory segment? - auto transform_a = [&](int i) { - int row = i / a_gl_rd_delta_o; - return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row; - }; - // Since the computation of this remapping is non-trivial and, due to our main - // loop unrolls, all shared memory accesses are static, we simply precompute - // both transformed reads and writes. 
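As a rough standalone model of that remapping (hypothetical `swizzle_a_ref`, simplified so the XOR stays inside the column field; `width` plays the role of `a_gl_rd_delta_o` and is assumed to be a power of two), each row of the tile permutes its columns by XOR-ing with the row index, which is the property the comment above relies on to keep the 16-byte accesses of 8 consecutive threads on distinct banks.

// Simplified stand-in for `transform_a`, kept in range by reducing the row
// index modulo the row width before XOR-ing.
inline int swizzle_a_ref(int linear_idx, int width) {
  int row = linear_idx / width;
  int col = linear_idx % width;
  return row * width + (col ^ (row % width));  // per-row column permutation
}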
- int a_sh_wr_trans[a_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) - a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); - int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) { - #pragma unroll - for (int j = 0; j < thread_m_blocks; j++) - a_sh_rd_trans[i][j] = - transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); - } - - // Since B-accesses have non-constant stride they have to be computed at - // runtime; we break dependencies between subsequent accesses with a tile by - // maintining multiple pointers (we have enough registers), a tiny - // optimization. - const int4* B_ptr[b_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) - B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; - - extern __shared__ int4 sh[]; - // Shared memory storage for global fetch pipelines. - // NOTE(HandH1998): stages need >= 4, otherwise, sh_s_tok = sh + max(stages * - // a_sh_stage + stages * b_sh_stage, 4 * stages * a_sh_stage) - int4* sh_a = sh; - int4* sh_b = sh_a + (stages * a_sh_stage); - int4* sh_s_tok = sh_b + (stages * b_sh_stage); - int4* sh_s_ch = sh_s_tok + s_tok_sh_stride; - int4* sh_s_group = sh_s_ch + s_ch_sh_stride; - - // Register storage for double buffer of shared memory reads. - FragA frag_a[2][thread_m_blocks]; - I4 frag_b_quant[2]; - FragC frag_c[thread_m_blocks][4][2]; - FragS_GROUP frag_s_group[2][4]; - FragS_CHANNEL frag_s_tok[thread_m_blocks]; - FragS_CHANNEL frag_s_ch[2][4]; - - // Zero accumulators. - auto zero_accums = [&]() { - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) - reinterpret_cast(frag_c)[i] = 0; - }; - - // Asynchronously fetch the next A, B and s tile from global to the next - // shared memory pipeline location. - auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { - if (pred) { - int4* sh_a_stage = sh_a + a_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) { - cp_async4_pred( - &sh_a_stage[a_sh_wr_trans[i]], - &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off], - a_sh_wr_pred[i]); - } - int4* sh_b_stage = sh_b + b_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) { - cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]); - B_ptr[i] += b_gl_rd_delta_o; - } - // Only fetch scales if this tile starts a new group - if constexpr (group_blocks != -1) { - if (pipe % (group_blocks / thread_k_blocks) == 0) { - int4* sh_s_group_stage = sh_s_group + s_group_sh_stage * pipe; - if (s_group_sh_wr_pred) - cp_async4(&sh_s_group_stage[s_group_sh_wr], - &s_group[s_group_gl_rd]); - s_group_gl_rd += s_group_gl_rd_delta; - } - } - } - // Insert a fence even when we are winding down the pipeline to ensure that - // waiting is also correct at this point. - cp_async_fence(); - }; - - // Wait until the next thread tile has been loaded to shared memory. - auto wait_for_stage = [&]() { - // We only have `stages - 2` active fetches since we are double buffering - // and can only issue the next fetch when it is guaranteed that the previous - // shared memory load is fully complete (as it may otherwise be - // overwritten). - cp_async_wait(); - __syncthreads(); - }; - - // Load the next sub-tile from the current location in the shared memory pipe - // into the current register buffer. 
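The lambdas around here form a two-level pipeline: a `stages`-deep global-to-shared queue and a two-entry register double buffer. A toy schedule printer (plain C++, indices only, no real memory traffic; an illustration, not the kernel's code) shows the interleaving the main loop below relies on: the registers for sub-tile k+1 are filled while sub-tile k is multiplied, and the next shared stage is requested one sub-tile before the current one runs out.

#include <cstdio>

int main() {
  constexpr int stages = 4;     // global->shared queue depth (STAGES below)
  constexpr int sub_tiles = 2;  // stands in for b_sh_wr_iters
  int pipe = 0;
  for (int iter = 0; iter < 3; ++iter) {
    for (int k = 0; k < sub_tiles; ++k) {
      printf("load regs[%d] from shared stage %d\n", (k + 1) % 2, pipe % stages);
      if (k == sub_tiles - 2) {
        printf("issue async copy into shared stage %d\n",
               (pipe + stages - 1) % stages);
        ++pipe;
      }
      printf("mma on regs[%d]\n", k % 2);
    }
  }
  return 0;
}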
- auto fetch_to_registers = [&](int k, int pipe) { - // It may seem inefficient that we reload the groups for every sub-tile; - // however, this does not seem to be a significant bottleneck, while some - // theoretically better attempts have lead to bad instruction ordering by - // the compiler and correspondingly a noticeable drop in performance. - if constexpr (group_blocks != -1) { - int4* sh_s_group_stage = - sh_s_group + - s_group_sh_stage * ((group_blocks / thread_k_blocks) * - (pipe / (group_blocks / thread_k_blocks))); - reinterpret_cast(&frag_s_group[k % 2])[0] = - sh_s_group_stage[s_group_sh_rd]; - } - int4* sh_a_stage = sh_a + a_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) - ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); - int4* sh_b_stage = sh_b + b_sh_stage * pipe; - frag_b_quant[k % 2] = *reinterpret_cast( - &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]); - }; - - // Execute the actual tensor core matmul of a sub-tile. - auto matmul = [&](int k) { - // We have the m dimension as the inner loop in order to encourage overlapping - // dequantization and matmul operations. - #pragma unroll - for (int j = 0; j < 4; j++) { - int b_quant = frag_b_quant[k % 2][j]; - // int b_quant_shift = b_quant << 4; - FragB frag_b0, frag_b1; - // If there are no groups, we can just scale the final output once and can - // avoid doing so for each weight. - if constexpr (group_blocks != -1) { - int b_quant_shift = b_quant >> 8; - frag_b0 = dequant_per_group(b_quant, frag_s_group[k % 2][j], 0); - frag_b1 = dequant_per_group(b_quant_shift, frag_s_group[k % 2][j], 1); - } else { - int b_quant_shift = b_quant << 4; - frag_b0 = dequant_per_channel(b_quant); - frag_b1 = dequant_per_channel(b_quant_shift); - } - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) { - mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]); - mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]); - } - } - }; - - // Since we slice across the k dimension of a tile in order to increase the - // number of warps while keeping the n dimension of a tile reasonable, we have - // multiple warps that accumulate their partial sums of the same output - // location; which we have to reduce over in the end. We do in shared memory. - auto thread_block_reduce = [&]() { - constexpr int red_off = threads / b_sh_stride / 2; - if (red_off >= 1) { - auto red_idx = threadIdx.x / b_sh_stride; - constexpr int red_sh_stride = b_sh_stride * 4 * 2; - constexpr int red_sh_delta = b_sh_stride; - int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) + - (threadIdx.x % b_sh_stride); - - // Parallel logarithmic shared memory reduction. We make sure to avoid any - // unnecessary read or write iterations, e.g., for two warps we write only - // once by warp 1 and read only once by warp 0. 
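Before the fragment indexing below, the shape of the reduction itself is the familiar halving tree. A scalar sketch, with one integer standing in for a warp's partial accumulators: on each round the upper half of the remaining warps publishes into shared memory and the lower half accumulates, so eight warps finish in three rounds.

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> partials = {1, 2, 3, 4, 5, 6, 7, 8};  // one partial per warp
  for (size_t off = partials.size() / 2; off > 0; off /= 2)
    for (size_t i = 0; i < off; ++i)
      partials[i] += partials[i + off];  // lower half absorbs the upper half
  printf("reduced value = %d\n", partials[0]);  // prints 36
  return 0;
}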
- - #pragma unroll - for (int m_block = 0; m_block < thread_m_blocks; m_block++) { - #pragma unroll - for (int i = red_off; i > 0; i /= 2) { - if (i <= red_idx && red_idx < 2 * i) { - #pragma unroll - for (int j = 0; j < 4 * 2; j++) { - int red_sh_wr = - red_sh_delta * j + (red_sh_rd - red_sh_stride * i); - if (i < red_off) { - int* c_rd = - reinterpret_cast(&sh[red_sh_delta * j + red_sh_rd]); - int* c_wr = reinterpret_cast(&sh[red_sh_wr]); - #pragma unroll - for (int k = 0; k < 4; k++) - reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += - c_rd[k] + c_wr[k]; - } - sh[red_sh_wr] = - reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; - } - } - __syncthreads(); - } - if (red_idx == 0) { - #pragma unroll - for (int i = 0; i < 4 * 2; i++) { - int* c_rd = - reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); - #pragma unroll - for (int j = 0; j < 4; j++) - reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += - c_rd[j]; - } - } - __syncthreads(); - } - } - }; - - // Since multiple threadblocks may process parts of the same column slice, we - // finally have to globally reduce over the results. As the striped - // partitioning minimizes the number of such reductions and our outputs are - // usually rather small, we perform this reduction serially in L2 cache. - // global_reduce works on INT32 elements, which are the results of INT8 GEMM. - // This is why we need another INT32 maxtrix `C` to reduce instead of the - // original half matrix `D`. - auto global_reduce = [&](bool first = false, bool last = false) { - // We are very careful here to reduce directly in the output buffer to - // maximize L2 cache utilization in this step. To do this, we write out - // results in FP16 (but still reduce with FP32 compute). - constexpr int active_threads = 32 * thread_n_blocks / 4; - if (threadIdx.x < active_threads) { - int c_gl_stride = prob_n / 4; - int c_gl_wr_delta_o = 8 * c_gl_stride; - int c_gl_wr_delta_i = 8 * (active_threads / 32); - int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + - 8 * (threadIdx.x / 32) + (threadIdx.x % 4) * 2; - c_gl_wr += (4 * thread_n_blocks) * slice_col; - constexpr int c_sh_wr_delta = active_threads * 2; - auto c_sh_wr = 2 * threadIdx.x; - - int row = (threadIdx.x % 32) / 4; - - if (!first) { - // Interestingly, doing direct global accesses here really seems to mess up - // the compiler and lead to slowdowns, hence we also use async-copies even - // though these fetches are not actually asynchronous. 
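Ignoring the async-copy staging and fragment layout, each block's contribution to `global_reduce` comes down to the following per-element step (a simplified sketch; the last block keeps the running sum in registers for `write_result` instead of writing it back).

// Per-element view of the pass below: the blocks of one slice run in
// sequence, each adding what its predecessors left in the int32 scratch `c`
// and, unless it is the last one, leaving the running sum for the next block.
inline void serial_l2_reduce_step(int* acc, int* c, int n, bool first,
                                  bool last) {
  for (int j = 0; j < n; ++j) {
    if (!first) acc[j] += c[j];
    if (!last) c[j] = acc[j];
  }
}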
- #pragma unroll - for (int i = 0; i < thread_m_blocks * 4; i++) { - cp_async4_pred( - &sh[c_sh_wr + c_sh_wr_delta * i], - &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + - c_gl_wr_delta_i * (i % 2)], - i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m); - cp_async4_pred( - &sh[c_sh_wr + c_sh_wr_delta * i + 1], - &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + - c_gl_wr_delta_i * (i % 2) + 1], - i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m); - } - cp_async_fence(); - cp_async_wait<0>(); - } - - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4; i++) { - if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) { - if (!first) { - int4 d_red1 = sh[c_sh_wr + i * c_sh_wr_delta]; - int4 d_red2 = sh[c_sh_wr + i * c_sh_wr_delta + 1]; - #pragma unroll - for (int j = 0; j < 4; j++) { - reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] += - reinterpret_cast(&d_red1)[j]; - } - #pragma unroll - for (int j = 0; j < 4; j++) { - reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * (j + 4) + (i % 4)] += - reinterpret_cast(&d_red2)[j]; - } - } - if (!last) { - int4 d1, d2; - #pragma unroll - for (int j = 0; j < 4; j++) { - reinterpret_cast(&d1)[j] = reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]; - } - #pragma unroll - for (int j = 0; j < 4; j++) { - reinterpret_cast(&d2)[j] = reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * (j + 4) + (i % 4)]; - } - C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] = - d1; - C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2) + - 1] = d2; - } - } - } - } - }; - - // Write out the reduce final result in the correct layout. We only actually - // reshuffle matrix fragments in this step, the reduction above is performed - // in fragment layout. 
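Numerically, the `write` helper inside `write_result` is a single multiply-multiply per element; a float stand-in (skipping the half2 packing the kernel does two elements at a time) looks like this.

// int32 accumulator -> output element, scaled by the weight's per-channel
// scale and the token's per-row activation scale.
inline float dequant_output_ref(int acc, float channel_scale, float token_scale) {
  return static_cast<float>(acc) * channel_scale * token_scale;
}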
- auto write_result = [&]() { - int d_gl_stride = prob_n / 8; - constexpr int d_sh_stride = 2 * thread_n_blocks + 1; - int d_gl_wr_delta = d_gl_stride * (threads / (2 * thread_n_blocks)); - constexpr int d_sh_rd_delta = - d_sh_stride * (threads / (2 * thread_n_blocks)); - - int d_gl_wr = d_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + - (threadIdx.x % (2 * thread_n_blocks)); - d_gl_wr += (2 * thread_n_blocks) * slice_col; - int d_sh_wr = - (4 * d_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4; - d_sh_wr += 32 * (threadIdx.x / 32); - int d_sh_rd = d_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + - (threadIdx.x % (2 * thread_n_blocks)); - - int d_gl_wr_end = d_gl_stride * prob_m; - - // We first reorder in shared memory to guarantee the most efficient final - // global write patterns - auto write = [&](int idx, int c0, int c1, float a_s, FragS_CHANNEL& w_s) { - float2 deq_res; - deq_res.x = int32_to_float(c0) * w_s[0] * a_s; - deq_res.y = int32_to_float(c1) * w_s[1] * a_s; - ((half2*)sh)[idx] = float2_to_half2(deq_res); - }; - - if (threadIdx.x / 32 < thread_n_blocks / 4) { - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) { - #pragma unroll - for (int j = 0; j < 4; j++) { - int wr = d_sh_wr + 8 * j; - write(wr + (4 * d_sh_stride) * 0 + 0, frag_c[i][j][0][0], - frag_c[i][j][0][1], frag_s_tok[i][0], - frag_s_ch[j / 2][2 * (j % 2) + 0]); - write(wr + (4 * d_sh_stride) * 8 + 0, frag_c[i][j][0][2], - frag_c[i][j][0][3], frag_s_tok[i][1], - frag_s_ch[j / 2][2 * (j % 2) + 0]); - write(wr + (4 * d_sh_stride) * 0 + 4, frag_c[i][j][1][0], - frag_c[i][j][1][1], frag_s_tok[i][0], - frag_s_ch[j / 2][2 * (j % 2) + 1]); - write(wr + (4 * d_sh_stride) * 8 + 4, frag_c[i][j][1][2], - frag_c[i][j][1][3], frag_s_tok[i][1], - frag_s_ch[j / 2][2 * (j % 2) + 1]); - } - d_sh_wr += 16 * (4 * d_sh_stride); - } - } - __syncthreads(); - - #pragma unroll - for (int i = 0; - i < ceildiv(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); - i++) { - if (d_gl_wr < d_gl_wr_end) { - D[d_gl_wr] = sh[d_sh_rd]; - d_gl_wr += d_gl_wr_delta; - d_sh_rd += d_sh_rd_delta; - } - } - }; - - // Start global fetch and register load pipelines. - auto start_pipes = [&]() { - #pragma unroll - for (int i = 0; i < stages - 1; i++) fetch_to_shared(i, i, i < slice_iters); - zero_accums(); - wait_for_stage(); - fetch_to_registers(0, 0); - a_gl_rd += a_gl_rd_delta_o * (stages - 1); - }; - start_pipes(); - - // Main loop. - while (slice_iters) { - // We unroll over both the global fetch and the register load pipeline to - // ensure all shared memory accesses are static. Note that both pipelines have - // even length meaning that the next iteration will always start at index 0. - #pragma unroll - for (int pipe = 0; pipe < stages;) { - #pragma unroll - for (int k = 0; k < b_sh_wr_iters; k++) { - fetch_to_registers(k + 1, pipe % stages); - if (k == b_sh_wr_iters - 2) { - fetch_to_shared((pipe + stages - 1) % stages, pipe, - slice_iters >= stages); - pipe++; - wait_for_stage(); - } - matmul(k); - } - slice_iters--; - if (slice_iters == 0) break; - } - a_gl_rd += a_gl_rd_delta_o * stages; - - // Process results and, if necessary, proceed to the next column slice. - // While this pattern may not be the most readable, other ways of writing - // the loop seemed to noticeably worse performance after compilation. 
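The `barrier_acquire` / `barrier_release` pair used a few lines below serializes the blocks that share a column slice so their partial results land in L2 one after another. Conceptually it is a ticket counter; the sketch here uses std::atomic and is an assumed equivalent of the helpers in the shared mem.h header, not their actual PTX-based code.

#include <atomic>

// Block `idx` of a slice waits for its ticket; the last block resets the
// counter so the lock slot can be reused without re-zeroing the workspace.
inline void ticket_acquire(std::atomic<int>& lock, int idx) {
  while (lock.load(std::memory_order_acquire) != idx) {
    // spin
  }
}

inline void ticket_release(std::atomic<int>& lock, bool last) {
  if (last)
    lock.store(0, std::memory_order_release);
  else
    lock.fetch_add(1, std::memory_order_acq_rel);
}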
- if (slice_iters == 0) { - cp_async_wait<0>(); - bool last = slice_idx == slice_count - 1; - // For per-column scales, we only fetch them here in the final step before - // write-out - if (last) { - if (s_tok_sh_wr_pred) { - cp_async1(&sh_s_tok[s_tok_sh_wr], &s_tok[s_tok_gl_rd]); - } - if (s_ch_sh_wr_pred) { - cp_async4(&sh_s_ch[s_ch_sh_wr], &s_ch[s_ch_gl_rd]); - } - cp_async_fence(); - } - thread_block_reduce(); - if (last) { - cp_async_wait<0>(); - __syncthreads(); - if (threadIdx.x / 32 < thread_n_blocks / 4) { - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) { - frag_s_tok[i][0] = - *reinterpret_cast(&sh_s_tok[16 * i + 2 * s_tok_sh_rd]); - frag_s_tok[i][1] = *reinterpret_cast( - &sh_s_tok[16 * i + 2 * s_tok_sh_rd + 1]); - } - reinterpret_cast(&frag_s_ch)[0] = sh_s_ch[s_ch_sh_rd + 0]; - reinterpret_cast(&frag_s_ch)[1] = sh_s_ch[s_ch_sh_rd + 1]; - reinterpret_cast(&frag_s_ch)[2] = sh_s_ch[s_ch_sh_rd + 8]; - reinterpret_cast(&frag_s_ch)[3] = sh_s_ch[s_ch_sh_rd + 9]; - } - } - if (slice_count > 1) { // only globally reduce if there is more than one - // block in a slice - barrier_acquire(&locks[slice_col], slice_idx); - global_reduce(slice_idx == 0, last); - barrier_release(&locks[slice_col], last); - } - if (last) // only the last block in a slice actually writes the result - write_result(); - slice_row = 0; - slice_col_par++; - slice_col++; - init_slice(); - if (slice_iters) { - a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) - B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; - if (slice_col == 0) { - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; - } - s_group_gl_rd = s_group_sh_stride * slice_col + threadIdx.x; - s_ch_gl_rd = s_ch_sh_stride * slice_col + threadIdx.x; - start_pipes(); - } - } - } -} - -#else - -template shared - // fetch pipeline - const int group_blocks = -1 // number of consecutive 16x16 blocks - // with a separate quantization scale - > -__global__ void Marlin( - const int4* __restrict__ A, // int8 input matrix of shape mxk - const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn - int4* __restrict__ C, // int32 global_reduce buffer of shape - // (max_par*16*4)xn, as int8 tensor core's output is - // int32 dtype - int4* __restrict__ D, // fp16 output buffer of shape mxn - const float* __restrict__ s_tok, // fp32 activation per-token quantization - // scales of shape mx1 - const int4* __restrict__ s_ch, // fp32 weight per-channel quantization - // scales of shape 1xn - const int4* __restrict__ s_group, // fp16 weight per-group quantization - // scales of shape (k/groupsize)xn, when - // group_blocks=-1, it should be nullptr - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage for barrier synchronization -) { - // Marlin is not implemented yet for SM < 8.0 - assert(false); - return; -} - -#endif - -// 8 warps are a good choice since every SM has 4 schedulers and having more -// than 1 warp per schedule allows some more latency hiding. At the same time, -// we want relatively few warps to have many registers per warp and small tiles. 
-const int USER_THREADS = - 256; // Note: This is only used with user-provided thread_k/n -const int STAGES = 4; // 4 pipeline stages fit into shared memory - -static constexpr int min_thread_n = 64; -static constexpr int min_thread_k = 64; - -static constexpr int tile_size = 16; -static constexpr int max_par = 16; - -static constexpr int pack_factor_4bit = - 8; // We have 8 4-bit vals inside a 32 bit - -#define __CALL_IF(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ - GROUP_BLOCKS, NUM_THREADS) \ - else if (thread_m_blocks == THREAD_M_BLOCKS && \ - thread_n_blocks == THREAD_N_BLOCKS && \ - thread_k_blocks == THREAD_K_BLOCKS && \ - group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) { \ - cudaFuncSetAttribute(Marlin, \ - cudaFuncAttributeMaxDynamicSharedMemorySize, \ - max_shared_mem); \ - Marlin \ - <<>>( \ - A_ptr, B_ptr, C_ptr, D_ptr, s_tok_ptr, s_ch_ptr, s_group_ptr, \ - prob_m, prob_n, prob_k, locks); \ - } - -typedef struct { - int thread_k; - int thread_n; - int num_threads; -} thread_config_t; - -thread_config_t small_batch_thread_configs[] = { - // Ordered by priority - - // thread_k, thread_n, num_threads - {128, 128, 256}, // Default - {128, 64, 128}, // Reduce N 2X, same K - {64, 256, 256}, // Reduce K 2X, increase N 2X - {64, 128, 128}, // Reduce K 2X, same N -}; - -thread_config_t large_batch_thread_configs[] = { - // Ordered by priority - - // thread_k, thread_n, num_threads - {64, 256, 256}, // Default - {128, 128, 256}, // Reduce N 2X, increase K 2X - {64, 128, 128}, // Reduce N 2X, same K - {128, 64, 128}, // Reduce N 4X, increase K 2X -}; - -bool is_valid_config(thread_config_t const& th_config, int prob_m, int prob_n, - int prob_k) { - // Sanity - if (th_config.thread_k == -1 || th_config.thread_n == -1 || - th_config.num_threads == -1) { - return false; - } - - // Verify K/N are divisible by thread K/N - if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) { - return false; - } - - // thread_k can be only 128 or 64 (because it must be less than groupsize - // which is 128) - if (th_config.thread_k != 128 && th_config.thread_k != 64) { - return false; - } - - // Verify min for thread K/N - if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) { - return false; - } - - // num_threads must be at least 128 (= 4 warps) - if (th_config.num_threads < 128) { - return false; - } - - return true; -} - -thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) { - if (prob_m <= 16) { - for (auto th_config : small_batch_thread_configs) { - if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { - return th_config; - } - } - - } else { - for (auto th_config : large_batch_thread_configs) { - if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { - return th_config; - } - } - } - - return thread_config_t{-1, -1, -1}; -} - -#define CALL_IF(N_BLOCKS, K_BLOCKS, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(2, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(2, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(3, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(3, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(4, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(4, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) - -void marlin_qqq_cuda(const void* A, const void* B, void* C, void* D, - void* s_tok, void* s_ch, void* 
s_group, int prob_m, - int prob_n, int prob_k, void* workspace, - int groupsize = -1, int dev = 0, cudaStream_t stream = 0, - int thread_k = -1, int thread_n = -1, int sms = -1, - int max_par = 16) { - int tot_m = prob_m; - int tot_m_blocks = ceildiv(tot_m, 16); - int pad = 16 * tot_m_blocks - tot_m; - - if (sms == -1) - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev); - - int max_shared_mem = 0; - cudaDeviceGetAttribute(&max_shared_mem, - cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); - TORCH_CHECK(max_shared_mem > 0); - - // Set thread config - thread_config_t th_config; - if (thread_k != -1 && thread_n != -1) { - // User-defined config - th_config = thread_config_t{thread_k, thread_n, USER_THREADS}; - } else { - // Auto config - th_config = determine_thread_config(prob_m, prob_n, prob_k); - } - - if (!is_valid_config(th_config, prob_m, prob_n, prob_k)) { - throw std::runtime_error( - "Invalid thread config: thread_k = " + str(th_config.thread_k) + - ", thread_n = " + str(th_config.thread_n) + - ", num_threads = " + str(th_config.num_threads) + " for MKN = [" + - str(prob_m) + ", " + str(prob_k) + ", " + str(prob_n) + "]"); - } - - int num_threads = th_config.num_threads; - thread_k = th_config.thread_k; - thread_n = th_config.thread_n; - - int thread_k_blocks = thread_k / 16; - int thread_n_blocks = thread_n / 16; - int group_blocks = (groupsize == -1) ? -1 : groupsize / 16; - int blocks = sms; - - if (prob_m == 0 || prob_n == 0 || prob_k == 0) { - return; - } - - TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, - " is not divisible by thread_n = ", thread_n); - TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, - " is not divisible by thread_k = ", thread_k); - if (group_blocks != -1) { - TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, - " is not divisible by group_blocks = ", group_blocks); - } - - const int4* A_ptr = (const int4*)A; - const int4* B_ptr = (const int4*)B; - int4* C_ptr = (int4*)C; - int4* D_ptr = (int4*)D; - const float* s_tok_ptr = (const float*)s_tok; - const int4* s_ch_ptr = (const int4*)s_ch; - const int4* s_group_ptr = (const int4*)s_group; - - int* locks = (int*)workspace; - - for (int i = 0; i < tot_m_blocks; i += 4) { - int thread_m_blocks = tot_m_blocks - i; - prob_m = tot_m - 16 * i; - int par = 1; - if (thread_m_blocks > 4) { - // Note that parallel > 1 currently only works for inputs without any - // padding - par = (16 * thread_m_blocks - pad) / 64; - if (par > max_par) par = max_par; - prob_m = 64 * par; - i += 4 * (par - 1); - thread_m_blocks = 4; - } - - // For compilation speed, we only define the kernel configurations that have - // seemed useful (in terms of performance) in our testing, however many more - // are, in principle, possible. 
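The `if (false) {}` head just below is what makes the macro chain work: every `__CALL_IF` expansion begins with `else if`, so exactly one compiled-in template instantiation is launched and any other shape falls through to the error branch. A stripped-down model of the pattern (hypothetical `kernel_stub` and `dispatch_example`, not the real launcher):

#include <stdexcept>

template <int THREADS, int N_BLOCKS, int K_BLOCKS>
void kernel_stub(int /*prob_m*/, int /*prob_n*/, int /*prob_k*/) {}

#define DISPATCH_IF(THREADS, N_BLOCKS, K_BLOCKS)                     \
  else if (num_threads == THREADS && thread_n_blocks == N_BLOCKS &&  \
           thread_k_blocks == K_BLOCKS) {                            \
    kernel_stub<THREADS, N_BLOCKS, K_BLOCKS>(prob_m, prob_n, prob_k); \
  }

void dispatch_example(int num_threads, int thread_n_blocks,
                      int thread_k_blocks, int prob_m, int prob_n, int prob_k) {
  if (false) {
  }
  DISPATCH_IF(256, 8, 8)
  DISPATCH_IF(128, 8, 4)
  else {
    throw std::runtime_error("no kernel compiled for this configuration");
  }
}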
- if (false) { - } - CALL_IF(8, 8, 256) - CALL_IF(16, 4, 256) - CALL_IF(8, 4, 128) - CALL_IF(4, 8, 128) - else { - throw std::runtime_error("Unsupported shapes: MKN = [" + str(prob_m) + - ", " + str(prob_k) + ", " + str(prob_n) + "]" + - ", groupsize = " + str(groupsize) + - ", thread_m_blocks = " + str(thread_m_blocks) + - ", thread_n_blocks = " + str(thread_n_blocks) + - ", thread_k_blocks = " + str(thread_k_blocks)); - } - - A_ptr += 16 * thread_m_blocks * (prob_k / 16) * par; - D_ptr += 16 * thread_m_blocks * (prob_n / 8) * par; - s_tok_ptr += 16 * thread_m_blocks * par; - } -} -} // anonymous namespace - -torch::Tensor marlin_qqq_gemm(torch::Tensor const& a, - torch::Tensor const& b_q_weight, - torch::Tensor const& s_tok, - torch::Tensor const& s_ch, - torch::Tensor const& s_group, - torch::Tensor& workspace, int64_t size_m, - int64_t size_n, int64_t size_k) { - // Verify M - TORCH_CHECK(size_m == a.size(0), - "Shape mismatch: a.size(0) = " + str(a.size(0)) + - ", size_m = " + str(size_m)); - TORCH_CHECK(size_m == s_tok.numel(), - "Shape mismatch: s_tok.numel() = " + str(s_tok.numel()) + - ", size_m = " + str(size_m)); - - // Verify K - TORCH_CHECK(size_k == a.size(1), - "Shape mismatch: a.size(1) = " + str(a.size(1)) + - ", size_k = " + str(size_k)); - TORCH_CHECK(size_k % tile_size == 0, - "size_k = " + str(size_k) + - " is not divisible by tile_size = " + str(tile_size)); - TORCH_CHECK( - (size_k / tile_size) == b_q_weight.size(0), - "Shape mismatch: b_q_weight.size(0) = " + str(b_q_weight.size(0)) + - ", size_k = " + str(size_k) + ", tile_size = " + str(tile_size)); - - int groupsize = (s_group.numel() == 0) ? -1 : size_k / s_group.size(0); - // Verify groupsize - TORCH_CHECK(groupsize == -1 || groupsize == 128, - "Unexpected groupsize = " + str(groupsize)); - - // Verify N - TORCH_CHECK(s_ch.numel() == size_n, - "Shape mismatch: s_ch.numel() = " + str(s_ch.numel()) + - ", size_n = " + str(size_n)); - TORCH_CHECK(b_q_weight.size(1) % tile_size == 0, - "b_q_weight.size(1) = " + str(b_q_weight.size(1)) + - " is not divisible by tile_size = " + str(tile_size)); - if (groupsize != -1) { - TORCH_CHECK(s_group.size(1) == size_n, - "Shape mismatch: s_group.size(1) = " + str(s_group.size(1)) + - ", size_n = " + str(size_n)); - TORCH_CHECK( - size_k % s_group.size(0) == 0, - "size_k = " + str(size_k) + - ", is not divisible by s_group.size(0) = " + str(s_group.size(0))); - } - - int actual_size_n = (b_q_weight.size(1) / tile_size) * pack_factor_4bit; - TORCH_CHECK(size_n == actual_size_n, - "Shape mismatch: size_n = " + str(size_n) + - ", actual_size_n = " + str(actual_size_n)); - - // Verify A device and strides - TORCH_CHECK(a.device().is_cuda(), "A is not on GPU"); - TORCH_CHECK(a.is_contiguous(), "A is not contiguous"); - - // Verify B device and strides - TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU"); - TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous"); - - // Verify s_tok device, strides and dtype - TORCH_CHECK(s_tok.device().is_cuda(), "s_tok is not on GPU"); - TORCH_CHECK(s_tok.is_contiguous(), "s_tok is not contiguous"); - TORCH_CHECK(s_tok.dtype() == torch::kFloat32, "s_tok's dtype is not float32"); - - // Verify s_ch device, strides and dtype - TORCH_CHECK(s_ch.device().is_cuda(), "s_ch is not on GPU"); - TORCH_CHECK(s_ch.is_contiguous(), "s_ch is not contiguous"); - TORCH_CHECK(s_ch.dtype() == torch::kFloat32, "s_ch's dtype is not float32"); - - // Verify s_group device, strides and dtype - TORCH_CHECK(s_group.device().is_cuda(), 
"s_group is not on GPU"); - TORCH_CHECK(s_group.is_contiguous(), "s_group is not contiguous"); - TORCH_CHECK(s_group.dtype() == torch::kFloat16, - "s_group's dtype is not float16"); - - // Verify workspace size - TORCH_CHECK(size_n % min_thread_n == 0, - "size_n = " + str(size_n) + - ", is not divisible by min_thread_n = " + str(min_thread_n)); - int min_workspace_size = (size_n / min_thread_n) * max_par; - TORCH_CHECK(workspace.numel() >= min_workspace_size, - "workspace.numel = " + str(workspace.numel()) + - " is below min_workspace_size = " + str(min_workspace_size)); - - // Alloc C matrix - const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); - auto options_c = torch::TensorOptions().dtype(torch::kInt).device(a.device()); - torch::Tensor c = torch::empty({max_par * 64, size_n}, options_c); - - // Alloc D matrix - auto options_d = - torch::TensorOptions().dtype(torch::kFloat16).device(a.device()); - torch::Tensor d = torch::empty({size_m, size_n}, options_d); - - // thread_k: `k` size of a thread_tile in `weights` (can usually be left as - // auto -1) - int thread_k = -1; - // thread_n: `n` size of a thread_tile in `weights` (can usually be left as - // auto -1) - int thread_n = -1; - // sms: number of SMs to use for the kernel (can usually be left as auto -1) - int sms = -1; - - int dev = a.get_device(); - marlin_qqq_cuda( - a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), d.data_ptr(), - s_tok.data_ptr(), s_ch.data_ptr(), s_group.data_ptr(), size_m, size_n, - size_k, workspace.data_ptr(), groupsize, dev, - at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, max_par); - - return d; -} - -TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { - m.impl("marlin_qqq_gemm", &marlin_qqq_gemm); -} diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 3a0ff6eaa790..60710f62c064 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -241,14 +241,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // custom types: // https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA - // Marlin (Dense) Optimized Quantized GEMM for GPTQ. - ops.def( - "marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, " - "Tensor! workspace, SymInt size_m, SymInt size_n, SymInt size_k) -> " - "Tensor", - {stride_tag}); - // conditionally compiled so impl in source file - // Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ. ops.def( "gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, " @@ -353,15 +345,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("ggml_moe_get_block_size", &ggml_moe_get_block_size); #ifndef USE_ROCM - // marlin_qqq_gemm for QQQ. - ops.def( - "marlin_qqq_gemm(Tensor a, Tensor b_q_weight, " - "Tensor s_tok, Tensor s_ch, Tensor s_group, " - "Tensor! workspace, SymInt size_m, SymInt size_n, " - "SymInt size_k) -> Tensor", - {stride_tag}); - // conditionally compiled so impl registration is in source file - // CUTLASS nvfp4 block scaled GEMM ops.def( "cutlass_scaled_fp4_mm(Tensor! 
out, Tensor a, Tensor b," diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index a2fc6ffeb8b2..84178344a5f3 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -53,12 +53,6 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None): "quantization": "gptq_marlin_24" })) - if is_quant_method_supported("marlin"): - TEST_MODELS.append( - ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", { - "quantization": "marlin" - })) - if not current_platform.is_rocm() and is_quant_method_supported("awq"): TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { "quantization": "AWQ" diff --git a/tests/kernels/quantization/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py index a842d2f1cbe8..0e09661c955e 100644 --- a/tests/kernels/quantization/test_machete_mm.py +++ b/tests/kernels/quantization/test_machete_mm.py @@ -95,23 +95,23 @@ class Tensors: token_scale_type=None) for w_type in [scalar_types.uint4, scalar_types.uint8] for a_type in [torch.float16, torch.bfloat16]), - # QQQ style - *(TypeConfig(act_type=torch.int8, - weight_type=scalar_types.uint4b8, - output_type=torch.float16, - group_scale_type=group_scale_type, - group_zero_type=None, - channel_scale_type=torch.float, - token_scale_type=torch.float) - for group_scale_type in [None, torch.float16]), - *(TypeConfig(act_type=torch.float8_e4m3fn, - weight_type=scalar_types.uint4b8, - output_type=torch.float16, - group_scale_type=group_scale_type, - group_zero_type=None, - channel_scale_type=torch.float, - token_scale_type=torch.float) - for group_scale_type in [None, torch.float16]), + # # QQQ style + # *(TypeConfig(act_type=torch.int8, + # weight_type=scalar_types.uint4b8, + # output_type=torch.float16, + # group_scale_type=group_scale_type, + # group_zero_type=None, + # channel_scale_type=torch.float, + # token_scale_type=torch.float) + # for group_scale_type in [None, torch.float16]), + # *(TypeConfig(act_type=torch.float8_e4m3fn, + # weight_type=scalar_types.uint4b8, + # output_type=torch.float16, + # group_scale_type=group_scale_type, + # group_zero_type=None, + # channel_scale_type=torch.float, + # token_scale_type=torch.float) + # for group_scale_type in [None, torch.float16]), ] # TODO: in future PR refactor this and `is_quant_method_supported` in the kernel diff --git a/tests/kernels/quantization/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py index cea7700ac329..ad077e0b9473 100644 --- a/tests/kernels/quantization/test_marlin_gemm.py +++ b/tests/kernels/quantization/test_marlin_gemm.py @@ -13,11 +13,7 @@ from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES) -from vllm.model_executor.layers.quantization.qqq import ( - MARLIN_QQQ_MAX_PARALLEL, MARLIN_QQQ_MIN_THREAD_N, - MARLIN_QQQ_SUPPORTED_GROUP_SIZES, MARLIN_QQQ_SUPPORTED_NUM_BITS) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( - GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, MARLIN_SUPPORTED_GROUP_SIZES, marlin_make_empty_g_idx, marlin_make_workspace_new, marlin_permute_bias, marlin_permute_scales, query_marlin_supported_quant_types) @@ -31,8 +27,6 @@ marlin_weights) from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import ( marlin_24_quantize) -from vllm.model_executor.layers.quantization.utils.marlin_utils_test_qqq import ( # noqa: E501 - 
marlin_qqq_quantize) from vllm.model_executor.layers.quantization.utils.quant_utils import ( awq_pack, gptq_pack, gptq_quantize_weights, quantize_weights, sort_weights) from vllm.scalar_type import scalar_types @@ -449,68 +443,6 @@ def test_hqq_marlin_gemm( assert max_diff < 0.04 -@pytest.mark.skipif(not is_quant_method_supported("qqq"), - reason="Marlin is not supported on this GPU type.") -@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS) -@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS) -@pytest.mark.parametrize("num_bits", MARLIN_QQQ_SUPPORTED_NUM_BITS) -@pytest.mark.parametrize("group_size", MARLIN_QQQ_SUPPORTED_GROUP_SIZES) -@pytest.mark.parametrize("mnk_factors", MNK_FACTORS) -def test_marlin_qqq_gemm( - k_chunk, - n_chunk, - num_bits, - group_size, - mnk_factors, -): - int8_traits = torch.iinfo(torch.int8) - m_factor, n_factor, k_factor = mnk_factors - - size_m = m_factor - size_k = k_chunk * k_factor - size_n = n_chunk * n_factor - - a_input = rand_data((size_m, size_k)) - b_weight = rand_data((size_k, size_n)) - - # Quantize activations - s_a = a_input.abs().max(dim=-1, keepdim=True)[0].div(int8_traits.max).to( - torch.float) - q_a = (a_input / s_a).round().clamp(int8_traits.min, - int8_traits.max).to(torch.int8) - - # Quantize weights - w_ref, marlin_qqq_q_w, marlin_qqq_s_group, marlin_qqq_s_channel = \ - marlin_qqq_quantize(b_weight, num_bits, group_size) - - workspace = MarlinWorkspace(size_n, MARLIN_QQQ_MIN_THREAD_N, - MARLIN_QQQ_MAX_PARALLEL) - - opcheck(torch.ops._C.marlin_qqq_gemm, - (q_a, marlin_qqq_q_w, s_a, marlin_qqq_s_channel, - marlin_qqq_s_group, workspace.scratch, a_input.shape[0], - b_weight.shape[1], a_input.shape[1])) - - output = ops.marlin_qqq_gemm( - q_a, - marlin_qqq_q_w, - s_a, - marlin_qqq_s_channel, - marlin_qqq_s_group, - workspace.scratch, - a_input.shape[0], - b_weight.shape[1], - a_input.shape[1], - ) - output_ref = torch.matmul(q_a.half() * s_a.half(), w_ref) - - torch.cuda.synchronize() - - max_diff = compute_max_diff(output, output_ref) - - assert max_diff < 0.04 - - def test_marlin_gemm_subset_input(): quant_type = scalar_types.uint4b8 group_size = 128 @@ -602,18 +534,3 @@ def test_marlin_gemm_with_bias(size_m): max_diff = compute_max_diff(output, output_ref) assert max_diff < 0.04 - - -def test_marlin_gemm_opcheck(): - size_m = 2048 - size_n = 4096 - size_k = 4096 - a = torch.rand((size_m, size_n), device='cuda', dtype=torch.float16) - w = torch.randint(-5, 5, (256, 8192), device='cuda', dtype=torch.int32) - s = torch.full((32, size_k), 0.125, device='cuda', dtype=torch.float16) - wk = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N, - GPTQ_MARLIN_MAX_PARALLEL).scratch - x = torch.ops._C.marlin_gemm(a, w, s, wk, size_m, size_n, size_k) - y = torch.ops._C.marlin_gemm(a, w, s, wk, size_m, size_n, size_k) - torch.testing.assert_close(x, y) - opcheck(torch.ops._C.marlin_gemm, (a, w, s, wk, size_m, size_n, size_k)) diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index 8cf8402436ff..1843bffd2115 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -22,22 +22,12 @@ class ModelPair: MODEL_ARG_EXPTYPES = [ # AUTOGPTQ # compat: autogptq <=0.7.1 is_marlin_format: bool - # Model Serialized in Marlin Format should always use Marlin kernel. 
- ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", None, "marlin"), - ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "marlin", "marlin"), - ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "gptq", "marlin"), - ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "awq", "ERROR"), # Model Serialized in Exllama Format. ("TheBloke/Llama-2-7B-Chat-GPTQ", None, "gptq_marlin"), ("TheBloke/Llama-2-7B-Chat-GPTQ", "marlin", "gptq_marlin"), ("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq", "gptq"), ("TheBloke/Llama-2-7B-Chat-GPTQ", "awq", "ERROR"), # compat: autogptq >=0.8.0 use checkpoint_format: str - # Model Serialized in Marlin Format should always use Marlin kernel. - ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", None, "marlin"), - ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "marlin", "marlin"), - ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "gptq", "marlin"), - ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "awq", "ERROR"), # Model Serialized in Exllama Format. ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", None, "gptq_marlin"), ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "marlin", "gptq_marlin"), diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index 11f78a23bb4c..5ec8b27c1571 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -11,7 +11,6 @@ from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod from vllm.model_executor.layers.quantization.gptq_marlin import ( GPTQMarlinLinearMethod) -from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod from vllm.model_executor.layers.vocab_parallel_embedding import ( UnquantizedEmbeddingMethod) @@ -19,9 +18,7 @@ MODELS_QUANT = [ ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head", True), - ("ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024", False), ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False), - ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False) ] @@ -41,8 +38,7 @@ def check_model(model): lm_head_layer = model.lm_head if lm_head_quantized: assert isinstance(lm_head_layer.quant_method, - (GPTQLinearMethod, GPTQMarlinLinearMethod, - MarlinLinearMethod)) + (GPTQLinearMethod, GPTQMarlinLinearMethod)) else: assert isinstance(lm_head_layer.quant_method, UnquantizedEmbeddingMethod) diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt index 1b797074096e..cc18c9ff1f09 100644 --- a/tests/weight_loading/models.txt +++ b/tests/weight_loading/models.txt @@ -26,9 +26,5 @@ compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-W8A8-testing awq, casperhansen/mixtral-instruct-awq, main awq_marlin, casperhansen/mixtral-instruct-awq, main fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main -marlin, nm-testing/zephyr-beta-7b-marlin-g128, main -marlin, robertgshaw2/zephyr-7b-beta-channelwise-marlin, main -qqq, HandH1998/QQQ-Llama-3-8b-g128, main -qqq, HandH1998/QQQ-Llama-3-8b, main hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main None, mgleize/fairseq2-dummy-Llama-3.2-1B, main \ No newline at end of file diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 39da08847b2e..59f2d7737f19 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -387,14 +387,6 @@ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor, torch.ops._C.gptq_shuffle(q_weight, q_perm, bit) -# marlin -def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, - b_scales: torch.Tensor, workspace: torch.Tensor, size_m: int, - size_n: int, size_k: int) -> torch.Tensor: - return 
torch.ops._C.marlin_gemm(a, b_q_weight, b_scales, workspace, size_m, - size_n, size_k) - - # marlin_24 def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, b_meta: torch.Tensor, b_scales: torch.Tensor, @@ -437,25 +429,6 @@ def _gptq_marlin_gemm_fake(a: torch.Tensor, is_zp_float: bool = False) -> torch.Tensor: return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype) - @register_fake("_C::marlin_qqq_gemm") - def _marlin_qqq_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor, - s_tok: torch.Tensor, s_ch: torch.Tensor, - s_group: torch.Tensor, workspace: torch.Tensor, - size_m: torch.SymInt, size_n: torch.SymInt, - size_k: torch.SymInt) -> torch.Tensor: - return torch.empty((size_m, size_n), - dtype=torch.float16, - device=a.device) - - @register_fake("_C::marlin_gemm") - def _marlin_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor, - b_scales: torch.Tensor, workspace: torch.Tensor, - size_m: torch.SymInt, size_n: torch.SymInt, - size_k: torch.SymInt) -> torch.Tensor: - return torch.empty((size_m, size_n), - dtype=torch.float16, - device=a.device) - @register_fake("_C::awq_dequantize") def _awq_dequantize_fake(qweight: torch.Tensor, scales: torch.Tensor, zeros: torch.Tensor, split_k_iters: torch.SymInt, @@ -1348,15 +1321,6 @@ def scaled_int8_quant( return output, input_scales, input_azp -# qqq ops -def marlin_qqq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, - s_tok: torch.Tensor, s_ch: torch.Tensor, - s_group: torch.Tensor, workspace: torch.Tensor, - size_m: int, size_n: int, size_k: int) -> torch.Tensor: - return torch.ops._C.marlin_qqq_gemm(a, b_q_weight, s_tok, s_ch, s_group, - workspace, size_m, size_n, size_k) - - # gguf def ggml_dequantize(W: torch.Tensor, quant_type: int, m: int, n: int, dtype: Optional[torch.dtype]) -> torch.Tensor: diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 5b5d477ef066..62dfd4333bee 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1112,9 +1112,9 @@ def _parse_quant_hf_config(self): def _verify_quantization(self) -> None: supported_quantization = me_quant.QUANTIZATION_METHODS optimized_quantization_methods = [ - "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin", - "awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8", - "quark", "modelopt_fp4", "bitblas", "gptq_bitblas", "inc" + "fp8", "modelopt", "gptq_marlin_24", "gptq_marlin", "awq_marlin", + "fbgemm_fp8", "compressed-tensors", "experts_int8", "quark", + "modelopt_fp4", "bitblas", "gptq_bitblas", "inc" ] if self.quantization is not None: self.quantization = cast(me_quant.QuantizationMethods, @@ -1137,7 +1137,6 @@ def _verify_quantization(self) -> None: # `override_quantization_method` method) must be checked in order # of preference (this is particularly important for GPTQ). 
overrides = [ - "marlin", "bitblas", "gptq_marlin_24", "gptq_marlin", diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index de5933d6d41e..24a05d310d10 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -48,9 +48,6 @@ def _get_lora_device(base_layer: nn.Module) -> torch.device: # GPTQ/AWQ elif hasattr(base_layer, "qweight"): return base_layer.qweight.device - # marlin - elif hasattr(base_layer, "B"): - return base_layer.B.device # HQQ marlin elif hasattr(base_layer, "W_q"): return base_layer.W_q.device diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index d3b6b2089f42..654e2ec7b2fa 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -42,7 +42,6 @@ "GPTQMarlinLinearMethod", "Fp8LinearMethod", "MarlinLinearMethod", - "QQQLinearMethod", "GPTQMarlin24LinearMethod", "TPUInt8LinearMethod", "GPTQLinearMethod", diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index a4c2671225f5..ea51468422dc 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -15,7 +15,6 @@ "fbgemm_fp8", "modelopt", "modelopt_fp4", - "marlin", "bitblas", "gguf", "gptq_marlin_24", @@ -25,7 +24,6 @@ "gptq", "compressed-tensors", "bitsandbytes", - "qqq", "hqq", "experts_int8", "neuron_quant", @@ -106,13 +104,11 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: from .hqq_marlin import HQQMarlinConfig from .inc import INCConfig from .ipex_quant import IPEXConfig - from .marlin import MarlinConfig from .modelopt import ModelOptFp8Config, ModelOptNvFp4Config from .moe_wna16 import MoeWNA16Config from .mxfp4 import Mxfp4Config from .neuron_quant import NeuronQuantConfig from .ptpc_fp8 import PTPCFp8Config - from .qqq import QQQConfig from .rtn import RTNConfig from .torchao import TorchAOConfig from .tpu_int8 import Int8TpuConfig @@ -125,7 +121,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: "fbgemm_fp8": FBGEMMFp8Config, "modelopt": ModelOptFp8Config, "modelopt_fp4": ModelOptNvFp4Config, - "marlin": MarlinConfig, "bitblas": BitBLASConfig, "gguf": GGUFConfig, "gptq_marlin_24": GPTQMarlin24Config, @@ -136,7 +131,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: "compressed-tensors": CompressedTensorsConfig, "bitsandbytes": BitsAndBytesConfig, "ptpc_fp8": PTPCFp8Config, - "qqq": QQQConfig, "hqq": HQQMarlinConfig, "experts_int8": ExpertsInt8Config, "neuron_quant": NeuronQuantConfig, diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py deleted file mode 100644 index 18d1c13373df..000000000000 --- a/vllm/model_executor/layers/quantization/marlin.py +++ /dev/null @@ -1,263 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Any, Optional - -import torch -from torch.nn.parameter import Parameter - -from vllm import _custom_ops as ops -from vllm.logger import init_logger -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase -from vllm.model_executor.layers.quantization import QuantizationMethods -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) -from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead -from vllm.model_executor.parameter import (BasevLLMParameter, - 
ChannelQuantScaleParameter, - GroupQuantScaleParameter, - PackedvLLMParameter) - -logger = init_logger(__name__) - - -class MarlinConfig(QuantizationConfig): - """Config class for Marlin. - - Reference: https://github.com/IST-DASLab/marlin/tree/master - """ - - def __init__( - self, - group_size: int, - lm_head_quantized: bool, - ) -> None: - super().__init__() - - # Group size for the quantization. - self.group_size = group_size - self.lm_head_quantized = lm_head_quantized - if self.group_size != 128 and self.group_size != -1: - raise ValueError( - "Currently, only group size 128 and -1 (channelwise) " - "is supported for Marlin, but got group_size of " - f"{self.group_size}") - - # 4 Bits packed into 32 bit datatype. - self.pack_factor = 32 // 4 - - # Tile size used by marlin kernels. - self.tile_size = 16 - - # Min out_features dim - self.min_n_threads = 64 - - # Min in_features dim - self.min_k_threads = 128 - - # Max parallel problems to solve at once (improves large - # batch performance) - self.max_parallel = 16 - - # Permutation length used by the marlin kernels. - self.perm_len = 1024 - - def __repr__(self) -> str: - return (f"MarlinConfig(group_size={self.group_size}, " - f"lm_head_quantized={self.lm_head_quantized})") - - @classmethod - def get_name(cls) -> QuantizationMethods: - return "marlin" - - @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: - return [torch.half] - - @classmethod - # Need to figure it out - def get_min_capability(cls) -> int: - return 80 - - @classmethod - def get_config_filenames(cls) -> list[str]: - return ["quantize_config.json"] - - @classmethod - def from_config(cls, config: dict[str, Any]) -> "MarlinConfig": - group_size = cls.get_from_keys(config, ["group_size"]) - lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], - default=False) - return cls(group_size, lm_head_quantized) - - @classmethod - def override_quantization_method( - cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]: - # compat: autogptq >=0.8.0 use checkpoint_format: str - # compat: autogptq <=0.7.1 is_marlin_format: bool - is_marlin_format = (hf_quant_cfg.get("checkpoint_format") == "marlin" - or hf_quant_cfg.get("is_marlin_format", False)) - - is_valid_user_quant = (user_quant is None or user_quant == "gptq" - or user_quant == "marlin") - - if is_marlin_format and is_valid_user_quant: - msg = ("The model is serialized in {} format. Using {} kernel.". - format(cls.get_name(), cls.get_name())) - logger.info(msg) - return cls.get_name() - - return None - - def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["MarlinLinearMethod"]: - if (isinstance(layer, LinearBase) or - (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)): - return MarlinLinearMethod(self) - return None - - -class MarlinLinearMethod(LinearMethodBase): - """Linear method for Marlin. - - Args: - quant_config: The Marlin quantization config. - """ - - def __init__(self, quant_config: MarlinConfig): - self.quant_config = quant_config - - def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: list[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - del output_size # Unused. 
- weight_loader = extra_weight_attrs["weight_loader"] - - if params_dtype != torch.float16: - raise ValueError( - f"The params dtype must be float16, but got {params_dtype}") - - # Validate output_size_per_partition - output_size_per_partition = sum(output_partition_sizes) - if output_size_per_partition % self.quant_config.min_n_threads != 0: - raise ValueError( - f"Weight output_size_per_partition = " - f"{output_size_per_partition} is not divisible by " - f"min_n_threads = {self.quant_config.min_n_threads}.") - if output_size_per_partition % self.quant_config.pack_factor != 0: - raise ValueError( - f"Weight output_size_per_partition = " - f"{output_size_per_partition} is not divisible by " - f"pack_factor = {self.quant_config.pack_factor}.") - - # Validate input_size_per_partition - if input_size_per_partition % self.quant_config.min_k_threads != 0: - raise ValueError( - f"Weight input_size_per_partition = " - f"{input_size_per_partition} is not divisible by " - f"min_k_threads = {self.quant_config.min_k_threads}.") - if (self.quant_config.group_size != -1 and - input_size_per_partition % self.quant_config.group_size != 0): - raise ValueError(f"Weight input_size_per_partition = " - f"{input_size_per_partition} is not divisible by " - f"group_size = {self.quant_config.group_size}.") - - # Check that we have at least 4 tiles horizontally in the shard - num_tiles_per_perm = self.quant_config.perm_len // ( - self.quant_config.tile_size**2) - if output_size_per_partition % num_tiles_per_perm != 0: - raise ValueError( - "Each permutation group must reside on the same gpu") - - # Quantized 4Bit weights packed into Int32. - qweight = PackedvLLMParameter( - data=torch.empty( - input_size_per_partition // self.quant_config.tile_size, - output_size_per_partition * self.quant_config.tile_size // - self.quant_config.pack_factor, - device="cuda", - dtype=torch.int32, - ), - input_dim=0, - output_dim=1, - packed_dim=1, - packed_factor=self.quant_config.pack_factor, - marlin_tile_size=self.quant_config.tile_size, - weight_loader=weight_loader) - - # Determine if channelwise or not - input_groups = (1 if self.quant_config.group_size == -1 else - input_size_per_partition // - self.quant_config.group_size) - - weight_scale_args = { - "data": - torch.empty( - input_groups, - output_size_per_partition, - device="cuda", - dtype=params_dtype, - ), - "weight_loader": - weight_loader - } - if input_groups == 1: - scales = ChannelQuantScaleParameter(output_dim=1, - **weight_scale_args) - else: - scales = GroupQuantScaleParameter(output_dim=1, - input_dim=0, - **weight_scale_args) - - # Allocate workspace (Used for internal locking mechanism) - max_workspace_size = ( - output_size_per_partition // - self.quant_config.min_n_threads) * self.quant_config.max_parallel - - workspace = BasevLLMParameter(data=torch.zeros(max_workspace_size, - device="cuda", - dtype=torch.int), - weight_loader=weight_loader) - - layer.register_parameter("B", qweight) - layer.register_parameter("s", scales) - layer.register_parameter("workspace", workspace) - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - # required by torch.compile - layer.B = Parameter(layer.B.data, requires_grad=False) - layer.s = Parameter(layer.s.data, requires_grad=False) - layer.workspace = Parameter(layer.workspace.data, requires_grad=False) - - def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - qweight = layer.B - scales = layer.s - workspace = layer.workspace 
- - x_2d = x.view(-1, x.shape[-1]) - - size_m = x_2d.shape[0] - size_k = x_2d.shape[1] - size_n = scales.shape[1] - - output_2d = ops.marlin_gemm(x_2d, qweight, scales, workspace, size_m, - size_n, size_k) - - output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], )) - - if bias is not None: - output.add_(bias) # In-place add - - return output diff --git a/vllm/model_executor/layers/quantization/qqq.py b/vllm/model_executor/layers/quantization/qqq.py deleted file mode 100644 index 25978cb13b3a..000000000000 --- a/vllm/model_executor/layers/quantization/qqq.py +++ /dev/null @@ -1,275 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Any, Optional - -import torch -from torch.nn.parameter import Parameter - -from vllm import _custom_ops as ops -from vllm.logger import init_logger -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase -from vllm.model_executor.layers.quantization import QuantizationMethods -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) -from vllm.model_executor.parameter import (BasevLLMParameter, - ChannelQuantScaleParameter, - GroupQuantScaleParameter, - PackedvLLMParameter) - -logger = init_logger(__name__) - -MARLIN_QQQ_TILE = 16 -MARLIN_QQQ_MIN_THREAD_N = 64 -MARLIN_QQQ_MIN_THREAD_K = 128 -MARLIN_QQQ_MAX_PARALLEL = 16 - -MARLIN_QQQ_SUPPORTED_NUM_BITS = [4] -MARLIN_QQQ_SUPPORTED_GROUP_SIZES = [-1, 128] -MARLIN_QQQ_SUPPORTED_SYM = [True] - - -class QQQConfig(QuantizationConfig): - """Config class for QQQ - - Reference: https://arxiv.org/pdf/2406.09904 - """ - - def __init__( - self, - weight_bits: int, - group_size: int, - is_sym: bool = True, - ) -> None: - super().__init__() - self.weight_bits = weight_bits - self.group_size = group_size - self.is_sym = is_sym - - # Verify - if self.weight_bits not in MARLIN_QQQ_SUPPORTED_NUM_BITS: - raise ValueError( - f"QQQ does not support weight_bits = {self.weight_bits}. " - f"Only weight_bits = {MARLIN_QQQ_SUPPORTED_NUM_BITS} " - "are supported.") - if self.group_size not in MARLIN_QQQ_SUPPORTED_GROUP_SIZES: - raise ValueError( - f"QQQ does not support group_size = {self.group_size}. " - f"Only group_sizes = {MARLIN_QQQ_SUPPORTED_GROUP_SIZES} " - "are supported.") - if self.is_sym not in MARLIN_QQQ_SUPPORTED_SYM: - raise ValueError( - f"QQQ does not support is_sym = {self.is_sym}. " - f"Only sym = {MARLIN_QQQ_SUPPORTED_SYM} are supported.") - - # 4 Bits packed into 32 bit datatype. - self.pack_factor = 32 // self.weight_bits - - # Tile size used by QQQ kernels. - self.tile_size = MARLIN_QQQ_TILE - - # Min out_features dim - self.min_n_threads = MARLIN_QQQ_MIN_THREAD_N - - # Min in_features dim - self.min_k_threads = MARLIN_QQQ_MIN_THREAD_K - - # Max parallel problems to solve at once (improves large - # batch performance) - self.max_parallel = MARLIN_QQQ_MAX_PARALLEL - - # Permutation length used by the QQQ kernels. 
- self.perm_len = 1024 - - def __repr__(self) -> str: - return "QQQConfig(weight_bits={}, group_size={})".format( - self.weight_bits, self.group_size) - - @classmethod - def get_name(cls) -> QuantizationMethods: - return "qqq" - - @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: - return [torch.half] - - @classmethod - def get_min_capability(cls) -> int: - return 80 - - @classmethod - def get_config_filenames(cls) -> list[str]: - """List of filenames to search for in the model directory.""" - return [ - "quant_config.json", - "quantize_config.json", - ] - - @classmethod - def from_config(cls, config: dict[str, Any]) -> "QQQConfig": - weight_bits = cls.get_from_keys(config, ["wbits"]) - group_size = cls.get_from_keys(config, ["group_size"]) - return cls(weight_bits, group_size) - - def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["QQQLinearMethod"]: - if isinstance(layer, LinearBase): - return QQQLinearMethod(self) - return None - - -class QQQLinearMethod(LinearMethodBase): - """Linear method for QQQ. - - Args: - quant_config: The QQQ quantization config. - """ - - def __init__(self, quant_config: QQQConfig): - self.quant_config = quant_config - - def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: list[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - weight_loader = extra_weight_attrs["weight_loader"] - if params_dtype != torch.float16: - raise ValueError( - f"The params dtype must be float16, but got {params_dtype}") - - # Validate output_size_per_partition - output_size_per_partition = sum(output_partition_sizes) - if output_size_per_partition % self.quant_config.min_n_threads != 0: - raise ValueError( - f"Weight output_size_per_partition = " - f"{output_size_per_partition} is not divisible by " - f"min_n_threads = {self.quant_config.min_n_threads}.") - if output_size_per_partition % self.quant_config.pack_factor != 0: - raise ValueError( - f"Weight output_size_per_partition = " - f"{output_size_per_partition} is not divisible by " - f"pack_factor = {self.quant_config.pack_factor}.") - - # Validate input_size_per_partition - if input_size_per_partition % self.quant_config.min_k_threads != 0: - raise ValueError( - f"Weight input_size_per_partition = " - f"{input_size_per_partition} is not divisible by " - f"min_k_threads = {self.quant_config.min_k_threads}.") - if (self.quant_config.group_size != -1 and - input_size_per_partition % self.quant_config.group_size != 0): - raise ValueError(f"Weight input_size_per_partition = " - f"{input_size_per_partition} is not divisible by " - f"group_size = {self.quant_config.group_size}.") - - # Check that we have at least 4 tiles horizontally in the shard - num_tiles_per_perm = self.quant_config.perm_len // ( - self.quant_config.tile_size**2) - if output_size_per_partition % num_tiles_per_perm != 0: - raise ValueError( - "Each permutation group must reside on the same gpu") - - # Quantized 4Bit weights packed into Int32. 
- qweight = PackedvLLMParameter( - data=torch.empty( - input_size_per_partition // self.quant_config.tile_size, - output_size_per_partition * self.quant_config.tile_size // - self.quant_config.pack_factor, - device="cuda", - dtype=torch.int32, - ), - input_dim=0, - output_dim=1, - packed_dim=1, - packed_factor=self.quant_config.pack_factor, - marlin_tile_size=self.quant_config.tile_size, - weight_loader=weight_loader) - - s_channel = ChannelQuantScaleParameter(data=torch.empty( - 1, - output_size_per_partition, - device="cuda", - dtype=torch.float, - ), - weight_loader=weight_loader, - output_dim=1) - - if self.quant_config.group_size == -1: - s_group_data = torch.tensor( - [], - device="cuda", - dtype=torch.half, - ) - else: - s_group_data = torch.empty( - input_size_per_partition // self.quant_config.group_size, - output_size_per_partition, - device="cuda", - dtype=torch.half, - ) - - s_group_attr = {"data": s_group_data, "weight_loader": weight_loader} - - if self.quant_config.group_size == -1: - s_group = BasevLLMParameter(**s_group_attr) - else: - s_group = GroupQuantScaleParameter(output_dim=1, - input_dim=0, - **s_group_attr) - - # Allocate workspace (Used for internal locking mechanism) - max_workspace_size = ( - output_size_per_partition // - self.quant_config.min_n_threads) * self.quant_config.max_parallel - - workspace = BasevLLMParameter(data=torch.zeros(max_workspace_size, - device="cuda", - dtype=torch.int), - weight_loader=weight_loader) - - layer.register_parameter("B", qweight) - layer.register_parameter("s_channel", s_channel) - layer.register_parameter("s_group", s_group) - layer.register_parameter("workspace", workspace) - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - # required by torch.compile - layer.B = Parameter(layer.B.data, requires_grad=False) - layer.s_channel = Parameter(layer.s_channel.data, requires_grad=False) - layer.s_group = Parameter(layer.s_group.data, requires_grad=False) - layer.workspace = Parameter(layer.workspace.data, requires_grad=False) - - def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - qweight = layer.B - s_ch = layer.s_channel - s_group = layer.s_group - workspace = layer.workspace - - x_2d = x.view(-1, x.shape[-1]) - - size_m = x_2d.shape[0] - size_k = x_2d.shape[1] - size_n = s_ch.shape[1] - - x_int8, s_tok, _ = ops.scaled_int8_quant(x_2d) - - output_2d = ops.marlin_qqq_gemm(x_int8, qweight, s_tok, s_ch, s_group, - workspace, size_m, size_n, size_k) - - output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], )) - - if bias is not None: - output.add_(bias) # In-place add - - return output diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py deleted file mode 100644 index 8a64bebae04c..000000000000 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +++ /dev/null @@ -1,126 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import numpy -import torch - -from .marlin_utils_test import marlin_permute_weights -from .quant_utils import get_pack_factor, qqq_quantize_weights - - -def marlin_qqq_weights(q_w, size_k, size_n, num_bits, perm, group_size): - # Permute - q_w = marlin_permute_weights(q_w, size_k, size_n, perm) - - # Pack - pack_factor = get_pack_factor(num_bits) - orig_device = q_w.device - - q_w = 
q_w.cpu().numpy().astype(numpy.uint32) - - q_packed = numpy.zeros((q_w.shape[0], q_w.shape[1] // pack_factor), - dtype=numpy.uint32) - if group_size == size_k: - for i in range(pack_factor): - q_packed |= (q_w[:, i::pack_factor] & 0xF) << num_bits * i - else: - for i in range(pack_factor): - q_packed |= q_w[:, i::pack_factor] << num_bits * i - - q_packed = torch.from_numpy(q_packed.astype(numpy.int32)).to(orig_device) - - return q_packed - - -def get_qqq_scale_perms(): - scale_perm: list[int] = [] - for i in range(8): - scale_perm.extend([i + 8 * j for j in range(8)]) - scale_perm_single: list[int] = [] - for i in range(4): - scale_perm_single.extend( - [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) - return scale_perm, scale_perm_single - - -# NOTE(HandH1998): QQQ employs different perms for per-group and per-channel weight quantization. # noqa: E501 -def get_qqq_weight_perm(num_bits: int, quant_type: str): - perm_list: list[int] = [] - for i in range(32): - perm1: list[int] = [] - col = i // 4 - for block in [0, 1]: - for row in [ - 4 * (i % 4), - 4 * (i % 4) + 1, - 4 * (i % 4) + 2, - 4 * (i % 4) + 3, - ]: - perm1.append(16 * row + col + 8 * block) - for j in range(4): - perm_list.extend([p + 256 * j for p in perm1]) - - perm = numpy.array(perm_list) - - assert quant_type in ["per-channel", - "per-group"], "not supported quantization type" - if num_bits == 4: - if quant_type == "per-channel": - interleave = numpy.array([4, 0, 5, 1, 6, 2, 7, 3]) - else: - interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7]) - else: - raise Exception("num_bits must be 4, got {}".format(num_bits)) - - perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel() - perm = torch.from_numpy(perm) - return perm - - -def marlin_qqq_permute_scales(s_group, s_channel, size_k, size_n, group_size): - scale_perm, scale_perm_single = get_qqq_scale_perms() - if group_size < size_k and group_size != -1: - s_group = s_group.reshape((-1, len(scale_perm)))[:, scale_perm] - s_channel = s_channel.reshape( - (-1, len(scale_perm_single)))[:, scale_perm_single] - s_group = s_group.reshape((-1, size_n)).contiguous() - else: - s_channel = s_channel.reshape( - (-1, len(scale_perm_single)))[:, scale_perm_single] - s_channel = s_channel.reshape((-1, size_n)).contiguous() - - return s_group, s_channel - - -def marlin_qqq_quantize( - w: torch.Tensor, - num_bits: int, - group_size: int, -): - size_k, size_n = w.shape - - # Normalize group_size - if group_size == -1: - group_size = size_k - assert group_size <= size_k - quant_type = "per-channel" if group_size == size_k else "per-group" - - # Quantize - w_ref, q_w, s_group, s_channel = qqq_quantize_weights( - w, num_bits, group_size) - - # Reformat to marlin_qqq - weight_perm = get_qqq_weight_perm(num_bits, quant_type) - marlin_qqq_q_w = marlin_qqq_weights(q_w, size_k, size_n, num_bits, - weight_perm, group_size) - marlin_qqq_s_group, marlin_qqq_s_channel = marlin_qqq_permute_scales( - s_group, s_channel, size_k, size_n, group_size) - - # Create result - res_list = [ - w_ref, marlin_qqq_q_w, marlin_qqq_s_group, marlin_qqq_s_channel - ] - for i in range(len(res_list)): - res_list[i] = res_list[i].to(w.device) - - return res_list diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index 428e9e99aa88..3cfaca6230b1 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -9,8 +9,6 @@ import torch from 
vllm._custom_ops import cutlass_scaled_mm_supports_fp4 -from vllm.model_executor.layers.quantization.qqq import ( - MARLIN_QQQ_SUPPORTED_NUM_BITS) from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types @@ -386,89 +384,6 @@ def gptq_quantize_weights(w: torch.Tensor, return w_ref, w_q, w_s, g_idx, rand_perm -# QQQ employs different quant schemes for per-group and -# per-channel quantization. -def qqq_quantize_weights(w: torch.Tensor, num_bits: int, group_size: int): - orig_device = w.device - size_k, size_n = w.shape - - assert w.is_floating_point(), "w must be float" - assert num_bits in MARLIN_QQQ_SUPPORTED_NUM_BITS, \ - f"Unsupported num_bits = {num_bits}" - assert group_size in SUPPORTED_GROUP_SIZES + [ - size_k - ], f"Unsupported groupsize = {group_size}" - - if group_size == -1: - group_size = size_k - assert group_size <= size_k - - if group_size < size_k: - # Reshape to [groupsize, -1] - w = w.reshape((-1, group_size, size_n)) - w = w.permute(1, 0, 2) - w = w.reshape((group_size, -1)) - - max_q_val = 2**num_bits - 1 - half_q_val = (max_q_val + 1) // 2 - - # Compute scale for each group - s_group = torch.max(torch.abs(w), 0, keepdim=True)[0] - s_group *= 2 / max_q_val # 2 => symmetric - - # Quantize - q_w = torch.round(w / s_group).int() - q_w += half_q_val - q_w = torch.clamp(q_w, 0, max_q_val) - # Compute ref (dequantized) - w_ref = (q_w - half_q_val).half() * s_group - - # Restore original shapes - def reshape_w(w): - w = w.reshape((group_size, -1, size_n)) - w = w.permute(1, 0, 2) - w = w.reshape((size_k, size_n)).contiguous() - return w - - q_w = reshape_w(q_w) - w_ref = reshape_w(w_ref) - - # Compute int8 quantization scale for each channel - s_channel = torch.max(torch.abs(w_ref), 0, keepdim=True)[0] - s_channel /= 127.0 - t_int8 = (w_ref / s_channel).round().clamp(-128, 127).to(torch.int8) - w_ref = t_int8.half() * s_channel - s_channel = s_channel.reshape(1, -1).to(dtype=torch.float) - - # Fuse scales - s_group = (s_group.reshape(-1, size_n).contiguous() / - s_channel).to(dtype=torch.half) - else: - max_q_val = 2**(num_bits - 1) - 1 - - # Compute scale for each channel - s_channel = torch.max(torch.abs(w), 0, keepdim=True)[0] - s_channel /= max_q_val - - # Quantize - q_w = torch.round(w / s_channel).int() - q_w = torch.clamp(q_w, -max_q_val, max_q_val) - # Compute ref (dequantized) - w_ref = q_w.half() * s_channel - - s_group = torch.tensor([], dtype=torch.half) - # div 2 ** (8 - self.bits)) to offset right shift in unpacking - s_channel /= (2**(8 - num_bits)) - s_channel = s_channel.reshape(-1, size_n).contiguous().to(torch.float) - - return ( - w_ref.to(device=orig_device), - q_w.to(device=orig_device), - s_group.to(device=orig_device), - s_channel.to(device=orig_device), - ) - - def sort_weights(q_w: torch.Tensor, g_idx: torch.Tensor): orig_device = q_w.device From c68cadbd16ebf0f497b39fc59a784010c6a61a20 Mon Sep 17 00:00:00 2001 From: bigmoyan Date: Thu, 21 Aug 2025 03:59:54 +0800 Subject: [PATCH 193/231] [Fix] correct tool_id for kimi-k2 when use tool_choice=required (#21259) Co-authored-by: wangzhengtao Signed-off-by: Duncan Moss --- .../test_completion_with_function_calling.py | 314 +++++++++++------- tests/utils.py | 10 +- vllm/entrypoints/chat_utils.py | 17 +- vllm/entrypoints/openai/protocol.py | 4 +- vllm/entrypoints/openai/serving_chat.py | 64 +++- .../tool_parsers/deepseekv3_tool_parser.py | 4 +- .../granite_20b_fc_tool_parser.py | 4 +- .../tool_parsers/granite_tool_parser.py | 4 +- 
.../openai/tool_parsers/hermes_tool_parser.py | 4 +- .../tool_parsers/internlm2_tool_parser.py | 4 +- .../openai/tool_parsers/jamba_tool_parser.py | 4 +- .../openai/tool_parsers/llama_tool_parser.py | 4 +- .../tool_parsers/minimax_tool_parser.py | 4 +- .../tool_parsers/phi4mini_tool_parser.py | 4 +- .../openai/tool_parsers/xlam_tool_parser.py | 4 +- 15 files changed, 283 insertions(+), 166 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index a5b081f86107..4ef5d4e8a699 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -13,6 +13,127 @@ # any model with a chat template should work here MODEL_NAME = "Qwen/Qwen3-0.6B" +tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": + "The city to find the weather for, e.g. 'Vienna'", + "default": "Vienna", + }, + "country": { + "type": + "string", + "description": + "The country that the city is in, e.g. 'Austria'", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, + "options": { + "$ref": "#/$defs/WeatherOptions", + "description": "Optional parameters for weather query", + }, + }, + "required": ["country", "unit"], + "$defs": { + "WeatherOptions": { + "title": "WeatherOptions", + "type": "object", + "additionalProperties": False, + "properties": { + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "default": "celsius", + "description": "Temperature unit", + "title": "Temperature Unit", + }, + "include_forecast": { + "type": "boolean", + "default": False, + "description": + "Whether to include a 24-hour forecast", + "title": "Include Forecast", + }, + "language": { + "type": "string", + "default": "zh-CN", + "description": "Language of the response", + "title": "Language", + "enum": ["zh-CN", "en-US", "ja-JP"], + }, + }, + }, + }, + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_forecast", + "description": "Get the weather forecast for a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": + "The city to get the forecast for, e.g. 'Vienna'", + "default": "Vienna", + }, + "country": { + "type": + "string", + "description": + "The country that the city is in, e.g. 'Austria'", + }, + "days": { + "type": + "integer", + "description": + "Number of days to get the forecast for (1-7)", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["country", "days", "unit"], + }, + }, + }, +] + +messages = [ + { + "role": "user", + "content": "Hi! How are you doing today?" + }, + { + "role": "assistant", + "content": "I'm doing well! How can I help you?" 
+ }, + { + "role": + "user", + "content": + "Can you tell me what the current weather is in Berlin and the "\ + "forecast for the next 5 days, in fahrenheit?", + }, +] + @pytest.fixture(scope="module") def server(): # noqa: F811 @@ -27,6 +148,8 @@ def server(): # noqa: F811 "hermes", "--reasoning-parser", "qwen3", + "--gpu-memory-utilization", + "0.4" ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -54,129 +177,6 @@ async def client(server): async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str, stream: bool, tool_choice: Union[str, dict], enable_thinking: bool): - tools = [ - { - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "city": { - "type": "string", - "description": - "The city to find the weather for, e.g. 'Vienna'", - "default": "Vienna", - }, - "country": { - "type": - "string", - "description": - "The country that the city is in, e.g. 'Austria'", - }, - "unit": { - "type": "string", - "description": - "The unit to fetch the temperature in", - "enum": ["celsius", "fahrenheit"], - }, - "options": { - "$ref": "#/$defs/WeatherOptions", - "description": - "Optional parameters for weather query", - }, - }, - "required": ["country", "unit"], - "$defs": { - "WeatherOptions": { - "title": "WeatherOptions", - "type": "object", - "additionalProperties": False, - "properties": { - "unit": { - "type": "string", - "enum": ["celsius", "fahrenheit"], - "default": "celsius", - "description": "Temperature unit", - "title": "Temperature Unit", - }, - "include_forecast": { - "type": "boolean", - "default": False, - "description": - "Whether to include a 24-hour forecast", - "title": "Include Forecast", - }, - "language": { - "type": "string", - "default": "zh-CN", - "description": "Language of the response", - "title": "Language", - "enum": ["zh-CN", "en-US", "ja-JP"], - }, - }, - }, - }, - }, - }, - }, - { - "type": "function", - "function": { - "name": "get_forecast", - "description": "Get the weather forecast for a given location", - "parameters": { - "type": "object", - "properties": { - "city": { - "type": "string", - "description": - "The city to get the forecast for, e.g. 'Vienna'", - "default": "Vienna", - }, - "country": { - "type": - "string", - "description": - "The country that the city is in, e.g. 'Austria'", - }, - "days": { - "type": - "integer", - "description": - "Number of days to get the forecast for (1-7)", - }, - "unit": { - "type": "string", - "description": - "The unit to fetch the temperature in", - "enum": ["celsius", "fahrenheit"], - }, - }, - "required": ["country", "days", "unit"], - }, - }, - }, - ] - - messages = [ - { - "role": "user", - "content": "Hi! How are you doing today?" - }, - { - "role": "assistant", - "content": "I'm doing well! How can I help you?" 
- }, - { - "role": - "user", - "content": - "Can you tell me what the current weather is in Berlin and the "\ - "forecast for the next 5 days, in fahrenheit?", - }, - ] if not stream: # Non-streaming test chat_completion = await client.chat.completions.create( @@ -216,3 +216,71 @@ async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str, output.extend(chunk.choices[0].delta.tool_calls) assert len(output) > 0 + + +@pytest.fixture(scope="module") +def k2_server(): # noqa: F811 + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "half", + "--enable-auto-tool-choice", + "--guided-decoding-backend", + "xgrammar", + "--tool-call-parser", + "hermes", + "--reasoning-parser", + "qwen3", + "--gpu-memory-utilization", + "0.4", + ] + # hack to test kimi_k2 tool use tool_id format. + # avoid error in is_deepseek_mla check by setting kv_lora_rank=null + with RemoteOpenAIServer(MODEL_NAME, + args, + override_hf_configs={ + "model_type": 'kimi_k2', + 'kv_lora_rank': None + }) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def k2_client(k2_server): + async with k2_server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("stream", [True, False]) +@pytest.mark.parametrize("tool_choice", ["required"]) +async def test_tool_id_kimi_k2(k2_client: openai.AsyncOpenAI, model_name: str, + stream: bool, tool_choice: str): + + if not stream: + # Non-streaming test + chat_completion = await k2_client.chat.completions.create( + messages=messages, + model=model_name, + tools=tools, + tool_choice=tool_choice) + assert chat_completion.choices[0].message.tool_calls is not None + assert len(chat_completion.choices[0].message.tool_calls) > 0 + assert chat_completion.choices[0].message.tool_calls[ + 0].id == 'functions.get_current_weather:0' + else: + # Streaming test + output_stream = await k2_client.chat.completions.create( + messages=messages, + model=model_name, + tools=tools, + tool_choice=tool_choice, + stream=True) + + output = [] + async for chunk in output_stream: + if chunk.choices and chunk.choices[0].delta.tool_calls: + output.extend(chunk.choices[0].delta.tool_calls) + for o in output: + assert o.id is None or o.id == 'functions.get_current_weather:0' diff --git a/tests/utils.py b/tests/utils.py index e98707fb4447..4dba5494665a 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -5,6 +5,7 @@ import copy import functools import importlib +import json import os import signal import subprocess @@ -101,7 +102,8 @@ def __init__(self, env_dict: Optional[dict[str, str]] = None, seed: Optional[int] = 0, auto_port: bool = True, - max_wait_seconds: Optional[float] = None) -> None: + max_wait_seconds: Optional[float] = None, + override_hf_configs: Optional[dict[str, Any]] = None) -> None: if auto_port: if "-p" in vllm_serve_args or "--port" in vllm_serve_args: raise ValueError("You have manually specified the port " @@ -120,6 +122,12 @@ def __init__(self, vllm_serve_args = vllm_serve_args + ["--seed", str(seed)] + if override_hf_configs is not None: + vllm_serve_args = vllm_serve_args + [ + "--hf-overrides", + json.dumps(override_hf_configs) + ] + parser = FlexibleArgumentParser( description="vLLM's remote OpenAI server.") subparsers = parser.add_subparsers(required=False, dest="subparser") diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 74c8093f4967..87772a499f42 100644 --- 
a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1345,5 +1345,18 @@ def apply_mistral_chat_template( "template") raise ValueError(str(e)) from e -def random_tool_call_id() -> str: - return f"chatcmpl-tool-{random_uuid()}" +def get_history_tool_calls_cnt(conversation: list[ConversationMessage]): + idx = 0 + for msg in conversation: + if msg['role'] == 'assistant': + tool_calls = msg.get('tool_calls') + idx += len(list(tool_calls)) if tool_calls is not None else 0 # noqa + return idx + +def make_tool_call_id(id_type:str='random', func_name=None, idx=None): + + if id_type=='kimi_k2': + return f'functions.{func_name}:{idx}' + else: + # by default return random + return f"chatcmpl-tool-{random_uuid()}" diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 39facd4d53d3..a44868973f5d 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -38,7 +38,7 @@ from vllm import envs from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, - random_tool_call_id) + make_tool_call_id) from vllm.entrypoints.score_utils import (ScoreContentPartParam, ScoreMultiModalParam) from vllm.logger import init_logger @@ -1634,7 +1634,7 @@ class FunctionCall(OpenAIBaseModel): class ToolCall(OpenAIBaseModel): - id: str = Field(default_factory=random_tool_call_id) + id: str = Field(default_factory=make_tool_call_id) type: Literal["function"] = "function" function: FunctionCall diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index d57868847eed..65aac23ee618 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -19,7 +19,8 @@ from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, ConversationMessage, - random_tool_call_id) + get_history_tool_calls_cnt, + make_tool_call_id) from vllm.entrypoints.harmony_utils import ( get_developer_message, get_stop_tokens_for_assistant_actions, get_streamable_parser_for_assistant, get_system_message, parse_chat_input, @@ -133,6 +134,10 @@ def __init__( source = "model" if source == "auto" else source logger.info("Using default chat sampling params from %s: %s", source, self.default_sampling_params) + if self.model_config.hf_config.model_type == 'kimi_k2': + self.tool_call_id_type = 'kimi_k2' + else: + self.tool_call_id_type = 'random' self.use_harmony = model_config.hf_config.model_type == "gpt_oss" if self.use_harmony: @@ -379,6 +384,7 @@ def extract_tool_call_required_streaming( current_text: Optional[str], delta_text: str, function_name_returned: bool, + tool_call_idx: Optional[int] = None ) -> tuple[Optional[DeltaMessage], bool]: if current_text is None or current_text == "": # if the current text is empty, we cannot parse it @@ -424,8 +430,12 @@ def extract_tool_call_required_streaming( current_tool_call = obj[-2] function_name_returned = True + tool_call_id = make_tool_call_id( + id_type=self.tool_call_id_type, + func_name=current_tool_call["name"], + idx=tool_call_idx) delta_message = DeltaMessage(tool_calls=[ - DeltaToolCall(id=random_tool_call_id(), + DeltaToolCall(id=tool_call_id, function=DeltaFunctionCall( name=current_tool_call["name"], arguments=arguments), @@ -491,6 +501,10 @@ async def chat_completion_stream_generator( all_previous_token_ids: Optional[list[list[int]]] function_name_returned = [False] * num_choices + if self.tool_call_id_type == 'kimi_k2': + history_tool_call_cnt = 
get_history_tool_calls_cnt(conversation) + else: + history_tool_call_cnt = 0 # Always track previous_texts for comprehensive output logging previous_texts = [""] * num_choices @@ -673,7 +687,6 @@ async def chat_completion_stream_generator( previous_text = previous_texts[i] previous_token_ids = all_previous_token_ids[i] current_text = previous_text + delta_text - # avoid the None + list error. if previous_token_ids: current_token_ids = previous_token_ids + as_list( @@ -733,7 +746,7 @@ async def chat_completion_stream_generator( index=i) else: delta_tool_call = DeltaToolCall( - id=random_tool_call_id(), + id=make_tool_call_id(), type="function", function=DeltaFunctionCall( name=tool_choice_function_name, @@ -764,7 +777,11 @@ async def chat_completion_stream_generator( previous_text=previous_text, current_text=content, delta_text=delta_text, - function_name_returned=fn_name_returned)) + function_name_returned=fn_name_returned, + tool_call_idx=history_tool_call_cnt)) + if (delta_message and delta_message.tool_calls and + delta_message.tool_calls[0].id is not None): + history_tool_call_cnt += 1 # update the previous values for the next iteration previous_texts[i] = current_text @@ -1089,6 +1106,10 @@ async def chat_completion_full_generator( assert final_res is not None choices: list[ChatCompletionResponseChoice] = [] + if self.tool_call_id_type == 'kimi_k2': + history_tool_call_cnt = get_history_tool_calls_cnt(conversation) + else: + history_tool_call_cnt = 0 role = self.get_chat_request_role(request) for output in final_res.outputs: @@ -1194,17 +1215,26 @@ async def chat_completion_full_generator( assert content is not None tool_calls = TypeAdapter( list[FunctionDefinition]).validate_json(content) + tool_call_ids = [] + for tool_call in tool_calls: + tool_call_ids.append( + make_tool_call_id(id_type=self.tool_call_id_type, + func_name=tool_call.name, + idx=history_tool_call_cnt)) + history_tool_call_cnt += 1 message = ChatMessage( role=role, content="", - reasoning_content=reasoning_content, tool_calls=[ - tool_call_class(function=FunctionCall( - name=tool_call.name, - arguments=json.dumps(tool_call.parameters, - ensure_ascii=False))) - for tool_call in tool_calls - ]) + tool_call_class(id=tool_call_ids[i], + function=FunctionCall( + name=tool_call.name, + arguments=json.dumps( + tool_call.parameters, + ensure_ascii=False))) + for i, tool_call in enumerate(tool_calls) + ], + reasoning_content=reasoning_content) # if the request doesn't use tool choice # OR specifies to not use a tool @@ -1248,7 +1278,6 @@ async def chat_completion_full_generator( if (tool_call_info.content and len(tool_call_info.content) > 0): ret_content = tool_call_info.content - message = ChatMessage(role=role, reasoning_content=reasoning_content, content=ret_content) @@ -1327,12 +1356,11 @@ async def chat_completion_full_generator( elif choice.message.tool_calls: # For tool calls, log the function name and arguments tool_call_descriptions = [] - for tool_call in choice.message.tool_calls: - if hasattr(tool_call.function, "name") and hasattr( - tool_call.function, "arguments"): + for tc in choice.message.tool_calls: + if hasattr(tc.function, "name") and hasattr( + tc.function, "arguments"): tool_call_descriptions.append( - f"{tool_call.function.name}({tool_call.function.arguments})" - ) + f"{tc.function.name}({tc.function.arguments})") tool_calls_str = ", ".join(tool_call_descriptions) output_text = f"[tool_calls: {tool_calls_str}]" diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py 
b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py index da4760ad1b64..ac272b0c3b20 100644 --- a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py @@ -6,7 +6,7 @@ import regex as re -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -267,7 +267,7 @@ def extract_tool_calls_streaming( DeltaToolCall( index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True), diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index 5508ba6a3940..824b100f357b 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -10,7 +10,7 @@ import regex as re from partial_json_parser.core.options import Allow -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -203,7 +203,7 @@ def extract_tool_calls_streaming( delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py index fcc5b7edda83..ac517616a95b 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -8,7 +8,7 @@ import partial_json_parser from partial_json_parser.core.options import Allow -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -185,7 +185,7 @@ def extract_tool_calls_streaming( delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index d126130ab9bc..a6ce33af6bd0 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -9,7 +9,7 @@ import regex as re from partial_json_parser.core.options import Allow -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -307,7 +307,7 @@ def extract_tool_calls_streaming( return DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git 
a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index 92004de030d1..6ef8fadf59ac 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -8,7 +8,7 @@ import partial_json_parser from partial_json_parser.core.options import Allow -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -107,7 +107,7 @@ def extract_tool_calls_streaming( delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py index 66b483d8b0f6..3b41f6034704 100644 --- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -9,7 +9,7 @@ import regex as re from partial_json_parser.core.options import Allow -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -222,7 +222,7 @@ def extract_tool_calls_streaming( delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 194a144ad576..31b19c8db416 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -10,7 +10,7 @@ from partial_json_parser.core.options import Allow from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -213,7 +213,7 @@ def extract_tool_calls_streaming( delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py index 226309ef293a..283e6095013d 100644 --- a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py @@ -7,7 +7,7 @@ import regex as re -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -394,7 +394,7 @@ def _ensure_state_arrays(self, tool_count: int) -> None: sent_tools.append({ "sent_name": False, "sent_arguments": "", - "id": random_tool_call_id(), + "id": make_tool_call_id(), }) while len(tool_ids) 
< tool_count: diff --git a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py index 5501028cf36b..85dd56213c6a 100644 --- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py @@ -8,7 +8,7 @@ import regex as re from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaMessage, ExtractedToolCallInformation, @@ -74,7 +74,7 @@ def extract_tool_calls( tool_calls: list[ToolCall] = [ ToolCall( - id=random_tool_call_id(), + id=make_tool_call_id(), type="function", function=FunctionCall( name=raw_function_call["name"], diff --git a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py index 321718b1c950..87cd413b3720 100644 --- a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py @@ -7,7 +7,7 @@ import regex as re -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -226,7 +226,7 @@ def extract_tool_calls_streaming( function_name = name_match.group(1) # The test expects us to send just the name first - tool_id = random_tool_call_id() + tool_id = make_tool_call_id() delta = DeltaMessage(tool_calls=[ DeltaToolCall( index=0, From fa40ad397675688820bca59a015fd1d954f02a2f Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 20 Aug 2025 13:03:37 -0700 Subject: [PATCH 194/231] [Frontend] improve error logging of chat completion (#22957) Signed-off-by: Chen Zhang Signed-off-by: Duncan Moss --- vllm/entrypoints/openai/api_server.py | 74 +++++++++++++++++++++------ 1 file changed, 57 insertions(+), 17 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 24148bcef235..14ba8aa64183 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -600,8 +600,11 @@ async def create_responses(request: ResponsesRequest, raw_request: Request): if handler is None: return base(raw_request).create_error_response( message="The model does not support Responses API") - - generator = await handler.create_responses(request, raw_request) + try: + generator = await handler.create_responses(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), @@ -618,7 +621,11 @@ async def retrieve_responses(response_id: str, raw_request: Request): return base(raw_request).create_error_response( message="The model does not support Responses API") - response = await handler.retrieve_responses(response_id) + try: + response = await handler.retrieve_responses(response_id) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(response, ErrorResponse): return JSONResponse(content=response.model_dump(), @@ -633,7 +640,11 @@ async def cancel_responses(response_id: str, raw_request: Request): return base(raw_request).create_error_response( message="The model 
does not support Responses API") - response = await handler.cancel_responses(response_id) + try: + response = await handler.cancel_responses(response_id) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(response, ErrorResponse): return JSONResponse(content=response.model_dump(), @@ -667,9 +678,11 @@ async def create_chat_completion(request: ChatCompletionRequest, if handler is None: return base(raw_request).create_error_response( message="The model does not support Chat Completions API") - - generator = await handler.create_chat_completion(request, raw_request) - + try: + generator = await handler.create_chat_completion(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.error.code) @@ -742,7 +755,11 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): return base(raw_request).create_error_response( message="The model does not support Embeddings API") - generator = await handler.create_embedding(request, raw_request) + try: + generator = await handler.create_embedding(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), @@ -770,8 +787,11 @@ async def create_pooling(request: PoolingRequest, raw_request: Request): if handler is None: return base(raw_request).create_error_response( message="The model does not support Pooling API") - - generator = await handler.create_pooling(request, raw_request) + try: + generator = await handler.create_pooling(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.error.code) @@ -791,7 +811,11 @@ async def create_classify(request: ClassificationRequest, return base(raw_request).create_error_response( message="The model does not support Classification API") - generator = await handler.create_classify(request, raw_request) + try: + generator = await handler.create_classify(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.error.code) @@ -820,7 +844,11 @@ async def create_score(request: ScoreRequest, raw_request: Request): return base(raw_request).create_error_response( message="The model does not support Score API") - generator = await handler.create_score(request, raw_request) + try: + generator = await handler.create_score(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.error.code) @@ -878,8 +906,12 @@ async def create_transcriptions(raw_request: Request, message="The model does not support Transcriptions API") audio_data = await request.file.read() - generator = await handler.create_transcription(audio_data, request, - raw_request) + try: + 
generator = await handler.create_transcription(audio_data, request, + raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), @@ -919,8 +951,12 @@ async def create_translations(request: Annotated[TranslationRequest, message="The model does not support Translations API") audio_data = await request.file.read() - generator = await handler.create_translation(audio_data, request, - raw_request) + try: + generator = await handler.create_translation(audio_data, request, + raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), @@ -949,7 +985,11 @@ async def do_rerank(request: RerankRequest, raw_request: Request): if handler is None: return base(raw_request).create_error_response( message="The model does not support Rerank (Score) API") - generator = await handler.do_rerank(request, raw_request) + try: + generator = await handler.do_rerank(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.error.code) From 8396597ae7d5b10b164b29f722e03ed275c7ca6e Mon Sep 17 00:00:00 2001 From: Saurabh Misra Date: Wed, 20 Aug 2025 13:17:11 -0700 Subject: [PATCH 195/231] [Perf] Speed up function `_convert_tokens_to_string_with_added_encoders` by 13.7x (#20413) Signed-off-by: Saurabh Misra Signed-off-by: Aseem Saxena Co-authored-by: codeflash-ai[bot] <148906541+codeflash-ai[bot]@users.noreply.github.com> Co-authored-by: Aseem Saxena Signed-off-by: Duncan Moss --- vllm/transformers_utils/detokenizer_utils.py | 25 ++++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py index be1040c3e014..101f31d39cc1 100644 --- a/vllm/transformers_utils/detokenizer_utils.py +++ b/vllm/transformers_utils/detokenizer_utils.py @@ -23,27 +23,32 @@ def _convert_tokens_to_string_with_added_encoders( # NOTE(woosuk): The following code is slow because it runs a for loop over # the output_tokens. In Python, running a for loop over a list can be slow # even when the loop body is very simple. 
+ # Performance improvements: avoid repeated attribute and function lookups; + # localize frequently used objects; + sub_texts: list[str] = [] current_sub_text: list[str] = [] - all_special_tokens = set(tokenizer.all_special_tokens) + convert_tokens_to_string = tokenizer.convert_tokens_to_string + added_vocab_set = set(tokenizer.get_added_vocab()) + all_special_tokens = set( + tokenizer.all_special_tokens) if skip_special_tokens else () + for token in output_tokens: - if skip_special_tokens and token in all_special_tokens: + # Use precomputed set for skip-special check + if token in all_special_tokens: continue - if token in tokenizer.get_added_vocab(): + if token in added_vocab_set: if current_sub_text: - sub_text = tokenizer.convert_tokens_to_string(current_sub_text) - sub_texts.append(sub_text) - current_sub_text = [] + sub_texts.append(convert_tokens_to_string(current_sub_text)) + current_sub_text.clear() sub_texts.append(token) else: current_sub_text.append(token) if current_sub_text: - sub_text = tokenizer.convert_tokens_to_string(current_sub_text) - sub_texts.append(sub_text) + sub_texts.append(convert_tokens_to_string(current_sub_text)) if spaces_between_special_tokens: return " ".join(sub_texts) - else: - return "".join(sub_texts) + return "".join(sub_texts) # 5 is an arbitrary value that should work for all From 410423ecee202f07fa93326587a2fa84ccd108e0 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 20 Aug 2025 16:28:30 -0400 Subject: [PATCH 196/231] Do not use eval() to convert unknown types (#23266) Signed-off-by: Russell Bryant Signed-off-by: Duncan Moss --- .../openai/tool_parsers/qwen3coder_tool_parser.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py index cf4d0b231aee..2501d6739e8f 100644 --- a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py @@ -208,15 +208,10 @@ def convert_param_value(param_value: str, param_name: str, "valid JSON object in tool '%s', will try other " "methods to parse it.", param_value, param_name, func_name) - try: - converted_value = eval(param_value) - return converted_value - except Exception: - logger.warning( - "Parsed value '%s' of parameter '%s' cannot be " - "converted via Python `eval()` in tool '%s', " - "degenerating to string.", param_value, param_name, - func_name) + logger.warning( + "Parameter '%s' has unknown type '%s'. 
" + "The value will be treated as a string.", param_name, + param_type) return param_value # Extract function name From e0be5ba7b0e8e6f5889b5276c45f78e6249c804b Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Thu, 21 Aug 2025 05:07:28 +0800 Subject: [PATCH 197/231] [Feature] use --eplb_config to set eplb param (#20562) Signed-off-by: rongfu.leng Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: rongfu.leng Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Duncan Moss --- vllm/config/__init__.py | 3 +- vllm/config/parallel.py | 108 +++++++++++++++++----- vllm/distributed/eplb/eplb_state.py | 4 +- vllm/engine/arg_utils.py | 63 +++++++++---- vllm/model_executor/models/deepseek_v2.py | 4 +- vllm/model_executor/models/glm4_moe.py | 4 +- vllm/model_executor/models/qwen3_moe.py | 7 +- vllm/v1/worker/gpu_model_runner.py | 4 +- vllm/v1/worker/gpu_worker.py | 4 +- 9 files changed, 149 insertions(+), 52 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 62dfd4333bee..959f111ced22 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -33,7 +33,8 @@ PrefixCachingHashAlgo) from vllm.config.compilation import (CompilationConfig, CompilationLevel, CUDAGraphMode, PassConfig) -from vllm.config.parallel import DistributedExecutorBackend, ParallelConfig +from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig, + ParallelConfig) from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy from vllm.config.utils import ConfigType, config from vllm.logger import init_logger diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 7a9e68f0ea33..2b716a77066a 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Any, Literal, Optional, Union import torch -from pydantic import model_validator +from pydantic import TypeAdapter, model_validator from pydantic.dataclasses import dataclass from torch.distributed import ProcessGroup, ReduceOp from typing_extensions import Self @@ -32,6 +32,38 @@ DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"] +@config +@dataclass +class EPLBConfig: + """Configuration for Expert Parallel Load Balancing (EP).""" + + window_size: int = 1000 + """Window size for expert load recording.""" + step_interval: int = 3000 + """ + Interval for rearranging experts in expert parallelism. + + Note that if this is greater than the EPLB window size, only the metrics + of the last `lb_window_size` steps will be used for rearranging experts. + """ + + num_redundant_experts: int = 0 + """Number of redundant experts to use for expert parallelism.""" + + log_balancedness: bool = False + """ + Log the balancedness each step of expert parallelism. + This is turned off by default since it will cause communication overhead. + """ + + @classmethod + def from_cli(cls, cli_value: str) -> "EPLBConfig": + """Parse the CLI value for the compilation config. + -O1, -O2, -O3, etc. is handled in FlexibleArgumentParser. 
+ """ + return TypeAdapter(EPLBConfig).validate_json(cli_value) + + @config @dataclass class ParallelConfig: @@ -75,22 +107,24 @@ class ParallelConfig: """Use expert parallelism instead of tensor parallelism for MoE layers.""" enable_eplb: bool = False """Enable expert parallelism load balancing for MoE layers.""" - num_redundant_experts: int = 0 - """Number of redundant experts to use for expert parallelism.""" - eplb_window_size: int = 1000 - """Window size for expert load recording.""" - eplb_step_interval: int = 3000 - """ - Interval for rearranging experts in expert parallelism. - - Note that if this is greater than the EPLB window size, only the metrics - of the last `eplb_window_size` steps will be used for rearranging experts. - """ - eplb_log_balancedness: bool = False - """ - Log the balancedness each step of expert parallelism. - This is turned off by default since it will cause communication overhead. - """ + eplb_config: EPLBConfig = field(default_factory=EPLBConfig) + """Expert parallelism configuration.""" + num_redundant_experts: Optional[int] = None + """`num_redundant_experts` is deprecated and has been replaced with + `eplb_config.num_redundant_experts`. This will be removed in v0.12.0. + Please use `eplb_config.num_redundant_experts` instead.""" + eplb_window_size: Optional[int] = None + """`eplb_window_size` is deprecated and has been replaced with + `eplb_config.window_size`. This will be removed in v0.12.0. + Please use `eplb_config.window_size` instead.""" + eplb_step_interval: Optional[int] = None + """`eplb_step_interval` is deprecated and has been replaced with + `eplb_config.step_interval`. This will be removed in v0.12.0. + Please use `eplb_config.step_interval` instead.""" + eplb_log_balancedness: Optional[bool] = None + """`eplb_log_balancedness` is deprecated and has been replaced with + `eplb_config.log_balancedness`. This will be removed in v0.12.0. + Please use `eplb_config.log_balancedness` instead.""" max_parallel_loading_workers: Optional[int] = None """Maximum number of parallel loading workers when loading model @@ -237,6 +271,38 @@ def compute_hash(self): return hashlib.sha256(str(factors).encode()).hexdigest() def __post_init__(self) -> None: + # Forward deprecated fields to their new location + if self.num_redundant_experts is not None: + self.eplb_config.num_redundant_experts = ( + self.num_redundant_experts) + logger.warning_once( + "num_redundant_experts is deprecated and has been replaced " + "with eplb_config.num_redundant_experts. This will be removed " + "in v0.12.0. Changing this field after initialization will " + "have no effect.") + if self.eplb_window_size is not None: + self.eplb_config.window_size = self.eplb_window_size + logger.warning_once( + "eplb_window_size is deprecated and has been replaced " + "with eplb_config.window_size. This will be removed " + "in v0.12.0. Changing this field after initialization will " + "have no effect.") + if self.eplb_step_interval is not None: + self.eplb_config.step_interval = self.eplb_step_interval + logger.warning_once( + "eplb_step_interval is deprecated and has been replaced " + "with eplb_config.step_interval. This will be removed " + "in v0.12.0. Changing this field after initialization will " + "have no effect.") + if self.eplb_log_balancedness is not None: + self.eplb_config.log_balancedness = self.eplb_log_balancedness + logger.warning_once( + "eplb_log_balancedness is deprecated and has been replaced " + "with eplb_config.log_balancedness. This will be removed " + "in v0.12.0. 
Changing this field after initialization will " + "have no effect.") + + # Continue with the rest of the initialization self.world_size = self.pipeline_parallel_size * \ self.tensor_parallel_size @@ -275,10 +341,10 @@ def __post_init__(self) -> None: raise ValueError( "Expert parallelism load balancing is only supported on " "CUDA devices now.") - if self.num_redundant_experts < 0: + if self.eplb_config.num_redundant_experts < 0: raise ValueError( "num_redundant_experts must be non-negative, but got " - f"{self.num_redundant_experts}.") + f"{self.eplb_config.num_redundant_experts}.") if not self.enable_expert_parallel: raise ValueError( "enable_expert_parallel must be True to use EPLB.") @@ -289,10 +355,10 @@ def __post_init__(self) -> None: f"TP={self.tensor_parallel_size},DP={self.data_parallel_size}." ) else: - if self.num_redundant_experts != 0: + if self.eplb_config.num_redundant_experts != 0: raise ValueError( "num_redundant_experts should be used with EPLB." - f"{self.num_redundant_experts}.") + f"{self.eplb_config.num_redundant_experts}.") if self.distributed_executor_backend is None and self.world_size > 1: # We use multiprocessing by default if world_size fits on the # current node and we aren't in a ray placement group. diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index 979f2a06cec9..042acf40d67c 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -244,7 +244,7 @@ def build( dtype=torch.int32, device=device, ) - expert_load_window_size = parallel_config.eplb_window_size + expert_load_window_size = parallel_config.eplb_config.window_size expert_load_window = torch.zeros( (expert_load_window_size, model.num_moe_layers, model.num_physical_experts), @@ -253,7 +253,7 @@ def build( ) # Set the initial progress of rearrangement to 3/4 - eplb_step_interval = parallel_config.eplb_step_interval + eplb_step_interval = parallel_config.eplb_config.step_interval expert_rearrangement_step = max( 0, eplb_step_interval - eplb_step_interval // 4) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 6869c3f23f31..dcf78758946f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -25,7 +25,7 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, ConfigFormat, ConfigType, ConvertOption, DecodingConfig, DetailedTraceModules, Device, - DeviceConfig, DistributedExecutorBackend, + DeviceConfig, DistributedExecutorBackend, EPLBConfig, GuidedDecodingBackend, HfOverrides, KVEventsConfig, KVTransferConfig, LoadConfig, LogprobsMode, LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig, @@ -305,11 +305,12 @@ class EngineArgs: data_parallel_hybrid_lb: bool = False data_parallel_backend: str = ParallelConfig.data_parallel_backend enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel + eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config") enable_eplb: bool = ParallelConfig.enable_eplb - num_redundant_experts: int = ParallelConfig.num_redundant_experts - eplb_window_size: int = ParallelConfig.eplb_window_size - eplb_step_interval: int = ParallelConfig.eplb_step_interval - eplb_log_balancedness: bool = ParallelConfig.eplb_log_balancedness + num_redundant_experts: int = EPLBConfig.num_redundant_experts + eplb_window_size: int = EPLBConfig.window_size + eplb_step_interval: int = EPLBConfig.step_interval + eplb_log_balancedness: bool = EPLBConfig.log_balancedness max_parallel_loading_workers: Optional[ int] = 
ParallelConfig.max_parallel_loading_workers block_size: Optional[BlockSize] = CacheConfig.block_size @@ -454,6 +455,9 @@ def __post_init__(self): if isinstance(self.compilation_config, dict): self.compilation_config = CompilationConfig( **self.compilation_config) + if isinstance(self.eplb_config, dict): + self.eplb_config = EPLBConfig.from_cli(json.dumps( + self.eplb_config)) # Setup plugins from vllm.plugins import load_general_plugins load_general_plugins() @@ -661,14 +665,32 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: **parallel_kwargs["enable_expert_parallel"]) parallel_group.add_argument("--enable-eplb", **parallel_kwargs["enable_eplb"]) - parallel_group.add_argument("--num-redundant-experts", - **parallel_kwargs["num_redundant_experts"]) - parallel_group.add_argument("--eplb-window-size", - **parallel_kwargs["eplb_window_size"]) - parallel_group.add_argument("--eplb-step-interval", - **parallel_kwargs["eplb_step_interval"]) - parallel_group.add_argument("--eplb-log-balancedness", - **parallel_kwargs["eplb_log_balancedness"]) + parallel_group.add_argument("--eplb-config", + **parallel_kwargs["eplb_config"]) + parallel_group.add_argument( + "--num-redundant-experts", + type=int, + help= + "[DEPRECATED] --num-redundant-experts will be removed in v0.12.0.", + deprecated=True) + parallel_group.add_argument( + "--eplb-window-size", + type=int, + help="[DEPRECATED] --eplb-window-size will be removed in v0.12.0.", + deprecated=True) + parallel_group.add_argument( + "--eplb-step-interval", + type=int, + help= + "[DEPRECATED] --eplb-step-interval will be removed in v0.12.0.", + deprecated=True) + parallel_group.add_argument( + "--eplb-log-balancedness", + action=argparse.BooleanOptionalAction, + help= + "[DEPRECATED] --eplb-log-balancedness will be removed in v0.12.0.", + deprecated=True) + parallel_group.add_argument( "--max-parallel-loading-workers", **parallel_kwargs["max_parallel_loading_workers"]) @@ -1244,6 +1266,16 @@ def create_engine_config( "Currently, speculative decoding is not supported with " "async scheduling.") + # Forward the deprecated CLI args to the EPLB config. 
+ if self.num_redundant_experts is not None: + self.eplb_config.num_redundant_experts = self.num_redundant_experts + if self.eplb_window_size is not None: + self.eplb_config.window_size = self.eplb_window_size + if self.eplb_step_interval is not None: + self.eplb_config.step_interval = self.eplb_step_interval + if self.eplb_log_balancedness is not None: + self.eplb_config.log_balancedness = self.eplb_log_balancedness + parallel_config = ParallelConfig( pipeline_parallel_size=self.pipeline_parallel_size, tensor_parallel_size=self.tensor_parallel_size, @@ -1257,10 +1289,7 @@ def create_engine_config( data_parallel_hybrid_lb=self.data_parallel_hybrid_lb, enable_expert_parallel=self.enable_expert_parallel, enable_eplb=self.enable_eplb, - num_redundant_experts=self.num_redundant_experts, - eplb_window_size=self.eplb_window_size, - eplb_step_interval=self.eplb_step_interval, - eplb_log_balancedness=self.eplb_log_balancedness, + eplb_config=self.eplb_config, max_parallel_loading_workers=self.max_parallel_loading_workers, disable_custom_all_reduce=self.disable_custom_all_reduce, ray_workers_use_nsight=self.ray_workers_use_nsight, diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index f199da135ec7..d56224b4b7b3 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -132,10 +132,10 @@ def __init__( # Load balancing settings. vllm_config = get_current_vllm_config() - parallel_config = vllm_config.parallel_config + eplb_config = vllm_config.parallel_config.eplb_config self.enable_eplb = enable_eplb - self.n_redundant_experts = parallel_config.num_redundant_experts + self.n_redundant_experts = eplb_config.num_redundant_experts self.n_logical_experts = self.n_routed_experts self.n_physical_experts = (self.n_logical_experts + self.n_redundant_experts) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index aff491f9596c..fe5e46a99826 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -131,10 +131,10 @@ def __init__( # Load balancing settings. vllm_config = get_current_vllm_config() - parallel_config = vllm_config.parallel_config + eplb_config = vllm_config.parallel_config.eplb_config self.enable_eplb = enable_eplb - self.n_redundant_experts = parallel_config.num_redundant_experts + self.n_redundant_experts = eplb_config.num_redundant_experts self.n_logical_experts = self.n_routed_experts self.n_physical_experts = (self.n_logical_experts + self.n_redundant_experts) diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 05bbb0d2e899..2812f79a66b7 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -121,11 +121,11 @@ def __init__( # Load balancing settings. 
vllm_config = get_current_vllm_config() - parallel_config = vllm_config.parallel_config + eplb_config = vllm_config.parallel_config.eplb_config self.enable_eplb = enable_eplb self.n_logical_experts = self.n_routed_experts - self.n_redundant_experts = parallel_config.num_redundant_experts + self.n_redundant_experts = eplb_config.num_redundant_experts self.n_physical_experts = (self.n_logical_experts + self.n_redundant_experts) self.n_local_physical_experts = self.n_physical_experts // self.ep_size @@ -363,7 +363,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config = vllm_config.quant_config parallel_config = vllm_config.parallel_config enable_eplb = parallel_config.enable_eplb - self.num_redundant_experts = parallel_config.num_redundant_experts + eplb_config = parallel_config.eplb_config + self.num_redundant_experts = eplb_config.num_redundant_experts self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d9770226b14e..33747d6917a5 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1435,7 +1435,7 @@ def eplb_step(self, model, is_dummy, is_profile, - log_stats=self.parallel_config.eplb_log_balancedness, + log_stats=self.parallel_config.eplb_config.log_balancedness, ) def get_dp_padding(self, @@ -1977,7 +1977,7 @@ def load_model(self, eep_scale_up: bool = False) -> None: global_expert_load, old_global_expert_indices = ( EplbState.recv_state()) num_logical_experts = global_expert_load.shape[1] - self.parallel_config.num_redundant_experts = ( + self.parallel_config.eplb_config.num_redundant_experts = ( num_local_physical_experts * new_ep_size - num_logical_experts) assert old_global_expert_indices.shape[ 1] % num_local_physical_experts == 0 diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 22e639b97d09..d61177d4245d 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -515,7 +515,7 @@ def _reconfigure_moe(self, old_ep_size: int, assert self.model_runner.eplb_state is not None new_physical_experts = \ self.model_runner.eplb_state.physical_to_logical_map.shape[1] - parallel_config.num_redundant_experts = ( + parallel_config.eplb_config.num_redundant_experts = ( new_physical_experts - self.model_runner.eplb_state.logical_replica_count.shape[1]) global_expert_load = None @@ -531,7 +531,7 @@ def _reconfigure_moe(self, old_ep_size: int, assert self.model_runner.eplb_state is not None global_expert_load = self.model_runner.eplb_state.rearrange( self.model_runner.model, execute_shuffle=False) - parallel_config.num_redundant_experts = ( + parallel_config.eplb_config.num_redundant_experts = ( new_physical_experts - global_expert_load.shape[1]) prepare_communication_buffer_for_model(self.model_runner.model) self.model_runner.model.update_physical_experts_metadata( From 649fcea5aaee8718e2e8a03433a009fd3cb2c6db Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Aug 2025 05:15:34 +0800 Subject: [PATCH 198/231] [misc] fix multiple arch wheels for the nightly index (#23110) Signed-off-by: youkaichao Signed-off-by: Duncan Moss --- .buildkite/generate_index.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py index 7045d8810493..6b5a2a99356a 100644 --- a/.buildkite/generate_index.py +++ b/.buildkite/generate_index.py @@ -8,7 +8,8 @@

     <body>
         <h1>Links for vLLM</h1>
-        <a href="../{wheel_html_escaped}">{wheel}</a><br/>
+        <a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
+        <a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
     </body>
 </html>
""" @@ -21,7 +22,20 @@ with open("index.html", "w") as f: print(f"Generated index.html for {args.wheel}") + if "x86_64" in filename: + x86_wheel = filename + arm_wheel = filename.replace("x86_64", "aarch64") + elif "aarch64" in filename: + x86_wheel = filename.replace("aarch64", "x86_64") + arm_wheel = filename + else: + raise ValueError(f"Unsupported wheel: {filename}") # cloudfront requires escaping the '+' character f.write( - template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B")) + template.format( + x86_wheel=x86_wheel, + x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"), + arm_wheel=arm_wheel, + arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"), + ) ) From f54d68b0a2cd523e33b5512e4a3b1829be448a6e Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 20 Aug 2025 17:43:17 -0400 Subject: [PATCH 199/231] Remove chunked_prefill_enabled flag in V1 MLA (#23183) Signed-off-by: Matthew Bonanni Signed-off-by: Duncan Moss --- vllm/v1/attention/backends/mla/common.py | 50 +++++++++++------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index f2610671f769..646e4fec836b 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -416,7 +416,6 @@ def __init__(self, self.model_config = vllm_config.model_config cache_config = vllm_config.cache_config parallel_config = vllm_config.parallel_config - self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled self.num_heads = self.model_config.get_num_attention_heads( parallel_config) self.mla_dims = get_mla_dims(self.model_config) @@ -426,30 +425,28 @@ def __init__(self, if self.aot_schedule: self.page_size = self.kv_cache_spec.block_size - if self.chunked_prefill_enabled: - self.chunked_prefill_workspace_size = min( - # Max sure there is enough for 8 full length request or at least - # 4 pages of cache per request - max( - 8 * self.model_config.max_model_len, 4 * - scheduler_config.max_num_seqs * cache_config.block_size), - # For long-context models try not to over-allocate limiting - # kv-cache space, limiting it to 64k tokens, - # which would result in the workspace being: - # 2*(576)*(64*1024) = 144mb - # (assuming 576 MLA head dim, and fp16) - # which would result in up-projected context being - # 2*(192*128)*(64*1024) = 3gb - # (assuming 192 QK head dim, 128 heads, and fp16) - 128 * 1024) - assert self.chunked_prefill_workspace_size >= \ - scheduler_config.max_num_seqs * cache_config.block_size - self.chunked_prefill_workspace = torch.empty( - (self.chunked_prefill_workspace_size, - self.model_config.get_head_size()), - dtype=self.model_config.dtype, - device=device, - ) + self.chunked_prefill_workspace_size = min( + # Max sure there is enough for 8 full length request or at least + # 4 pages of cache per request + max(8 * self.model_config.max_model_len, + 4 * scheduler_config.max_num_seqs * cache_config.block_size), + # For long-context models try not to over-allocate limiting + # kv-cache space, limiting it to 64k tokens, + # which would result in the workspace being: + # 2*(576)*(64*1024) = 144mb + # (assuming 576 MLA head dim, and fp16) + # which would result in up-projected context being + # 2*(192*128)*(64*1024) = 3gb + # (assuming 192 QK head dim, 128 heads, and fp16) + 128 * 1024) + assert self.chunked_prefill_workspace_size >= \ + scheduler_config.max_num_seqs * cache_config.block_size + self.chunked_prefill_workspace = torch.empty( + 
(self.chunked_prefill_workspace_size, + self.model_config.get_head_size()), + dtype=self.model_config.dtype, + device=device, + ) self._use_cudnn_prefill = use_cudnn_prefill() self._use_fi_prefill = use_flashinfer_prefill() @@ -620,8 +617,7 @@ def build(self, reqs_start:] - query_start_loc[reqs_start] chunked_context_metadata = None - if self.chunked_prefill_enabled and num_prefills > 0 \ - and max_context_len_cpu > 0: + if max_context_len_cpu > 0: # NOTE: it is recommend you read the `Chunked Prefill` section # in the comment at the top of the file before trying to # understand the following code From a473c5bd3a1122a1cfffb7ff8ac3221c583469b2 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 20 Aug 2025 17:46:47 -0400 Subject: [PATCH 200/231] Feature/mla tests (#23195) Signed-off-by: Matthew Bonanni Signed-off-by: Matthew Bonanni Signed-off-by: Duncan Moss --- tests/v1/attention/test_attention_backends.py | 26 +- tests/v1/attention/test_mla_backends.py | 522 ++++++++++++++++++ tests/v1/attention/utils.py | 11 +- vllm/v1/attention/backends/mla/common.py | 16 +- 4 files changed, 551 insertions(+), 24 deletions(-) create mode 100644 tests/v1/attention/test_mla_backends.py diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py index ac08b9052cd8..60e04ad9069e 100644 --- a/tests/v1/attention/test_attention_backends.py +++ b/tests/v1/attention/test_attention_backends.py @@ -150,15 +150,15 @@ def create_and_prepopulate_kv_cache( # Permute the context blocks (excluding block 0 which is null) if randomize_blocks: - perm = torch.randperm( - blocks_end - 1) + 1 # Random permutation starting from block 1 + # Random permutation starting from block 1 + perm = torch.randperm(blocks_end - 1) + 1 else: - perm = torch.arange( - 1, blocks_end) # Sequential order starting from block 1 + # Sequential order starting from block 1 + perm = torch.arange(1, blocks_end) inv_perm = torch.zeros(blocks_end, dtype=torch.long, device=device) - inv_perm[1:] = torch.argsort( - perm) + 1 # Add 1 to account for starting from block 1 + # Add 1 to account for starting from block 1 + inv_perm[1:] = torch.argsort(perm) + 1 kv_cache[:, 1:blocks_end, ...] = kv_cache[:, perm, ...] # Construct the right block table @@ -281,7 +281,8 @@ def mock_get_per_layer_parameters(vllm_config, layer_names, impl_cls): @pytest.mark.parametrize("batch_spec_name", [ "small_decode", "small_prefill", "mixed_small", "medium_decode", - "medium_prefill", "mixed_medium" + "medium_prefill", "mixed_medium", "large_decode", "large_prefill", + "single_decode", "single_prefill" ]) @pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"]) def test_backend_correctness(batch_spec_name: str, model: str): @@ -302,7 +303,8 @@ def test_backend_correctness(batch_spec_name: str, model: str): """ batch_spec = BATCH_SPECS[batch_spec_name] vllm_config = create_vllm_config(model_name=model, - max_model_len=max(batch_spec.seq_lens)) + max_model_len=max(batch_spec.seq_lens), + num_gpu_blocks=8192) device = torch.device("cuda:0") kv_cache_spec = create_standard_kv_cache_spec(vllm_config) @@ -465,12 +467,6 @@ def test_backend_correctness(batch_spec_name: str, model: str): rtol=rtol, atol=atol) - if not all_close: - print(f"[{backend_name}] output differs from SDPA baseline. 
" - f"Max diff: {max_diff:.6f} (rel: {max_rel_diff:.6f})") - print(f"[{backend_name}] output: {backend_output}") - print(f"[{backend_name}] SDPA baseline: {sdpa_output}") - assert all_close, ( f"[{backend_name}] output differs from SDPA baseline. " - f"Max diff: {max_diff:.6f} (rel: {max_rel_diff:.6f})") + f"Max diff: {max_diff:.6f}, max rel diff: {max_rel_diff:.6f})") \ No newline at end of file diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py new file mode 100644 index 000000000000..24070358799e --- /dev/null +++ b/tests/v1/attention/test_mla_backends.py @@ -0,0 +1,522 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for v1 MLA backends without GPUModelRunner dependency.""" + +import pytest +import torch + +from tests.v1.attention.utils import (BatchSpec, _Backend, + create_common_attn_metadata, + create_standard_kv_cache_spec, + create_vllm_config, + get_attention_backend) +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv +from vllm.v1.attention.backends.utils import CommonAttentionMetadata +from vllm.v1.kv_cache_interface import FullAttentionSpec + +BACKENDS_TO_TEST = [ + _Backend.CUTLASS_MLA, _Backend.FLASHMLA_VLLM_V1, + _Backend.TRITON_MLA_VLLM_V1 +] + +# Remove CUTLASS_MLA from the list if not using sm100 +if not torch.cuda.is_available() or torch.cuda.get_device_properties( + 0).major < 10: + BACKENDS_TO_TEST.remove(_Backend.CUTLASS_MLA) + +torch.manual_seed(42) + + +def _convert_dtype_to_torch(dtype): + """Convert ModelDType to torch.dtype.""" + if isinstance(dtype, str): + if dtype == "auto": + return torch.float16 # Default dtype for testing + elif dtype in STR_DTYPE_TO_TORCH_DTYPE: + return STR_DTYPE_TO_TORCH_DTYPE[dtype] + else: + raise ValueError(f"Unknown dtype: {dtype}") + elif isinstance(dtype, torch.dtype): + return dtype + else: + raise ValueError(f"Unknown dtype: {dtype}") + + +# Define common batch configurations +BATCH_SPECS = { + "small_decode": + BatchSpec(seq_lens=[32, 40], query_lens=[1, 1]), + "small_prefill": + BatchSpec(seq_lens=[32, 40], query_lens=[8, 8]), + "mixed_small": + BatchSpec(seq_lens=[32, 40, 48, 56], query_lens=[1, 1, 5, 5]), + "medium_decode": + BatchSpec(seq_lens=[128, 256, 512, 1024, 128, 256, 512, 1024], + query_lens=[1, 1, 1, 1, 1, 1, 1, 1]), + "medium_prefill": + BatchSpec(seq_lens=[256, 512, 1024, 2048], query_lens=[16, 16, 16, 16]), + "mixed_medium": + BatchSpec(seq_lens=[512, 1024, 2048, 512, 1024, 2048], + query_lens=[1, 1, 1, 7, 7, 7]), + "large_decode": + BatchSpec(seq_lens=[2048] * 32, query_lens=[1] * 32), + "large_prefill": + BatchSpec(seq_lens=[4096] * 8, query_lens=[32] * 8), + "single_decode": + BatchSpec(seq_lens=[1024], query_lens=[1]), + "single_prefill": + BatchSpec(seq_lens=[1024], query_lens=[64]), +} + + +def create_dummy_kv_cache(kv_cache_spec: FullAttentionSpec, + device: torch.device, + num_blocks: int = 100) -> torch.Tensor: + """Create a dummy KV cache tensor for testing.""" + kv_cache = torch.randn( + num_blocks, + kv_cache_spec.block_size, + kv_cache_spec.head_size, # latent dimension + dtype=_convert_dtype_to_torch(kv_cache_spec.dtype), + device=device, + ) + return kv_cache + + +def create_and_prepopulate_kv_cache( + kv_c_contexts: list[torch.Tensor], + k_pe_contexts: list[torch.Tensor], + block_size: int, + num_kv_heads: int, + head_size: int, + dtype: torch.dtype, + device: torch.device, + num_blocks: int, + common_attn_metadata: CommonAttentionMetadata, + randomize_blocks: bool = True) -> 
torch.Tensor: + """Create and prepopulate an MLA KV cache with context data. + + Args: + kv_c_contexts: List of latent KV context tensors for each sequence + k_pe_contexts: List of key positional embedding context tensors + for each sequence + block_size: Size of each block + num_kv_heads: Number of KV heads (should be 1 for MLA) + head_size: Size of each head (latent dimension) + dtype: Data type for the cache + device: Device to create the cache on + num_blocks: Total number of blocks in the cache + common_attn_metadata: Common attention metadata + randomize_blocks: Whether to randomly permute blocks + or use sequential order + + Returns: + MLA KV cache tensor + """ + batch_size = len(kv_c_contexts) + seq_lens = common_attn_metadata.seq_lens_cpu + query_lens = common_attn_metadata.query_start_loc_cpu[ + 1:] - common_attn_metadata.query_start_loc_cpu[:-1] + context_lens = common_attn_metadata.num_computed_tokens_cpu + block_table = common_attn_metadata.block_table_tensor + slot_mapping = common_attn_metadata.slot_mapping + + # Create MLA KV cache: (num_blocks, block_size, head_size) + kv_cache = torch.empty(num_blocks, + block_size, + head_size, + dtype=dtype, + device=device) + kv_cache_flat = kv_cache.view(-1, head_size) + + # Populate the cache with the context tokens + # Start from block_id=1 since block_id=0 is considered the null block + start_block_idx = 1 + for i in range(batch_size): + kv_c_context, k_pe_context = kv_c_contexts[i], k_pe_contexts[i] + kv_context = torch.cat([kv_c_context, k_pe_context.squeeze(1)], dim=-1) + start = start_block_idx * block_size + end = start + kv_context.shape[0] + kv_cache_flat[start:end, ...] = kv_context + + # Stay block aligned and allocate enough blocks for the new tokens + start_block_idx += cdiv(int(seq_lens[i]), block_size) + + blocks_end = start_block_idx + + # Permute the context blocks (excluding block 0 which is null) + if randomize_blocks: + perm = torch.randperm( + blocks_end - 1) + 1 # Random permutation starting from block 1 + else: + perm = torch.arange( + 1, blocks_end) # Sequential order starting from block 1 + + inv_perm = torch.zeros(blocks_end, dtype=torch.long, device=device) + inv_perm[1:] = torch.argsort( + perm) + 1 # Add 1 to account for starting from block 1 + kv_cache[1:blocks_end, ...] = kv_cache[perm, ...] 
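+    # NOTE: after this shuffle, the data originally written at physical slot b
+    # lives at slot inv_perm[b]; the block tables built below are therefore
+    # mapped through inv_perm so they keep pointing at the right blocks.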
+ + # Construct the right block table + # Start from block_id=1 since block_id=0 is considered the null block + start_block_idx = 1 + for i in range(batch_size): + num_blocks_for_seq = cdiv(int(seq_lens[i]), block_size) + start = start_block_idx + end = start + num_blocks_for_seq + block_table[i, :num_blocks_for_seq] = inv_perm[start:end] + start_block_idx += num_blocks_for_seq + + # Create a realistic slot mapping that corresponds to the block table + for i in range(batch_size): + token_offsets = torch.arange(int(query_lens[i])) + int(context_lens[i]) + block_indices = token_offsets // block_size + token_inter_block_offsets = token_offsets % block_size + start = common_attn_metadata.query_start_loc_cpu[i] + end = common_attn_metadata.query_start_loc_cpu[i + 1] + slot_mapping[start:end] = block_table[ + i, + block_indices] * block_size + token_inter_block_offsets.to(device) + + return kv_cache + + +class MockAttentionLayer: + """A mock attention layer for testing.""" + + def __init__(self, device: torch.device): + self._q_scale = torch.tensor(1.0, device=device) + self._k_scale = torch.tensor(1.0, device=device) + self._v_scale = torch.tensor(1.0, device=device) + + +def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec, + layer_names: list[str], vllm_config, + device: torch.device, + common_attn_metadata: CommonAttentionMetadata, + query: torch.Tensor, kv_c: torch.Tensor, + k_pe: torch.Tensor, kv_cache: torch.Tensor, + kv_lora_rank: int, qk_nope_head_dim: int, + qk_rope_head_dim: int, v_head_dim: int, + mock_kv_b_proj) -> torch.Tensor: + """Run attention computation using the specified backend's AttentionImpl.""" + + builder_cls, impl_cls = get_attention_backend(backend) + + # Build metadata + builder = builder_cls(kv_cache_spec, layer_names, vllm_config, device) + attn_metadata = builder.build( + common_prefix_len=0, + common_attn_metadata=common_attn_metadata, + ) + + # Instantiate MLA implementation + num_heads = vllm_config.model_config.get_num_attention_heads( + vllm_config.parallel_config) + num_kv_heads = vllm_config.model_config.get_num_kv_heads( + vllm_config.parallel_config) + head_size = vllm_config.model_config.get_head_size() + scale = 1.0 / (head_size**0.5) + impl = impl_cls( + num_heads=num_heads, + head_size=head_size, + scale=scale, + num_kv_heads=num_kv_heads, + alibi_slopes=None, + sliding_window=None, + kv_cache_dtype="auto", + logits_soft_cap=None, + attn_type="decoder", + kv_sharing_target_layer_name=None, + q_lora_rank=None, + kv_lora_rank=kv_lora_rank, + qk_nope_head_dim=qk_nope_head_dim, + qk_rope_head_dim=qk_rope_head_dim, + qk_head_dim=qk_nope_head_dim + qk_rope_head_dim, + v_head_dim=v_head_dim, + kv_b_proj=mock_kv_b_proj, + ) + + # Process weights to create W_UK_T and W_UV attributes needed by MLA + act_dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype) + impl.process_weights_after_loading(act_dtype) + + # Create mock layer and output buffer + mock_layer = MockAttentionLayer(device) + num_tokens = query.shape[0] + output = torch.empty(num_tokens, + num_heads * v_head_dim, + dtype=query.dtype, + device=query.device) + + # Run forward pass + # NOTE: The query, key, and value are already shaped correctly + # in the calling test function. 
+ output = impl.forward(mock_layer, + query, + kv_c, + k_pe, + kv_cache, + attn_metadata, + output=output) + + return output + + +@pytest.mark.parametrize("batch_spec_name", [ + "small_decode", "small_prefill", "mixed_small", "medium_decode", + "medium_prefill", "mixed_medium", "large_decode", "large_prefill", + "single_decode", "single_prefill" +]) +@pytest.mark.parametrize("model", ["deepseek-ai/DeepSeek-V2-Lite-Chat"]) +def test_backend_correctness(dist_init, batch_spec_name: str, model: str): + """ + Test that all backends produce similar outputs to a reference implementation + using torch.nn.functional.scaled_dot_product_attention. + + This test works by: + 1. Generating a batch of sequences with specified context and query lengths. + 2. Computing a ground-truth attention output using torch.sdpa on + contiguous Q, K, and V tensors. + 3. Simulating vLLM's paged KV cache: It takes the context portion of the + K/V tensors and manually places them into a paged buffer according to + the test's (randomly generated) block table. + 4. Running each vLLM attention backend with the new queries and the + simulated paged KV cache. + 5. Comparing the vLLM backend's output to the ground-truth SDPA output. + """ + batch_spec = BATCH_SPECS[batch_spec_name] + vllm_config = create_vllm_config(model_name=model, + max_model_len=max(batch_spec.seq_lens), + num_gpu_blocks=2048) + device = torch.device("cuda:0") + + kv_cache_spec = create_standard_kv_cache_spec(vllm_config) + + # 1. Setup + batch_size = batch_spec.batch_size + seq_lens = batch_spec.seq_lens + query_lens = batch_spec.query_lens + num_q_heads = vllm_config.model_config.get_num_attention_heads( + vllm_config.parallel_config) + num_kv_heads = vllm_config.model_config.get_num_kv_heads( + vllm_config.parallel_config) + head_size = vllm_config.model_config.get_head_size() + dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype) + block_size = vllm_config.cache_config.block_size + kv_lora_rank = 512 + qk_rope_head_dim = 64 + qk_nope_head_dim = 128 + v_head_dim = 128 + total_head_size = kv_lora_rank + qk_rope_head_dim + assert kv_lora_rank + qk_rope_head_dim == head_size, \ + f"MLA dimensions don't match: {total_head_size} != {head_size}" + scale = 1.0 / (total_head_size**0.5) + + # 2. 
Generate data and compute SDPA reference output for MLA + all_q_vllm, all_kv_c_vllm, all_k_pe_vllm = [], [], [] + all_sdpa_outputs = [] + kv_c_contexts, k_pe_contexts = [], [] + + # Create shared MLA weight matrices for consistency across all sequences + W_UK = torch.randn(kv_lora_rank, + num_q_heads, + qk_nope_head_dim, + dtype=dtype, + device=device) + W_UV = torch.randn(kv_lora_rank, + num_q_heads, + v_head_dim, + dtype=dtype, + device=device) + kv_b_proj_weight = torch.cat([W_UK, W_UV], dim=-1) + + for i in range(batch_size): + s_len = seq_lens[i] + q_len = query_lens[i] + context_len = s_len - q_len + + # Generate MLA tensors + # Q has both nope and rope components: + # [q_len, num_heads, qk_nope_head_dim + qk_rope_head_dim] + q_c = torch.randn(q_len, + num_q_heads, + qk_nope_head_dim + qk_rope_head_dim, + dtype=dtype, + device=device) + + # KV_C (latent K/V): [s_len, kv_lora_rank] + kv_c_full = torch.randn(s_len, + kv_lora_rank, + dtype=dtype, + device=device) + + # K_PE (rope component): [s_len, 1, qk_rope_head_dim] + k_pe_full = torch.randn(s_len, + 1, + qk_rope_head_dim, + dtype=dtype, + device=device) + + # Determine if this is decode (single token) + # or prefill (multiple tokens) + is_decode = q_len == 1 + + # Split q into nope and rope components + q_nope, q_pe = q_c.split([qk_nope_head_dim, qk_rope_head_dim], dim=-1) + + if is_decode: + # Decode path: MQA-style attention in latent space + # Transform q_nope to latent space: q_nope @ W_UK + # q_nope: [1, num_heads, qk_nope_head_dim] + # W_UK: [kv_lora_rank, num_heads, qk_nope_head_dim] + ql_nope = torch.einsum("qnh,lnh->qnl", q_nope, + W_UK) # [1, num_heads, kv_lora_rank] + + # Build MQA attention inputs + # Q: [1, num_heads, kv_lora_rank + qk_rope_head_dim] + q_mqa = torch.cat([ql_nope, q_pe], dim=-1) + # K: [s_len, kv_lora_rank + qk_rope_head_dim] + # (broadcasted to all heads) + k_mqa = torch.cat([kv_c_full, k_pe_full.squeeze(1)], dim=-1) + k_mqa = k_mqa.unsqueeze(1).expand(-1, num_q_heads, -1) + # V: [s_len, kv_lora_rank] (broadcasted to all heads) + v_mqa = kv_c_full.unsqueeze(1).expand(-1, num_q_heads, -1) + + # SDPA expects (N, H, L, D) + q_sdpa_in = q_mqa.unsqueeze(0).transpose(1, 2) + k_sdpa_in = k_mqa.unsqueeze(0).transpose(1, 2) + v_sdpa_in = v_mqa.unsqueeze(0).transpose(1, 2) + + sdpa_out_i = torch.nn.functional.scaled_dot_product_attention( + q_sdpa_in, k_sdpa_in, v_sdpa_in, is_causal=False, scale=scale) + sdpa_out_i = sdpa_out_i.transpose(1, 2).squeeze( + 0) # [1, num_heads, kv_lora_rank] + + # Project back to output space: sdpa_out @ W_UV + sdpa_out_i = torch.einsum("qnl,lnv->qnv", sdpa_out_i, W_UV) + sdpa_out_i = sdpa_out_i.flatten(start_dim=-2) + else: + # Prefill path: MHA-style attention with full sequence + # Apply kv_b_proj to the full kv_c tensor + kv_nope_full = torch.einsum("sl,lnh->snh", kv_c_full, + kv_b_proj_weight) + k_nope_full, v_full = kv_nope_full.split( + [qk_nope_head_dim, v_head_dim], dim=-1) + + # Build attention inputs for full sequence + q_mha = torch.cat([q_nope, q_pe], + dim=-1) # [q_len, num_heads, total_dim] + k_pe_full_expanded = k_pe_full.expand(-1, num_q_heads, -1) + k_full = torch.cat([k_nope_full, k_pe_full_expanded], dim=-1) + + # Create custom attention mask: + # - Query tokens can attend to all context tokens + # - Query tokens can only attend to query tokens up to their pos + attn_mask = torch.ones(q_len, + s_len, + dtype=torch.bool, + device=device) + # Apply causal mask only to the query portion (context_len onwards) + causal_mask = torch.tril(torch.ones(q_len, q_len, 
device=device)) + attn_mask[:, context_len:] = causal_mask + + # SDPA expects (N, H, L, D) + q_sdpa_in = q_mha.unsqueeze(0).transpose(1, 2) + k_sdpa_in = k_full.unsqueeze(0).transpose(1, 2) + v_sdpa_in = v_full.unsqueeze(0).transpose(1, 2) + + # Single attention call with custom mask + sdpa_out_i = torch.nn.functional.scaled_dot_product_attention( + q_sdpa_in, + k_sdpa_in, + v_sdpa_in, + attn_mask=attn_mask, + scale=scale) + sdpa_out_i = sdpa_out_i.transpose(1, 2).squeeze(0) + sdpa_out_i = sdpa_out_i.flatten(start_dim=-2) + + all_sdpa_outputs.append(sdpa_out_i) + + # Inputs for vLLM MLA backends are just the new tokens + all_q_vllm.append(q_c) + all_kv_c_vllm.append(kv_c_full[context_len:]) # New kv_c tokens + all_k_pe_vllm.append(k_pe_full[context_len:]) # New k_pe tokens + + # Contextual K/V data used to populate the paged cache (MLA format) + kv_c_contexts.append(kv_c_full[:context_len]) + k_pe_contexts.append(k_pe_full[:context_len]) + + # Concatenate all sequences (no reordering needed) + query_vllm = torch.cat(all_q_vllm, dim=0) + kv_c_vllm = torch.cat(all_kv_c_vllm, dim=0) + k_pe_vllm = torch.cat(all_k_pe_vllm, dim=0) + sdpa_output = torch.cat(all_sdpa_outputs, dim=0) + + # Create mock kv_b_proj using the same weights as reference implementation + from vllm.model_executor.layers.linear import ColumnParallelLinear + mock_kv_b_proj = ColumnParallelLinear(input_size=kv_lora_rank, + output_size=num_q_heads * + (qk_nope_head_dim + v_head_dim), + bias=False).to(device=device, + dtype=dtype) + + # Set the mock weights to match our reference implementation + # Reshape W_UK and W_UV to match the expected kv_b_proj format + # [kv_lora_rank, num_heads, qk_nope_head_dim + v_head_dim] + kv_b_proj_weight = kv_b_proj_weight.view( + kv_lora_rank, num_q_heads * (qk_nope_head_dim + v_head_dim)) + mock_kv_b_proj.weight = torch.nn.Parameter(kv_b_proj_weight.T) + + # Create metadata using original batch spec + common_attn_metadata = create_common_attn_metadata( + batch_spec, vllm_config.cache_config.block_size, device) + + # 3. Simulate Paged KV Cache and a realistic slot_mapping + kv_cache = create_and_prepopulate_kv_cache( + kv_c_contexts=kv_c_contexts, + k_pe_contexts=k_pe_contexts, + block_size=block_size, + num_kv_heads=num_kv_heads, + head_size=head_size, + dtype=dtype, + device=device, + num_blocks=vllm_config.cache_config.num_gpu_blocks, + common_attn_metadata=common_attn_metadata, + randomize_blocks=True) + + # 4. 
Run vLLM backends and compare + for backend_name in BACKENDS_TO_TEST: + backend_output = run_attention_backend( + backend_name, kv_cache_spec, ["placeholder"], vllm_config, device, + common_attn_metadata, query_vllm, kv_c_vllm, k_pe_vllm, kv_cache, + kv_lora_rank, qk_nope_head_dim, qk_rope_head_dim, v_head_dim, + mock_kv_b_proj) + + # Check shape and dtype consistency + assert backend_output.shape == sdpa_output.shape, ( + f"[{backend_name}] shape {backend_output.shape} != " + f"SDPA shape {sdpa_output.shape}") + assert backend_output.dtype == sdpa_output.dtype, ( + f"[{backend_name}] dtype {backend_output.dtype} != " + f"SDPA dtype {sdpa_output.dtype}") + + assert torch.isfinite(backend_output).all(), ( + f"[{backend_name}] produced non-finite values") + + # Check numerical similarity + rtol = 1e-2 + atol = 5e-1 + + max_diff = torch.max(torch.abs(backend_output - sdpa_output)).item() + max_rel_diff = torch.max( + torch.abs(backend_output - sdpa_output) / + torch.abs(sdpa_output)).item() + all_close = torch.allclose(backend_output, + sdpa_output, + rtol=rtol, + atol=atol) + + assert all_close, ( + f"[{backend_name}] output differs from SDPA baseline. " + f"Max diff: {max_diff:.6f}, max rel diff: {max_rel_diff:.6f})") diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index e547e71e0cdb..6a08cdc56f73 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -135,6 +135,12 @@ def get_attention_backend(backend_name: _Backend): "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend", _Backend.XFORMERS_VLLM_V1: "vllm.v1.attention.backends.xformers.XFormersAttentionBackend", + _Backend.CUTLASS_MLA: + "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend", + _Backend.FLASHMLA_VLLM_V1: + "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend", + _Backend.TRITON_MLA_VLLM_V1: + "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend", } if backend_name not in backend_map: @@ -167,9 +173,11 @@ def create_vllm_config(model_name: str = "meta-llama/Meta-Llama-3-8B", tensor_parallel_size: int = 1, max_model_len: int = 1024, dtype: Union[ModelDType, torch.dtype] = "auto", + num_gpu_blocks: int = 1000, block_size: int = 16, max_num_seqs: int = 256, max_num_batched_tokens: int = 8192, + enable_chunked_prefill: bool = True, add_mock_model_methods: bool = True) -> VllmConfig: """Create a VllmConfig for testing with reasonable defaults.""" @@ -189,7 +197,7 @@ def create_vllm_config(model_name: str = "meta-llama/Meta-Llama-3-8B", ) # Set cache blocks for testing # (these may be set during initialization normally) - cache_config.num_gpu_blocks = 1000 + cache_config.num_gpu_blocks = num_gpu_blocks cache_config.num_cpu_blocks = 0 parallel_config = ParallelConfig( @@ -198,6 +206,7 @@ def create_vllm_config(model_name: str = "meta-llama/Meta-Llama-3-8B", scheduler_config = SchedulerConfig( max_num_seqs=max_num_seqs, max_num_batched_tokens=max_num_batched_tokens, + enable_chunked_prefill=enable_chunked_prefill, ) device_config = DeviceConfig() diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 646e4fec836b..03028ebfe76a 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -24,7 +24,7 @@ (https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551). Deepseek's MLA attention works the following way: -* Use a single latent vector to represent the per-token entry of the KV cache. 
+* Use a single latent vector to represent the per-token entry of the KV cache. * For decode (i.e. the memory friendly approach) the attention "simulates" a multi-head attention, while the compute is similar to multi-query attention. @@ -82,7 +82,7 @@ torch.cat([q_nope, q_pe], dim=-1), torch.cat([k_nope, k_pe.unsqueeze(1).expand(-1, N, -1)], dim=-1), v -) +) return spda_o @ W_O NOTE: in the actual code, @@ -120,20 +120,20 @@ ## Chunked Prefill -For chunked prefill we want to use the compute friendly algorithm. We are -assuming sufficiently large Sq / Skv ratio, in the future may want to switch to +For chunked prefill we want to use the compute friendly algorithm. We are +assuming sufficiently large Sq / Skv ratio, in the future may want to switch to the data-movement friendly approach if the chunk (i.e. `Sq`) is small. However, the compute-friendly approach can potentially run out of memory if Skv is large due to: `k_nope = (kv_c @ W_UK).view(Skv, N, P)` -To mitigate this, we chunk the computation of attention with respect to the -current context (i.e. `cache_kv_c` and `cache_k_pe`) so that we can used a +To mitigate this, we chunk the computation of attention with respect to the +current context (i.e. `cache_kv_c` and `cache_k_pe`) so that we can used a fixed workspace size. The chunked prefill approach is as follows: -MCC Max chunk of context to process per iter, computed dynamically, +MCC Max chunk of context to process per iter, computed dynamically, used to bound the memory usage q_c = h_t @ W_DQ @@ -155,7 +155,7 @@ new_v, casual=True, return_softmax_lse=True -) +) // Compute attention with the already existing context for chunk_idx in range(cdiv(C, MCC)): From b1602a8b63fc05adb4e2786b0d90694f8375cc83 Mon Sep 17 00:00:00 2001 From: shixianc <49539556+shixianc@users.noreply.github.com> Date: Wed, 20 Aug 2025 15:04:21 -0700 Subject: [PATCH 201/231] [Fix] remove is_marlin param in benchmark_moe (#23286) Signed-off-by: Duncan Moss From 8c26b4701964fc6b778b1320f57f0e9b3b60fe91 Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Wed, 20 Aug 2025 16:46:06 -0700 Subject: [PATCH 202/231] [EP] Add logging for experts map (#22685) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> Co-authored-by: Simon Mo Signed-off-by: Duncan Moss --- vllm/model_executor/layers/fused_moe/layer.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index aa8ceda1bb25..b16c21b7013a 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -695,6 +695,26 @@ def determine_expert_map( return (local_num_experts, expert_map) +def get_compressed_expert_map(expert_map: torch.Tensor) -> str: + """ + Compresses the expert map by removing any -1 entries. + + Args: + expert_map (torch.Tensor): A tensor of shape (global_num_experts,) + mapping from global to local index. Contains -1 for experts not + assigned to the current rank. + + Returns: + str: A string mapping from local to global index. + Using str to support hashing for logging once only. + """ + global_indices = torch.where(expert_map != -1)[0] + local_indices = expert_map[global_indices] + return ", ".join( + f"{local_index.item()}->{global_index.item()}" + for local_index, global_index in zip(local_indices, global_indices)) + + @CustomOp.register("fused_moe") class FusedMoE(CustomOp): """FusedMoE layer for MoE models. 
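For reference, a minimal standalone sketch of the string the new
`get_compressed_expert_map` helper emits; the tensor below is a made-up
six-expert map for a rank that owns global experts 1, 3 and 4, not output
taken from a real run:

    import torch

    from vllm.model_executor.layers.fused_moe.layer import (
        get_compressed_expert_map)

    # -1 marks experts not hosted on this rank; other entries are local indices.
    expert_map = torch.tensor([-1, 0, -1, 1, 2, -1])
    print(get_compressed_expert_map(expert_map))
    # prints: 0->1, 1->3, 2->4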
@@ -795,6 +815,12 @@ def __init__( ep_size=self.ep_size, ep_rank=self.ep_rank, global_num_experts=self.global_num_experts) + logger.info_once( + "[EP Rank %s/%s] Expert parallelism is enabled. Local/global" + " number of experts: %s/%s. Experts local to global index map:" + " %s.", self.ep_rank, self.ep_size, self.local_num_experts, + self.global_num_experts, + get_compressed_expert_map(self.expert_map)) else: self.local_num_experts, self.expert_map = (self.global_num_experts, None) From 97b7516aa7b05a3cc85ececfa6ae3d391c334a98 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 20 Aug 2025 20:14:59 -0400 Subject: [PATCH 203/231] Remove duplicate entry in vllm.attention.__all__ (#23296) Signed-off-by: Russell Bryant Signed-off-by: Duncan Moss --- vllm/attention/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/attention/__init__.py b/vllm/attention/__init__.py index 344040586a53..dcb2aa68fbee 100644 --- a/vllm/attention/__init__.py +++ b/vllm/attention/__init__.py @@ -14,7 +14,6 @@ "AttentionMetadata", "AttentionType", "AttentionMetadataBuilder", - "Attention", "AttentionState", "get_attn_backend", ] From c300639757a1ad6e215c59ef827950716bc548fb Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 20 Aug 2025 20:18:12 -0400 Subject: [PATCH 204/231] [CI Bugfix] Fix CI by fully removing --enable-prompt-adapter (#23284) Signed-off-by: mgoin Signed-off-by: Duncan Moss --- vllm/engine/arg_utils.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index dcf78758946f..f3afc015f669 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -888,12 +888,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument('--disable-log-stats', action='store_true', help='Disable logging statistics.') - parser.add_argument('--enable-prompt-adapter', - action='store_true', - deprecated=True, - help='[DEPRECATED] Prompt adapter has been ' - 'removed. Setting this flag to True or False' - ' has no effect on vLLM behavior.') return parser From 24c8bb66ac110ed9ac4af35d896a4b2f20abd78d Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 20 Aug 2025 18:25:56 -0700 Subject: [PATCH 205/231] [Optimization] Make new_block_ids None if empty (#23262) Signed-off-by: Woosuk Kwon Signed-off-by: Duncan Moss --- vllm/v1/core/kv_cache_manager.py | 30 ++++++++++++++++++++++++++---- vllm/v1/core/sched/output.py | 2 +- vllm/v1/core/sched/scheduler.py | 24 ++++++++++++------------ vllm/v1/worker/gpu_model_runner.py | 14 +++++++++----- vllm/v1/worker/tpu_model_runner.py | 14 +++++++++----- 5 files changed, 57 insertions(+), 27 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index bfaa7ab08f5c..fd0bdb2c80fc 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import Optional +from typing import Literal, Optional, overload from vllm.distributed.kv_events import KVCacheEvent from vllm.logger import init_logger @@ -37,7 +37,24 @@ def __add__(self, other: "KVCacheBlocks") -> "KVCacheBlocks": tuple(blk1 + blk2 for blk1, blk2 in zip(self.blocks, other.blocks))) - def get_block_ids(self) -> tuple[list[int], ...]: + @overload + def get_block_ids( + self, + allow_none: Literal[False] = False, + ) -> tuple[list[int], ...]: + ... 
+ + @overload + def get_block_ids( + self, + allow_none: Literal[True] = True, + ) -> Optional[tuple[list[int], ...]]: + ... + + def get_block_ids( + self, + allow_none: bool = False, + ): """ Converts the KVCacheBlocks instance to block_ids. @@ -46,6 +63,8 @@ def get_block_ids(self) -> tuple[list[int], ...]: * the outer tuple corresponds to KV cache groups * each inner list contains the block_ids of the blocks in that group """ + if allow_none and all(len(group) == 0 for group in self.blocks): + return None return tuple([blk.block_id for blk in group] for group in self.blocks) def get_unhashed_block_ids(self) -> list[int]: @@ -348,10 +367,13 @@ def take_events(self) -> list[KVCacheEvent]: """ return self.block_pool.take_events() + def get_blocks(self, request_id: str) -> KVCacheBlocks: + """Get the blocks of a request.""" + return KVCacheBlocks(self.coordinator.get_blocks(request_id)) + def get_block_ids(self, request_id: str) -> tuple[list[int], ...]: """Get the block ids of a request.""" - return KVCacheBlocks( - self.coordinator.get_blocks(request_id)).get_block_ids() + return self.get_blocks(request_id).get_block_ids() def cache_blocks(self, request: Request, num_computed_tokens: int) -> None: """Cache the blocks for the request, if enabled.""" diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index fac07f97195b..9ba7ec9d9693 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -91,7 +91,7 @@ class CachedRequestData: # NOTE(woosuk): new_token_ids is only used for pipeline parallelism. # When PP is not used, new_token_ids will be empty. new_token_ids: list[list[int]] - new_block_ids: list[tuple[list[int], ...]] + new_block_ids: list[Optional[tuple[list[int], ...]]] num_computed_tokens: list[int] @property diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 4b167da5c8f8..0b528587b933 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -19,7 +19,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager, compute_encoder_budget) -from vllm.v1.core.kv_cache_manager import KVCacheManager +from vllm.v1.core.kv_cache_manager import KVCacheBlocks, KVCacheManager from vllm.v1.core.sched.interface import SchedulerInterface from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, SchedulerOutput) @@ -185,7 +185,7 @@ def schedule(self) -> SchedulerOutput: # uses structured decoding. structured_output_request_ids: dict[str, int] = {} - req_to_new_block_ids: dict[str, tuple[list[int], ...]] = {} + req_to_new_blocks: dict[str, KVCacheBlocks] = {} num_scheduled_tokens: dict[str, int] = {} token_budget = self.max_num_scheduled_tokens # Encoder-related. @@ -288,8 +288,7 @@ def schedule(self) -> SchedulerOutput: # Therefore, we might introduce some additional # cycle to fill in the bitmask, which could be a big no-op. 
structured_output_request_ids[request.request_id] = req_index - req_to_new_block_ids[request.request_id] = ( - new_blocks.get_block_ids()) + req_to_new_blocks[request.request_id] = new_blocks num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens req_index += 1 @@ -496,8 +495,8 @@ def schedule(self) -> SchedulerOutput: if self.lora_config and request.lora_request: scheduled_loras.add(request.lora_request.lora_int_id) - req_to_new_block_ids[request.request_id] = ( - self.kv_cache_manager.get_block_ids(request.request_id)) + req_to_new_blocks[request.request_id] = ( + self.kv_cache_manager.get_blocks(request.request_id)) num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens request.status = RequestStatus.RUNNING @@ -546,8 +545,8 @@ def schedule(self) -> SchedulerOutput: ) # Construct the scheduler output. new_reqs_data = [ - NewRequestData.from_request(req, - req_to_new_block_ids[req.request_id]) + NewRequestData.from_request( + req, req_to_new_blocks[req.request_id].get_block_ids()) for req in scheduled_new_reqs ] cached_reqs_data = self._make_cached_request_data( @@ -555,7 +554,7 @@ def schedule(self) -> SchedulerOutput: scheduled_resumed_reqs, num_scheduled_tokens, scheduled_spec_decode_tokens, - req_to_new_block_ids, + req_to_new_blocks, ) scheduler_output = SchedulerOutput( scheduled_new_reqs=new_reqs_data, @@ -628,11 +627,11 @@ def _make_cached_request_data( resumed_reqs: list[Request], num_scheduled_tokens: dict[str, int], spec_decode_tokens: dict[str, list[int]], - req_to_new_block_ids: dict[str, tuple[list[int], ...]], + req_to_new_blocks: dict[str, KVCacheBlocks], ) -> CachedRequestData: req_ids: list[str] = [] new_token_ids: list[list[int]] = [] - new_block_ids: list[tuple[list[int], ...]] = [] + new_block_ids: list[Optional[tuple[list[int], ...]]] = [] num_computed_tokens: list[int] = [] use_connector = self.connector is not None @@ -655,7 +654,8 @@ def _make_cached_request_data( # out of bounds errors. TODO: Remove this once the KVConnector # is updated to handle token IDs properly. new_token_ids.append([]) - new_block_ids.append(req_to_new_block_ids[req_id]) + new_block_ids.append( + req_to_new_blocks[req_id].get_block_ids(allow_none=True)) num_computed_tokens.append(req.num_computed_tokens) # Because resumed_reqs is usually empty, it is more efficient to do # in-place appending so that we don't need to allocate a new list. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 33747d6917a5..cc86f9826491 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -574,11 +574,13 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Update the block IDs. if not resumed_from_preemption: - # Append the new blocks to the existing block IDs. - for block_ids, new_ids in zip(req_state.block_ids, - new_block_ids): - block_ids.extend(new_ids) + if new_block_ids is not None: + # Append the new blocks to the existing block IDs. + for block_ids, new_ids in zip(req_state.block_ids, + new_block_ids): + block_ids.extend(new_ids) else: + assert new_block_ids is not None # The request is resumed from preemption. # Replace the existing block IDs with the new ones. req_state.block_ids = new_block_ids @@ -594,7 +596,9 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Update the persistent batch. 
self.input_batch.num_computed_tokens_cpu[req_index] = ( num_computed_tokens) - self.input_batch.block_table.append_row(new_block_ids, req_index) + if new_block_ids is not None: + self.input_batch.block_table.append_row( + new_block_ids, req_index) # For the last rank, we don't need to update the token_ids_cpu # because the sampled tokens are already cached. diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 9196c62377b9..0f569500cdf6 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -418,11 +418,13 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool: # Update the cached states. req_state.num_computed_tokens = num_computed_tokens if not resumed_from_preemption: - # Append the new blocks to the existing block IDs. - for block_ids, new_ids in zip(req_state.block_ids, - new_block_ids): - block_ids.extend(new_ids) + if new_block_ids is not None: + # Append the new blocks to the existing block IDs. + for block_ids, new_ids in zip(req_state.block_ids, + new_block_ids): + block_ids.extend(new_ids) else: + assert new_block_ids is not None # The request is resumed from preemption. # Replace the existing block IDs with the new ones. req_state.block_ids = new_block_ids @@ -438,7 +440,9 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool: # Update the persistent batch. self.input_batch.num_computed_tokens_cpu[req_index] = ( num_computed_tokens) - self.input_batch.block_table.append_row(new_block_ids, req_index) + if new_block_ids is not None: + self.input_batch.block_table.append_row( + new_block_ids, req_index) # Add the new or resumed requests to the persistent batch. # The smaller empty indices are filled first. From a0c60ea6f58b62bbaf101fc1fcf1e0b7e4303c9d Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Thu, 21 Aug 2025 09:34:24 +0800 Subject: [PATCH 206/231] [CPU] Refactor CPU W8A8 scaled_mm (#23071) Signed-off-by: jiang1.li Signed-off-by: Duncan Moss --- .../scripts/hardware_ci/run-cpu-test.sh | 7 +- cmake/cpu_extension.cmake | 59 +- csrc/cpu/cpu_types_x86.hpp | 8 +- csrc/cpu/dnnl_helper.cpp | 346 +++++++ csrc/cpu/dnnl_helper.h | 169 ++++ csrc/cpu/dnnl_helper.hpp | 206 ---- csrc/cpu/dnnl_kernels.cpp | 494 +++++++++ csrc/cpu/quant.cpp | 951 ------------------ csrc/cpu/torch_bindings.cpp | 96 +- tests/kernels/test_onednn.py | 144 +++ vllm/_custom_ops.py | 83 ++ vllm/model_executor/layers/fused_moe/layer.py | 11 +- vllm/model_executor/layers/linear.py | 8 +- .../kernels/scaled_mm/__init__.py | 4 +- .../quantization/kernels/scaled_mm/cpu.py | 206 ++++ .../quantization/kernels/scaled_mm/cutlass.py | 4 +- vllm/model_executor/layers/utils.py | 6 + 17 files changed, 1527 insertions(+), 1275 deletions(-) create mode 100644 csrc/cpu/dnnl_helper.cpp create mode 100644 csrc/cpu/dnnl_helper.h delete mode 100644 csrc/cpu/dnnl_helper.hpp create mode 100644 csrc/cpu/dnnl_kernels.cpp delete mode 100644 csrc/cpu/quant.cpp create mode 100644 tests/kernels/test_onednn.py create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 57a7bc4e5f5d..9dec9f8e9eb3 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -46,6 +46,11 @@ function cpu_tests() { set -e python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" + # Run kernel tests + docker exec cpu-test-"$NUMA_NODE" bash -c 
" + set -e + pytest -v -s tests/kernels/test_onednn.py" + # Run basic model test docker exec cpu-test-"$NUMA_NODE" bash -c " set -e @@ -99,4 +104,4 @@ function cpu_tests() { # All of CPU tests are expected to be finished less than 40 mins. export -f cpu_tests -timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" +timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index e0da46e2acca..cc38cd41a5b2 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -182,17 +182,17 @@ endif() # # Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms) # Flag to enable ACL kernels for AARCH64 platforms -if ( VLLM_BUILD_ACL STREQUAL "ON") +if (VLLM_BUILD_ACL STREQUAL "ON") set(USE_ACL ON) else() set(USE_ACL OFF) endif() -if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND) +if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) FetchContent_Declare( oneDNN GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git - GIT_TAG v3.8.1 + GIT_TAG v3.9 GIT_PROGRESS TRUE GIT_SHALLOW TRUE ) @@ -204,7 +204,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND) endif() set(ONEDNN_AARCH64_USE_ACL "ON") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/") - endif() + endif() set(ONEDNN_LIBRARY_TYPE "STATIC") set(ONEDNN_BUILD_DOC "OFF") @@ -217,38 +217,23 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND) set(ONEDNN_ENABLE_ITT_TASKS "OFF") set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF") set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF") + set(ONEDNN_VERBOSE "OFF") set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) FetchContent_MakeAvailable(oneDNN) - - list(APPEND LIBS dnnl) -elseif(POWER10_FOUND) - FetchContent_Declare( - oneDNN - GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git - GIT_TAG v3.7.2 - GIT_PROGRESS TRUE - GIT_SHALLOW TRUE + add_library(dnnl_ext OBJECT "csrc/cpu/dnnl_helper.cpp") + target_include_directories( + dnnl_ext + PUBLIC ${oneDNN_SOURCE_DIR}/include + PUBLIC ${oneDNN_BINARY_DIR}/include + PRIVATE ${oneDNN_SOURCE_DIR}/src ) - - set(ONEDNN_LIBRARY_TYPE "STATIC") - set(ONEDNN_BUILD_DOC "OFF") - set(ONEDNN_BUILD_EXAMPLES "OFF") - set(ONEDNN_BUILD_TESTS "OFF") - set(ONEDNN_ENABLE_WORKLOAD "INFERENCE") - set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER") - set(ONEDNN_BUILD_GRAPH "OFF") - set(ONEDNN_ENABLE_JIT_PROFILING "OFF") - set(ONEDNN_ENABLE_ITT_TASKS "OFF") - set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF") - set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF") - set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) - - set(DNNL_CPU_RUNTIME "OMP") - - FetchContent_MakeAvailable(oneDNN) - - list(APPEND LIBS dnnl) + target_link_libraries(dnnl_ext dnnl) + target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC) + list(APPEND LIBS dnnl_ext) + set(USE_ONEDNN ON) +else() + set(USE_ONEDNN OFF) endif() message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") @@ -275,7 +260,6 @@ set(VLLM_EXT_SRC if (AVX512_FOUND AND NOT AVX512_DISABLED) set(VLLM_EXT_SRC - "csrc/cpu/quant.cpp" "csrc/cpu/shm.cpp" ${VLLM_EXT_SRC}) if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI) @@ -289,14 +273,11 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED) ${VLLM_EXT_SRC}) add_compile_definitions(-DCPU_CAPABILITY_AVX512) endif() -elseif(POWER10_FOUND) - set(VLLM_EXT_SRC - "csrc/cpu/quant.cpp" - ${VLLM_EXT_SRC}) endif() -if (ASIMD_FOUND) + +if(USE_ONEDNN) set(VLLM_EXT_SRC - "csrc/cpu/quant.cpp" + "csrc/cpu/dnnl_kernels.cpp" ${VLLM_EXT_SRC}) endif() diff --git a/csrc/cpu/cpu_types_x86.hpp 
b/csrc/cpu/cpu_types_x86.hpp index 3952c43cbc72..982f7c07a13b 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -89,7 +89,7 @@ struct FP16Vec16 : public Vec { explicit FP16Vec16(const FP32Vec16&); - void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } + void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); } void save(void* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -126,7 +126,7 @@ struct BF16Vec16 : public Vec { explicit BF16Vec16(const FP32Vec16&); - void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } + void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); } void save(void* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -180,8 +180,8 @@ struct BF16Vec32 : public Vec { (__m128i)vec8_data.reg, 1)) {} void save(void* ptr) const { - *reinterpret_cast<__m256i*>(ptr) = reg_low; - *reinterpret_cast<__m256i*>((__m256i*)ptr + 1) = reg_high; + _mm256_storeu_si256((__m256i*)ptr, reg_low); + _mm256_storeu_si256((__m256i*)ptr + 1, reg_high); } }; #endif diff --git a/csrc/cpu/dnnl_helper.cpp b/csrc/cpu/dnnl_helper.cpp new file mode 100644 index 000000000000..f3f00edb3606 --- /dev/null +++ b/csrc/cpu/dnnl_helper.cpp @@ -0,0 +1,346 @@ +#include +#include + +#include "common/memory_desc.hpp" +#include "common/memory.hpp" + +#include "dnnl_helper.h" + +static dnnl::engine& default_engine() { + static dnnl::engine engine(dnnl::engine::kind::cpu, 0); + return engine; +} + +static dnnl::stream& default_stream() { + static dnnl::stream stream(default_engine()); + return stream; +} + +void release_dnnl_matmul_handler(int64_t handler) { + DNNLMatMulPrimitiveHandler* ptr = + reinterpret_cast(handler); + delete ptr; +} + +template +class DNNLPrimitiveCache { + public: + using cache_value_t = std::pair; + using result_value_t = VT; + using container_t = std::list; + using value_iterator_t = typename container_t::iterator; + using map_t = std::unordered_map; + using creator_t = VT (*)(); + + public: + DNNLPrimitiveCache(size_t capacity) + : capacity_(capacity), + values_(), + key_to_value_(std::min(256lu, capacity)) { + assert(capacity > 0); + } + + template + result_value_t get_or_create(const KT& key, F&& creator) { + std::optional value = get_value(key); + if (value.has_value()) { + return value.value()->second; + } else { + return add_value({key, creator()})->second; + } + } + + size_t size() const { return values_.size(); } + + private: + void dump_data() { + std::stringstream ss; + ss << "table_id: " << std::hex << reinterpret_cast(this) << std::dec + << "\n"; + ss << "container: ["; + for (auto&& iter : values_) { + ss << "(" << iter.first << ", " << std::hex + << reinterpret_cast(iter.second.get()) << "), " << std::dec; + } + ss << "]\n"; + + ss << "map: ["; + for (auto&& iter : key_to_value_) { + ss << "(" << iter.first << ", " << iter.second->first << ", " << std::hex + << reinterpret_cast(iter.second->second.get()) << std::dec + << "), "; + } + ss << "]\n"; + std::printf("%s\n", ss.str().c_str()); + } + + value_iterator_t add_value(cache_value_t&& new_value) { + if (size() == capacity_) { + cache_value_t& last_item = values_.back(); + key_to_value_.erase(last_item.first); + values_.pop_back(); + } + + auto& added_value_ = values_.emplace_front(std::move(new_value)); + key_to_value_.emplace(added_value_.first, values_.begin()); + return values_.begin(); + } + + std::optional get_value(const KT& key) { + if (key_to_value_.size() > 0 && key == 
values_.begin()->first) { + return values_.begin(); + } + + auto value_map_iterator = key_to_value_.find(key); + if (value_map_iterator != key_to_value_.end()) { + values_.splice(values_.begin(), values_, value_map_iterator->second); + return value_map_iterator->second; + } else { + return {}; + } + } + + private: + const size_t capacity_; + container_t values_; + map_t key_to_value_; +}; + +DNNLMatMulPrimitiveHandler::DNNLMatMulPrimitiveHandler( + const Args& args, dnnl::memory::data_type b_type) + : b_n_size_(args.b_n_size), + b_n_stride_(args.b_n_stride), + b_k_size_(args.b_k_size), + b_k_stride_(args.b_k_stride), + b_type_(b_type), + c_type_(args.c_type), + runtime_memory_ptrs_(8), + primitive_cache_size_(args.primitive_cache_size) { + assert(primitive_cache_size_ > 0); +} + +void DNNLMatMulPrimitiveHandler::prepack_weight( + void* original_b_ptr, dnnl::memory::desc b_target_mem_desc) { + dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_, + {b_k_stride_, b_n_stride_}); + dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr); + dnnl::memory packed_weight(b_target_mem_desc, default_engine()); + { + dnnl::reorder(original_weight, packed_weight) + .execute(default_stream(), original_weight, packed_weight); + default_stream().wait(); + } + memory_cache_[DNNL_ARG_WEIGHTS] = packed_weight; + b_target_mem_desc_ = b_target_mem_desc; +} + +void DNNLMatMulPrimitiveHandler::set_runtime_memory_ptr( + size_t index, dnnl_memory* memory_ptr) { + dnnl::impl::memory_storage_t* mem_storage_ptr = memory_ptr->memory_storage(); + dnnl_memory_desc* mem_desc = const_cast(memory_ptr->md()); + runtime_memory_ptrs_[index] = {mem_storage_ptr, mem_desc}; +} + +std::pair +DNNLMatMulPrimitiveHandler::get_runtime_memory_ptr(size_t index) { + return runtime_memory_ptrs_[index]; +} + +namespace std { +template <> +struct hash { + size_t operator()( + const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const { + return hash()(val.b_n_size) ^ hash()(val.b_k_size) ^ + hash()(static_cast(val.a_qs)) ^ + hash()(static_cast(val.b_qs)) ^ hash()(val.use_azp) ^ + hash()(static_cast(val.c_type)); + } +}; + +template <> +struct hash { + size_t operator()( + const W8A8MatMulPrimitiveHandler::MSizeCacheKey& val) const { + return hash()(val.a_m_size) ^ hash()(val.use_bias) ^ + hash()(static_cast(val.bias_type)); + } +}; +} // namespace std + +bool operator==(const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& l, + const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& r) { + return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size && + l.a_qs == r.a_qs && l.b_qs == r.b_qs && l.use_azp == r.use_azp && + l.c_type == r.c_type; +} + +bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l, + const W8A8MatMulPrimitiveHandler::MSizeCacheKey& r) { + return l.use_bias == r.use_bias && l.a_m_size == r.a_m_size && + l.bias_type == r.bias_type; +} + +static std::shared_ptr +get_w8a8_class_primitive_cache( + const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& key, + int64_t cache_size) { + static W8A8MatMulPrimitiveHandler::ClassMatmulCache cache(128); + assert(cache_size > 0); + return cache.get_or_create(key, [&]() { + return std::make_shared(cache_size); + }); +} + +W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args) + : DNNLMatMulPrimitiveHandler( + static_cast(args), + dnnl::memory::data_type::s8), + use_azp_(args.use_a_zero_point), + a_qs_(args.a_quantization_strategy), + b_qs_(args.b_quantization_strategy), + m_size_cache_(nullptr) { + assert(a_qs_ != 
QuantizationStrategy::PER_OUTPUT_CHANNEL); + assert(b_qs_ != QuantizationStrategy::PER_TOKEN); + if (a_qs_ == QuantizationStrategy::PER_TOKEN) { + assert(!use_azp_); + }; + prepack_weight(args.b_ptr, + create_primitive_desc( + MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL, + .use_bias = false, + .bias_type = dnnl::memory::data_type::undef}, + true) + .weights_desc()); + init_runtime_memory_cache(args); +} + +void W8A8MatMulPrimitiveHandler::execute(ExecArgs& args) { + auto&& [a_storage, a_mem_desc] = get_runtime_memory_ptr(0); + auto&& [c_storage, c_mem_desc] = get_runtime_memory_ptr(1); + a_storage->set_data_handle((void*)args.a_ptr); + a_mem_desc->dims[0] = args.a_m_size; + c_storage->set_data_handle((void*)args.c_ptr); + c_mem_desc->dims[0] = args.a_m_size; + + if (a_qs_ == QuantizationStrategy::PER_TENSOR) { + auto&& [a_scale_storage, a_scale_mem_desc] = get_runtime_memory_ptr(2); + a_scale_storage->set_data_handle((void*)args.a_scales_ptr); + } + if (use_azp_) { + auto&& [a_zero_point_storage, a_zero_point_mem_desc] = + get_runtime_memory_ptr(3); + a_zero_point_storage->set_data_handle((void*)args.a_zero_points_ptr); + } + + if (args.use_bias) { + auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(4); + bias_storage->set_data_handle((void*)args.bias_ptr); + } + + dnnl::matmul matmul = get_matmul_cache(args); + matmul.execute(default_stream(), memory_cache_); + default_stream().wait(); +} + +dnnl::matmul W8A8MatMulPrimitiveHandler::get_matmul_cache( + const MSizeCacheKey& key) { + if (m_size_cache_.get() == nullptr) { + ClassMatmulCacheKey key = {.b_n_size = b_n_size_, + .b_k_size = b_k_size_, + .a_qs = a_qs_, + .b_qs = b_qs_, + .use_azp = use_azp_, + .c_type = c_type_}; + m_size_cache_ = get_w8a8_class_primitive_cache(key, primitive_cache_size_); + } + + return m_size_cache_->get_or_create(key, [&]() { + dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false); + return dnnl::matmul(desc); + }); +} + +void W8A8MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) { + memory_cache_[DNNL_ARG_SRC] = dnnl::memory({{1, b_k_size_}, + dnnl::memory::data_type::s8, + dnnl::memory::format_tag::ab}, + default_engine(), nullptr); + set_runtime_memory_ptr(0, memory_cache_[DNNL_ARG_SRC].get()); + memory_cache_[DNNL_ARG_DST] = + dnnl::memory({{1, b_n_size_}, c_type_, dnnl::memory::format_tag::ab}, + default_engine(), nullptr); + set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get()); + + // For PER_TOKEN, scales will be applied in outside epilogue + if (a_qs_ == QuantizationStrategy::PER_TENSOR) { + memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC] = dnnl::memory( + {{1}, dnnl::memory::data_type::f32, {1}}, default_engine(), nullptr); + set_runtime_memory_ptr( + 2, memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC].get()); + if (use_azp_) { + memory_cache_[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC] = dnnl::memory( + {{1}, dnnl::memory::data_type::s32, {1}}, default_engine(), nullptr); + set_runtime_memory_ptr( + 3, memory_cache_[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC].get()); + } + } + + if (b_qs_ == QuantizationStrategy::PER_TENSOR) { + memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = + dnnl::memory({{1}, dnnl::memory::data_type::f32, {1}}, default_engine(), + (void*)args.b_scales_ptr); + } else if (b_qs_ == QuantizationStrategy::PER_OUTPUT_CHANNEL) { + memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = + dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, + default_engine(), (void*)args.b_scales_ptr); + } + + 
memory_cache_[DNNL_ARG_BIAS] = + dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, + default_engine(), nullptr); + set_runtime_memory_ptr(4, memory_cache_[DNNL_ARG_BIAS].get()); +} + +dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc( + const MSizeCacheKey& key, bool first_time) { + dnnl::memory::desc a_md({key.a_m_size, b_k_size_}, + dnnl::memory::data_type::s8, + dnnl::memory::format_tag::ab); + dnnl::memory::desc b_md; + if (first_time) { + b_md = + dnnl::memory::desc({b_k_size_, b_n_size_}, dnnl::memory::data_type::s8, + dnnl::memory::format_tag::any); + } else { + b_md = b_target_mem_desc_; + } + dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_, + dnnl::memory::format_tag::ab); + + dnnl::primitive_attr attr; + // For PER_TOKEN, scales will be applied in outside epilogue + if (a_qs_ == QuantizationStrategy::PER_TENSOR) { + attr.set_scales_mask(DNNL_ARG_SRC, 0); + if (use_azp_) { + attr.set_zero_points_mask(DNNL_ARG_SRC, 0); + } + } + + if (b_qs_ == QuantizationStrategy::PER_TENSOR) { + attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0); + } else if (b_qs_ == QuantizationStrategy::PER_OUTPUT_CHANNEL) { + attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2); + } + + if (key.use_bias) { + // For PER_TOKEN, bias will be applied in epilogue + assert(a_qs_ == QuantizationStrategy::PER_TENSOR); + dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1}); + return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md, + c_md, attr); + } else { + return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md, + attr); + } +} diff --git a/csrc/cpu/dnnl_helper.h b/csrc/cpu/dnnl_helper.h new file mode 100644 index 000000000000..54ceefced9e9 --- /dev/null +++ b/csrc/cpu/dnnl_helper.h @@ -0,0 +1,169 @@ +#ifndef DNNL_HELPER_H +#define DNNL_HELPER_H + +#include +#include + +#include "oneapi/dnnl/dnnl.hpp" + +namespace c10 { +struct BFloat16; +struct Half; +} // namespace c10 + +namespace dnnl { +namespace impl { +struct memory_storage_t; +struct matmul_pd_t; +struct matmul_desc_t; +} // namespace impl +} // namespace dnnl +struct dnnl_memory_desc; + +template +class DNNLPrimitiveCache; + +template +struct DNNLType { + static constexpr dnnl::memory::data_type type = + dnnl::memory::data_type::undef; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16; +}; + +template +constexpr inline dnnl::memory::data_type get_dnnl_type() { + return DNNLType>::type; +} + +class DNNLMatMulPrimitiveHandler { + public: + virtual ~DNNLMatMulPrimitiveHandler() = default; + + protected: + struct Args { + dnnl_dim_t b_n_size; + dnnl_dim_t b_n_stride; + dnnl_dim_t b_k_size; + dnnl_dim_t b_k_stride; + void* b_ptr; + dnnl::memory::data_type c_type; + size_t primitive_cache_size; + }; + + protected: + DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type); + + void prepack_weight(void* original_b_ptr, + dnnl::memory::desc b_target_mem_desc); + + void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr); 
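// A minimal usage sketch of the W8A8 handler built on this base class, assuming
// int8 activations with a per-tensor scale, per-output-channel weight scales,
// and bf16 output; local variable names (IC, OC, M, *_ptr) are illustrative only:
//
//   W8A8MatMulPrimitiveHandler::Args args;
//   args.b_k_size = IC;   args.b_k_stride = 1;    // B is [IC, OC], column-major
//   args.b_n_size = OC;   args.b_n_stride = IC;
//   args.b_ptr = b_int8_ptr;
//   args.b_scales_ptr = b_scales_ptr;
//   args.b_quantization_strategy =
//       W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_OUTPUT_CHANNEL;
//   args.a_quantization_strategy =
//       W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR;
//   args.use_a_zero_point = false;
//   args.c_type = get_dnnl_type<c10::BFloat16>();
//   args.primitive_cache_size = 128;
//   W8A8MatMulPrimitiveHandler handler(args);   // prepacks B once, at construction
//
//   W8A8MatMulPrimitiveHandler::ExecArgs exec;
//   exec.a_m_size = M;                           // part of the primitive cache key
//   exec.a_ptr = a_int8_ptr;
//   exec.a_scales_ptr = a_scale_ptr;             // single float for PER_TENSOR
//   exec.a_zero_points_ptr = nullptr;
//   exec.use_bias = false;
//   exec.bias_type = dnnl::memory::data_type::undef;
//   exec.bias_ptr = nullptr;
//   exec.c_ptr = c_bf16_ptr;
//   handler.execute(exec);                       // reuses the cached matmul primitive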
+ + std::pair + get_runtime_memory_ptr(size_t index); + + protected: + const dnnl_dim_t b_n_size_; + const dnnl_dim_t b_n_stride_; + const dnnl_dim_t b_k_size_; + const dnnl_dim_t b_k_stride_; + dnnl::memory::data_type b_type_; + dnnl::memory::data_type c_type_; + std::unordered_map memory_cache_; + std::vector> + runtime_memory_ptrs_; + dnnl::memory::desc b_target_mem_desc_; + int64_t primitive_cache_size_; +}; + +class W8A8MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler { + public: + enum class QuantizationStrategy { PER_TOKEN, PER_TENSOR, PER_OUTPUT_CHANNEL }; + + struct Args : public DNNLMatMulPrimitiveHandler::Args { + bool use_a_zero_point; + QuantizationStrategy a_quantization_strategy; + QuantizationStrategy b_quantization_strategy; + float* b_scales_ptr; + }; + + struct ClassMatmulCacheKey { + dnnl_dim_t b_n_size; + dnnl_dim_t b_k_size; + QuantizationStrategy a_qs; + QuantizationStrategy b_qs; + bool use_azp; + dnnl::memory::data_type c_type; + + friend bool operator==(const ClassMatmulCacheKey& l, + const ClassMatmulCacheKey& r); + }; + + struct MSizeCacheKey { + dnnl_dim_t a_m_size; + bool use_bias; + dnnl::memory::data_type bias_type; + + friend bool operator==(const MSizeCacheKey& l, const MSizeCacheKey& r); + }; + + using MSizeCache = DNNLPrimitiveCache; + using ClassMatmulCache = + DNNLPrimitiveCache>; + + struct ExecArgs : public MSizeCacheKey { + const int8_t* a_ptr; + const float* a_scales_ptr; + const int32_t* a_zero_points_ptr; + const void* bias_ptr; + void* c_ptr; + }; + + public: + W8A8MatMulPrimitiveHandler(const Args& args); + + QuantizationStrategy get_input_scale_strategy() const { return a_qs_; } + + bool get_input_use_zero_point() const { return use_azp_; } + + void execute(ExecArgs& args); + + private: + dnnl::matmul::primitive_desc create_primitive_desc(const MSizeCacheKey& key, + bool first_time); + + void init_runtime_memory_cache(const Args& args); + + dnnl::matmul get_matmul_cache(const MSizeCacheKey& key); + + private: + const bool use_azp_; + const QuantizationStrategy a_qs_; + const QuantizationStrategy b_qs_; + std::shared_ptr m_size_cache_; +}; + +#endif diff --git a/csrc/cpu/dnnl_helper.hpp b/csrc/cpu/dnnl_helper.hpp deleted file mode 100644 index 1cb8dc5b25a6..000000000000 --- a/csrc/cpu/dnnl_helper.hpp +++ /dev/null @@ -1,206 +0,0 @@ -#ifndef DNNL_HELPER_HPP -#define DNNL_HELPER_HPP - -#include -#include - -#include "oneapi/dnnl/dnnl.hpp" - -namespace { -template -struct DNNLType { - static constexpr dnnl::memory::data_type type = - dnnl::memory::data_type::undef; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16; -}; - -template -constexpr inline dnnl::memory::data_type get_dnnl_type() { - return DNNLType>::type; -} -}; // namespace - -template -class DNNLPrimitiveHelper { - public: - // I8 input GEMM kernel (C = a_scales * A @ (b_scales * B^T) + bias) - // A: [M, K], row-major - // B: [K, N], column-major - // C: [M, N], row-major - // bias: [N], row-major, optional - // a_scales: [MS] - // b_scales: [NS] 
- // Note: Due to the limitation of oneDNN - // (https://github.com/oneapi-src/oneDNN/issues/1636), the quantized bias is - // not supported. - - template - static void gemm_s8s8_jit(const int8_t* a, const int8_t* b, OutputT* c, - const BiasT* bias, dnnl_dim_t M, dnnl_dim_t N, - dnnl_dim_t K, const float* a_scales, - const float* b_scales, dnnl_dim_t MS, - dnnl_dim_t NS) { - auto&& OutputType = get_dnnl_type(); - auto&& BiasType = get_dnnl_type(); - - dnnl::memory::desc a_md({M, K}, dnnl::memory::data_type::s8, {K, 1}); - dnnl::memory::desc b_md({K, N}, dnnl::memory::data_type::s8, {1, K}); - dnnl::memory::desc c_md({M, N}, OutputType, {N, 1}); - - dnnl::primitive_attr attr; - if constexpr (!InputNoScale) { - if (MS == 1) { - // per-tensor - attr.set_scales_mask(DNNL_ARG_SRC, 0); - } else { - // per-token - TORCH_CHECK(false, "per-token quantization is unsupported."); - } - } - - if (NS == 1) { - // per-tensor - attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0); - } else { - // per-channel - attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2); - } - - dnnl::matmul::primitive_desc matmul_pd; -// Create memory descriptors with format_tag::any for the primitive. This -// enables the matmul primitive to choose memory layouts for an -// optimized primitive implementation, and these layouts may differ from the -// ones provided by the user. -#ifdef __aarch64__ - auto mat_src_md = dnnl::memory::desc({M, K}, dnnl::memory::data_type::s8, - dnnl::memory::format_tag::any); - auto mat_weights_md = dnnl::memory::desc( - {K, N}, dnnl::memory::data_type::s8, dnnl::memory::format_tag::any); - auto mat_dst_md = - dnnl::memory::desc({M, N}, OutputType, dnnl::memory::format_tag::any); - if (bias) { - dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1}); - matmul_pd = dnnl::matmul::primitive_desc(default_engine(), mat_src_md, - mat_weights_md, bias_md, - mat_dst_md, attr); - } else { - matmul_pd = dnnl::matmul::primitive_desc( - default_engine(), mat_src_md, mat_weights_md, mat_dst_md, attr); - } -#else - if (bias) { - dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1}); - matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, - bias_md, c_md, attr); - } else { - matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, - c_md, attr); - } -#endif - dnnl::matmul matmul(matmul_pd); - - auto& engine = default_engine(); - - dnnl::memory a_m(a_md, engine, (void*)a); - dnnl::memory b_m(b_md, engine, (void*)b); - dnnl::memory c_m(c_md, engine, (void*)c); - dnnl::memory a_scales_m({{MS}, dnnl::memory::data_type::f32, {1}}, engine, - (void*)a_scales); - dnnl::memory b_scales_m({{NS}, dnnl::memory::data_type::f32, {1}}, engine, - (void*)b_scales); - - auto& stream = default_stream(); - - auto mat_src_mem = a_m; - auto mat_weights_mem = b_m; - auto mat_dst_mem = c_m; -#ifdef __aarch64__ - if (matmul_pd.weights_desc() != b_m.get_desc()) { - mat_weights_mem = dnnl::memory(matmul_pd.weights_desc(), engine); - dnnl::reorder(b_m, mat_weights_mem).execute(stream, b_m, mat_weights_mem); - } -#endif - if constexpr (InputNoScale) { - if (bias) { - dnnl::memory::desc bias_md({N}, BiasType, {1}); - dnnl::memory bias_m(bias_md, engine, (void*)bias); - matmul.execute( - stream, { - {DNNL_ARG_SRC, mat_src_mem}, - {DNNL_ARG_WEIGHTS, mat_weights_mem}, - {DNNL_ARG_BIAS, bias_m}, - {DNNL_ARG_DST, mat_dst_mem}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, - }); - } else { - matmul.execute( - stream, { - {DNNL_ARG_SRC, mat_src_mem}, - {DNNL_ARG_WEIGHTS, mat_weights_mem}, - {DNNL_ARG_DST, mat_dst_mem}, - 
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, - }); - } - } else { - if (bias) { - dnnl::memory::desc bias_md({N}, BiasType, {1}); - dnnl::memory bias_m(bias_md, engine, (void*)bias); - matmul.execute( - stream, { - {DNNL_ARG_SRC, mat_src_mem}, - {DNNL_ARG_WEIGHTS, mat_weights_mem}, - {DNNL_ARG_BIAS, bias_m}, - {DNNL_ARG_DST, mat_dst_mem}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, - }); - } else { - matmul.execute( - stream, { - {DNNL_ARG_SRC, mat_src_mem}, - {DNNL_ARG_WEIGHTS, mat_weights_mem}, - {DNNL_ARG_DST, mat_dst_mem}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, - }); - } - } - stream.wait(); - } - - private: - static dnnl::engine& default_engine() { - static dnnl::engine engine(dnnl::engine::kind::cpu, 0); - return engine; - } - - static dnnl::stream& default_stream() { - static dnnl::stream stream(default_engine()); - return stream; - } -}; -#endif diff --git a/csrc/cpu/dnnl_kernels.cpp b/csrc/cpu/dnnl_kernels.cpp new file mode 100644 index 000000000000..acc3b9ecde14 --- /dev/null +++ b/csrc/cpu/dnnl_kernels.cpp @@ -0,0 +1,494 @@ +#include "cpu_types.hpp" +#include "dnnl_helper.h" + +namespace { +template +struct KernelVecType { + using load_vec_type = void; + using cvt_vec_type = void; +}; + +template <> +struct KernelVecType { + using load_vec_type = vec_op::FP32Vec16; + using cvt_vec_type = vec_op::FP32Vec16; +}; + +#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT) +template <> +struct KernelVecType { + using load_vec_type = vec_op::BF16Vec16; + using cvt_vec_type = vec_op::FP32Vec16; +}; +#endif + +template <> +struct KernelVecType { +#if defined(__powerpc64__) || defined(__s390x__) + // Power architecture-specific vector type + using load_vec_type = vec_op::FP32Vec16; +#else + // Fallback for other architectures + using load_vec_type = vec_op::FP16Vec16; +#endif + using cvt_vec_type = vec_op::FP32Vec16; +}; + +template +void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, + const float* scale, const int32_t* azp, + const int64_t num_tokens, + const int64_t input_stride, + const int64_t hidden_size) { + using load_vec_t = typename KernelVecType::load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int64_t vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + constexpr float i8_min = + static_cast(std::numeric_limits::min()); + constexpr float i8_max = + static_cast(std::numeric_limits::max()); + const cvt_vec_t inv_scale(1.0 / *scale); + const cvt_vec_t i8_min_vec(i8_min); + const cvt_vec_t i8_max_vec(i8_max); + + cvt_vec_t zp_vec; + if constexpr (AZP) { + zp_vec = cvt_vec_t(static_cast(*azp)); + } + +#pragma omp parallel for + for (int64_t i = 0; i < num_tokens; ++i) { + int64_t j = 0; + const scalar_t* input_ptr = input + i * input_stride; + int8_t* output_ptr = output + i * hidden_size; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = elems_fp32 * inv_scale; + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + zp_vec; + } + + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output_ptr + j); + } + + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = elems_fp32 * inv_scale; + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + zp_vec; + } + + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + 
vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output_ptr + j, hidden_size - j); + } +} + +template +void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, + float* scale, int32_t* azp, + const int64_t num_tokens, + const int64_t input_stride, + const int64_t hidden_size) { + using load_vec_t = typename KernelVecType::load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + constexpr float i8_min = + static_cast(std::numeric_limits::min()); + constexpr float i8_max = + static_cast(std::numeric_limits::max()); + const cvt_vec_t i8_min_vec(i8_min); + const cvt_vec_t i8_max_vec(i8_max); + +#pragma omp parallel for + for (int64_t i = 0; i < num_tokens; ++i) { + cvt_vec_t max_value(std::numeric_limits::lowest()); + cvt_vec_t min_value(std::numeric_limits::max()); + { + int64_t j = 0; + const scalar_t* input_ptr = input + i * input_stride; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + if constexpr (AZP) { + max_value = max_value.max(elems_fp32); + min_value = min_value.min(elems_fp32); + } else { + max_value = max_value.max(elems_fp32.abs()); + } + } + + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + + if (j + vec_elem_num == hidden_size) { + if constexpr (AZP) { + max_value = max_value.max(elems_fp32); + min_value = min_value.min(elems_fp32); + } else { + max_value = max_value.max(elems_fp32.abs()); + } + } else { + if constexpr (AZP) { + max_value = max_value.max(elems_fp32, hidden_size - j); + min_value = min_value.min(elems_fp32, hidden_size - j); + } else { + max_value = max_value.max(elems_fp32.abs(), hidden_size - j); + } + } + } + + float scale_val, azp_val; + if constexpr (AZP) { + float max_scalar = max_value.reduce_max(); + float min_scalar = min_value.reduce_min(); + scale_val = (max_scalar - min_scalar) / 255.0f; + azp_val = std::nearbyint(-128.0f - min_scalar / scale_val); + azp[i] = azp_val; + scale[i] = scale_val; + } else { + scale_val = max_value.reduce_max() / 127.0f; + scale[i] = scale_val; + } + + const cvt_vec_t inv_scale(1.0 / scale_val); + const cvt_vec_t azp_vec(azp_val); + + { + int64_t j = 0; + const scalar_t* input_ptr = input + i * input_stride; + int8_t* output_ptr = output + i * hidden_size; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = (elems_fp32 * inv_scale); + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + azp_vec; + } + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output_ptr + j); + } + + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = (elems_fp32 * inv_scale); + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + azp_vec; + } + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output_ptr + j, hidden_size - j); + } + } +} + +template +void dynamic_quant_epilogue(const float* input, scalar_t* output, + const float* a_scale, const int32_t* azp, + const float* azp_adj, const scalar_t* bias, + const int64_t num_tokens, + const int64_t hidden_size) { + CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue) + using load_vec_t = typename KernelVecType::load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + const int64_t 
thread_num = omp_get_max_threads(); + if (num_tokens > thread_num) { +#pragma omp parallel for + for (int64_t i = 0; i < num_tokens; ++i) { + const float* input_ptr = input + i * hidden_size; + scalar_t* output_ptr = output + i * hidden_size; + int64_t j = 0; + cvt_vec_t token_scale_vec(a_scale[i]); + cvt_vec_t token_zp_scale_vec; + if constexpr (AZP) { + float zp_scale_val = a_scale[i] * static_cast(azp[i]); + token_zp_scale_vec = cvt_vec_t(zp_scale_val); + } + for (; j < hidden_size - vec_elem_num; ++j) { + cvt_vec_t elems_fp32(input_ptr + j); + elems_fp32 = elems_fp32 * token_scale_vec; + if constexpr (AZP) { + cvt_vec_t azp_adj_fp32(azp_adj + j); + elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec; + } + if constexpr (Bias) { + load_vec_t bias_vec(bias + j); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + load_vec_t elems_out(elems_fp32); + elems_out.save(output_ptr + j); + } + cvt_vec_t elems_fp32(input_ptr + j); + elems_fp32 = elems_fp32 * token_scale_vec; + if constexpr (AZP) { + cvt_vec_t azp_adj_fp32(azp_adj + j); + elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec; + } + if constexpr (Bias) { + load_vec_t bias_vec(bias + j); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + load_vec_t elems_out(elems_fp32); + elems_out.save(output_ptr + j, hidden_size - j); + } + } else { + const int64_t vec_iteration = + (hidden_size + vec_elem_num - 1) / vec_elem_num; + const int64_t vec_iteration_per_thread = + (vec_iteration + thread_num - 1) / thread_num; + const int64_t elem_num_per_thread = vec_iteration_per_thread * vec_elem_num; +#pragma omp parallel for schedule(static, 1) + for (int64_t i = 0; i < thread_num; ++i) { + const int64_t start = elem_num_per_thread * i; + const int64_t end = std::min(hidden_size, elem_num_per_thread + start); + for (int64_t j = 0; j < num_tokens; ++j) { + cvt_vec_t token_scale_vec(a_scale[j]); + cvt_vec_t token_zp_scale_vec; + if constexpr (AZP) { + float zp_scale_val = a_scale[j] * static_cast(azp[j]); + token_zp_scale_vec = cvt_vec_t(zp_scale_val); + } + int64_t k = start; + const float* input_ptr = input + j * hidden_size; + scalar_t* output_ptr = output + j * hidden_size; + for (; k < end - vec_elem_num; k += vec_elem_num) { + cvt_vec_t elems_fp32(input_ptr + k); + elems_fp32 = elems_fp32 * token_scale_vec; + if constexpr (AZP) { + cvt_vec_t azp_adj_fp32(azp_adj + k); + elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec; + } + if constexpr (Bias) { + load_vec_t bias_vec(bias + k); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + load_vec_t elems_out(elems_fp32); + elems_out.save(output_ptr + k); + } + if (k < end) { + cvt_vec_t elems_fp32(input_ptr + k); + elems_fp32 = elems_fp32 * token_scale_vec; + if constexpr (AZP) { + cvt_vec_t azp_adj_fp32(azp_adj + k); + elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec; + } + if constexpr (Bias) { + load_vec_t bias_vec(bias + k); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + load_vec_t elems_out(elems_fp32); + elems_out.save(output_ptr + k, end - k); + } + } + } + } +} +} // namespace + +int64_t create_onednn_scaled_mm_handler( + const torch::Tensor& b, // [IC, OC], column-major + const torch::Tensor& b_scales, // [1] or [OC] + at::ScalarType output_type, bool dynamic_act_quant, bool use_azp, + int64_t primitive_cache_size) { + TORCH_CHECK(b.dim() == 2); + TORCH_CHECK(b.stride(0) == 1); // Column-major + 
TORCH_CHECK(b_scales.is_contiguous()); + + W8A8MatMulPrimitiveHandler::Args args; + args.primitive_cache_size = primitive_cache_size; + + if (b_scales.numel() == 1) { + args.b_quantization_strategy = + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR; + } else { + TORCH_CHECK_EQ(b_scales.numel(), b.size(1)); + args.b_quantization_strategy = + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_OUTPUT_CHANNEL; + } + args.b_scales_ptr = b_scales.data_ptr(); + args.b_k_size = b.size(0); + args.b_k_stride = b.stride(0); + args.b_n_size = b.size(1); + args.b_n_stride = b.stride(1); + args.b_ptr = b.data_ptr(); + + if (dynamic_act_quant) { + // dynamic per-token, bias, A scales and A zps will be applied in outside. + args.a_quantization_strategy = + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TOKEN; + args.use_a_zero_point = false; + } else { + // static per-tensor + args.a_quantization_strategy = + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR; + args.use_a_zero_point = use_azp; + } + + VLLM_DISPATCH_FLOATING_TYPES(output_type, "create_onednn_scaled_mm_handler", + [&] { + if (dynamic_act_quant) { + args.c_type = get_dnnl_type(); + } else { + args.c_type = get_dnnl_type(); + } + }); + + return reinterpret_cast(new W8A8MatMulPrimitiveHandler(args)); +} + +void onednn_scaled_mm( + torch::Tensor& c, // [M, OC], row-major + const torch::Tensor& a, // [M, IC], row-major + const torch::Tensor& a_scales, // [M] or [1] + const std::optional& azp, // [M] or [1] + const std::optional& azp_adj, // [M] or [1] + const std::optional& bias, // [N] + int64_t handler) { + CPU_KERNEL_GUARD_IN(onednn_scaled_mm) + TORCH_CHECK(a.dim() == 2); + TORCH_CHECK(a.is_contiguous()); + TORCH_CHECK(c.is_contiguous()); + W8A8MatMulPrimitiveHandler* ptr = + reinterpret_cast(handler); + const int32_t* azp_ptr = nullptr; + if (azp.has_value()) { + azp_ptr = azp->data_ptr(); + } + if (ptr->get_input_scale_strategy() == + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR) { + TORCH_CHECK_EQ(a_scales.numel(), 1); + } + + W8A8MatMulPrimitiveHandler::ExecArgs exec_args; + exec_args.a_ptr = a.data_ptr(); + exec_args.a_m_size = a.size(0); + exec_args.bias_ptr = nullptr; + exec_args.use_bias = false; + exec_args.a_scales_ptr = nullptr; + exec_args.a_zero_points_ptr = nullptr; + + VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "onednn_scaled_mm", [&] { + if (ptr->get_input_scale_strategy() == + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR) { + if (bias.has_value()) { + exec_args.bias_ptr = bias->data_ptr(); + exec_args.bias_type = get_dnnl_type(); + exec_args.use_bias = true; + } + exec_args.a_scales_ptr = a_scales.data_ptr(); + exec_args.a_zero_points_ptr = azp_ptr; + exec_args.c_ptr = c.data_ptr(); + ptr->execute(exec_args); + } else if (ptr->get_input_scale_strategy() == + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TOKEN) { + torch::Tensor tmp_fp32_out = + torch::empty_like(c, ::at::ScalarType::Float); + exec_args.c_ptr = tmp_fp32_out.data_ptr(); + ptr->execute(exec_args); + if (bias.has_value()) { + if (azp.has_value()) { + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), azp_ptr, azp_adj->data_ptr(), + bias->data_ptr(), c.size(0), c.size(1)); + } else { + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), azp_ptr, nullptr, + bias->data_ptr(), c.size(0), c.size(1)); + } + } else { + if (azp.has_value()) { + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + 
a_scales.data_ptr(), azp_ptr, azp_adj->data_ptr(), + (scalar_t*)nullptr, c.size(0), c.size(1)); + } else { + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), azp_ptr, nullptr, (scalar_t*)nullptr, + c.size(0), c.size(1)); + } + } + } else { + TORCH_CHECK(false, "invalid act quant type."); + } + }); +} + +// static-per-tensor quantization. +void static_scaled_int8_quant( + torch::Tensor& out, // [batch, hidden_size] + const torch::Tensor& input, // [batch, hidden_size] + const torch::Tensor& scale, std::optional const& azp) { + CPU_KERNEL_GUARD_IN(static_scaled_int8_quant) + TORCH_CHECK(out.is_contiguous()); + TORCH_CHECK_EQ(input.dim(), 2); + TORCH_CHECK_EQ(input.stride(1), 1); + TORCH_CHECK(scale.numel() == 1); + TORCH_CHECK(!azp.has_value() || azp->numel() == 1); + + const int64_t stride = input.stride(0); + const int64_t hidden_size = input.size(1); + const int64_t num_tokens = input.size(0); + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "static_scaled_int8_quant_impl", [&] { + if (azp.has_value()) { + static_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), azp->data_ptr(), num_tokens, + stride, hidden_size); + } else { + static_scaled_int8_quant_impl(input.data_ptr(), + out.data_ptr(), + scale.data_ptr(), nullptr, + num_tokens, stride, hidden_size); + } + }); +} + +// dynamic-per-token quantization. +void dynamic_scaled_int8_quant( + torch::Tensor& out, // [batch, hidden_size] + const torch::Tensor& input, // [batch, hidden_size] + torch::Tensor& scale, // [batch, 1] + std::optional const& azp) { + CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant) + TORCH_CHECK(out.is_contiguous()); + TORCH_CHECK_EQ(input.dim(), 2); + TORCH_CHECK_EQ(input.stride(1), 1); + + const int64_t hidden_size = input.size(1); + const int64_t num_tokens = input.size(0); + const int64_t stride = input.stride(0); + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] { + if (azp.has_value()) { + dynamic_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), azp->data_ptr(), num_tokens, + stride, hidden_size); + } else { + dynamic_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), nullptr, num_tokens, stride, + hidden_size); + } + }); +} diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp deleted file mode 100644 index 6e120b8d20a7..000000000000 --- a/csrc/cpu/quant.cpp +++ /dev/null @@ -1,951 +0,0 @@ -#include "cpu_types.hpp" -#include "dnnl_helper.hpp" - -namespace { -template -struct KernelVecType { - using load_vec_type = void; - using azp_adj_load_vec_type = void; - using cvt_vec_type = void; -}; - -template <> -struct KernelVecType { - using load_vec_type = vec_op::FP32Vec16; - using azp_adj_load_vec_type = vec_op::INT32Vec16; - using cvt_vec_type = vec_op::FP32Vec16; -}; - -#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT) -template <> -struct KernelVecType { - using load_vec_type = vec_op::BF16Vec16; - using azp_adj_load_vec_type = vec_op::INT32Vec16; - using cvt_vec_type = vec_op::FP32Vec16; -}; -#endif - -template <> -struct KernelVecType { -#if defined(__powerpc64__) || defined(__s390x__) - // Power architecture-specific vector type - using load_vec_type = vec_op::FP32Vec16; -#else - // Fallback for other architectures - using load_vec_type = vec_op::FP16Vec16; -#endif - using azp_adj_load_vec_type = vec_op::INT32Vec16; - using cvt_vec_type = vec_op::FP32Vec16; -}; - -#if defined(__AVX512F__) || defined(__aarch64__) -template -void 
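// A rough sketch of how the new oneDNN entry points above are intended to be
// chained together, assuming dynamic per-token activation quantization of a
// bf16 input; tensor names (a_bf16, a_int8, a_scales, b, b_scales, c, bias)
// are illustrative, not part of this patch:
//
//   // once, at weight-loading time: B is [IC, OC] column-major int8
//   int64_t handler = create_onednn_scaled_mm_handler(
//       b, b_scales, at::ScalarType::BFloat16,
//       /*dynamic_act_quant=*/true, /*use_azp=*/false,
//       /*primitive_cache_size=*/128);
//
//   // per forward pass: quantize the activation, then run the GEMM + epilogue
//   dynamic_scaled_int8_quant(a_int8, a_bf16, a_scales, /*azp=*/std::nullopt);
//   onednn_scaled_mm(c, a_int8, a_scales, /*azp=*/std::nullopt,
//                    /*azp_adj=*/std::nullopt, bias, handler);
//
//   // release_dnnl_matmul_handler(handler) frees the prepacked weight when done.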
static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - const float* scale, const int32_t* azp, - const int num_tokens, - const int hidden_size) { - using load_vec_t = typename KernelVecType::load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - constexpr float i8_min = - static_cast(std::numeric_limits::min()); - constexpr float i8_max = - static_cast(std::numeric_limits::max()); - const cvt_vec_t inv_scale(1.0 / *scale); - const cvt_vec_t i8_min_vec(i8_min); - const cvt_vec_t i8_max_vec(i8_max); - - cvt_vec_t zp_vec; - if constexpr (AZP) { - zp_vec = cvt_vec_t(static_cast(*azp)); - } - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = elems_fp32 * inv_scale; - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + zp_vec; - } - - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j); - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = elems_fp32 * inv_scale; - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + zp_vec; - } - - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j, hidden_size - j); - } -} - -template -void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - float* scale, int32_t* azp, - const int num_tokens, - const int hidden_size) { - using load_vec_t = typename KernelVecType::load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - constexpr float i8_min = - static_cast(std::numeric_limits::min()); - constexpr float i8_max = - static_cast(std::numeric_limits::max()); - const cvt_vec_t i8_min_vec(i8_min); - const cvt_vec_t i8_max_vec(i8_max); - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - cvt_vec_t max_value(std::numeric_limits::lowest()); - cvt_vec_t min_value(std::numeric_limits::max()); - { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - if constexpr (AZP) { - max_value = max_value.max(elems_fp32); - min_value = min_value.min(elems_fp32); - } else { - max_value = max_value.max(elems_fp32.abs()); - } - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - - if (j + vec_elem_num == hidden_size) { - if constexpr (AZP) { - max_value = max_value.max(elems_fp32); - min_value = min_value.min(elems_fp32); - } else { - max_value = max_value.max(elems_fp32.abs()); - } - } else { - if constexpr (AZP) { - max_value = max_value.max(elems_fp32, hidden_size - j); - min_value = min_value.min(elems_fp32, hidden_size - j); - } else { - max_value = max_value.max(elems_fp32.abs(), hidden_size - j); - } - } - } - - float scale_val, azp_val; - if constexpr (AZP) { - float max_scalar = max_value.reduce_max(); - float min_scalar = min_value.reduce_min(); - scale_val = (max_scalar - min_scalar) / 255.0f; - azp_val = std::nearbyint(-128.0f - min_scalar / scale_val); - azp[i] = static_cast(azp_val); - scale[i] = scale_val; - } else { - scale_val = max_value.reduce_max() / 127.0f; - scale[i] = scale_val; - } - - const cvt_vec_t 
inv_scale(1.0 / scale_val); - const cvt_vec_t azp_vec(azp_val); - - { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = (elems_fp32 * inv_scale); - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + azp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j); - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = (elems_fp32 * inv_scale); - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + azp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j, hidden_size - j); - } - } -} - -template -void static_quant_epilogue(const float* input, scalar_t* output, - const float a_scale, const float* b_scale, - const int32_t* azp_with_adj, const int num_tokens, - const int hidden_size) { - CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl) - using load_vec_t = typename KernelVecType::load_vec_type; - using azp_adj_load_vec_t = - typename KernelVecType::azp_adj_load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - cvt_vec_t a_scale_vec(a_scale); - cvt_vec_t b_scale_vec(*b_scale); - cvt_vec_t scale_vec = a_scale_vec * b_scale_vec; - - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - cvt_vec_t elems_fp32(input + i * hidden_size + j); - azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - - if constexpr (PerChannel) { - b_scale_vec = cvt_vec_t(b_scale + j); - scale_vec = b_scale_vec * a_scale_vec; - } - - elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j); - } - - cvt_vec_t elems_fp32(input + i * hidden_size + j); - azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - - if constexpr (PerChannel) { - b_scale_vec = cvt_vec_t(b_scale + j); - scale_vec = b_scale_vec * a_scale_vec; - } - - elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j, hidden_size - j); - } -} - -template -void dynamic_quant_epilogue(const float* input, scalar_t* output, - const float* a_scale, const float* b_scale, - const int32_t* azp, const int32_t* azp_adj, - const scalar_t* bias, const int num_tokens, - const int hidden_size) { - CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue) - using load_vec_t = typename KernelVecType::load_vec_type; - using azp_adj_load_vec_t = - typename KernelVecType::azp_adj_load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - int j = 0; - cvt_vec_t token_scale_vec(a_scale[i]); - cvt_vec_t token_zp_scale_vec; - if constexpr (AZP) { - float zp_scale_val = a_scale[i] * static_cast(azp[i]); - if constexpr (!PerChannel) { - zp_scale_val *= *b_scale; - } - token_zp_scale_vec = cvt_vec_t(zp_scale_val); - } - - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - cvt_vec_t elems_fp32(input + i * hidden_size + j); - elems_fp32 = elems_fp32 * token_scale_vec; - - if constexpr (AZP) { - 
azp_adj_load_vec_t azp_adj_vec(azp_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; - - if constexpr (PerChannel) { - cvt_vec_t b_scale_vec(b_scale + j); - azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; - } - - elems_fp32 = elems_fp32 - azp_adj_fp32; - } - - if constexpr (Bias) { - load_vec_t bias_vec(bias + j); - cvt_vec_t bias_vec_fp32(bias_vec); - elems_fp32 = elems_fp32 + bias_vec_fp32; - } - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j); - } - - cvt_vec_t elems_fp32(input + i * hidden_size + j); - elems_fp32 = elems_fp32 * token_scale_vec; - - if constexpr (AZP) { - azp_adj_load_vec_t azp_adj_vec(azp_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; - - if constexpr (PerChannel) { - cvt_vec_t b_scale_vec(b_scale + j); - azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; - } - - elems_fp32 = elems_fp32 - azp_adj_fp32; - } - - if constexpr (Bias) { - load_vec_t bias_vec(bias + j); - cvt_vec_t bias_vec_fp32(bias_vec); - elems_fp32 = elems_fp32 + bias_vec_fp32; - } - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j, hidden_size - j); - } -} -#elif defined(__powerpc64__) -template -void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - const float* scale, const int32_t* azp, - const int num_tokens, - const int hidden_size) { - using load_vec_t = typename KernelVecType::load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - constexpr float i8_min = - static_cast(std::numeric_limits::min()); - constexpr float i8_max = - static_cast(std::numeric_limits::max()); - - const cvt_vec_t inv_scale(1.0 / *scale); - const cvt_vec_t i8_min_vec(i8_min); - const cvt_vec_t i8_max_vec(i8_max); - - cvt_vec_t zp_vec; - if constexpr (AZP) { - zp_vec = cvt_vec_t(static_cast(*azp)); - } - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = elems_fp32 * inv_scale; - if constexpr (AZP) { - elems_fp32 = elems_fp32 + zp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j); - } - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = elems_fp32 * inv_scale; - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + zp_vec; - } - - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j, hidden_size - j); - } -} -template -void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - float* scale, int32_t* azp, - const int num_tokens, - const int hidden_size) { - using load_vec_t = typename KernelVecType::load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - constexpr float i8_min = - static_cast(std::numeric_limits::min()); - constexpr float i8_max = - static_cast(std::numeric_limits::max()); - const cvt_vec_t i8_min_vec(i8_min); - const cvt_vec_t i8_max_vec(i8_max); - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - cvt_vec_t max_value(std::numeric_limits::lowest()); - cvt_vec_t min_value(std::numeric_limits::max()); - { - int j = 0; - for (; 
j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - if constexpr (AZP) { - max_value = max_value.max(elems_fp32); - min_value = min_value.min(elems_fp32); - } else { - max_value = max_value.max(elems_fp32.abs()); - } - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - - if (j + vec_elem_num == hidden_size) { - if constexpr (AZP) { - max_value = max_value.max(elems_fp32); - min_value = min_value.min(elems_fp32); - } else { - max_value = max_value.max(elems_fp32.abs()); - } - } else { - if constexpr (AZP) { - max_value = max_value.max(elems_fp32, hidden_size - j); - min_value = min_value.min(elems_fp32, hidden_size - j); - } else { - max_value = max_value.max(elems_fp32.abs(), hidden_size - j); - } - } - } - - float scale_val, azp_val; - if constexpr (AZP) { - float max_scalar = max_value.reduce_max(); - float min_scalar = min_value.reduce_min(); - scale_val = (max_scalar - min_scalar) / 255.0f; - azp_val = std::nearbyint(-128.0f - min_scalar / scale_val); - azp[i] = static_cast(azp_val); - scale[i] = scale_val; - } else { - scale_val = max_value.reduce_max() / 127.0f; - scale[i] = scale_val; - } - - const cvt_vec_t inv_scale(1.0 / scale_val); - const cvt_vec_t azp_vec(azp_val); - - { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = (elems_fp32 * inv_scale); - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + azp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j); - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = (elems_fp32 * inv_scale); - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + azp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j, hidden_size - j); - } - } -} -template -void static_quant_epilogue(const float* input, scalar_t* output, - const float a_scale, const float* b_scale, - const int32_t* azp_with_adj, const int num_tokens, - const int hidden_size) { - CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl) - using load_vec_t = typename KernelVecType::load_vec_type; - using azp_adj_load_vec_t = - typename KernelVecType::azp_adj_load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - cvt_vec_t a_scale_vec(a_scale); - cvt_vec_t b_scale_vec(*b_scale); - cvt_vec_t scale_vec = a_scale_vec * b_scale_vec; - - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - cvt_vec_t elems_fp32(input + i * hidden_size + j); - azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - - if constexpr (PerChannel) { - b_scale_vec = cvt_vec_t(b_scale + j); - scale_vec = b_scale_vec * a_scale_vec; - } - elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j); - } - - cvt_vec_t elems_fp32(input + i * hidden_size + j); - azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - - if constexpr (PerChannel) { - b_scale_vec = cvt_vec_t(b_scale + j); - scale_vec = b_scale_vec * a_scale_vec; - } - - elems_fp32 
= elems_fp32 - scale_vec * azp_adj_fp32; - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j, hidden_size - j); - } -} -template -void dynamic_quant_epilogue(const float* input, scalar_t* output, - const float* a_scale, const float* b_scale, - const int32_t* azp, const int32_t* azp_adj, - const scalar_t* bias, const int num_tokens, - const int hidden_size) { - CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue) - using load_vec_t = typename KernelVecType::load_vec_type; - using azp_adj_load_vec_t = - typename KernelVecType::azp_adj_load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - int j = 0; - cvt_vec_t token_scale_vec(a_scale[i]); - cvt_vec_t token_zp_scale_vec; - if constexpr (AZP) { - float zp_scale_val = a_scale[i] * static_cast(azp[i]); - if constexpr (!PerChannel) { - zp_scale_val *= *b_scale; - } - token_zp_scale_vec = cvt_vec_t(zp_scale_val); - } - - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - cvt_vec_t elems_fp32(input + i * hidden_size + j); - elems_fp32 = elems_fp32 * token_scale_vec; - - if constexpr (AZP) { - azp_adj_load_vec_t azp_adj_vec(azp_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; - - if constexpr (PerChannel) { - cvt_vec_t b_scale_vec(b_scale + j); - azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; - } - - elems_fp32 = elems_fp32 - azp_adj_fp32; - } - - if constexpr (Bias) { - load_vec_t bias_vec(bias + j); - cvt_vec_t bias_vec_fp32(bias_vec); - elems_fp32 = elems_fp32 + bias_vec_fp32; - } - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j); - } - - cvt_vec_t elems_fp32(input + i * hidden_size + j); - elems_fp32 = elems_fp32 * token_scale_vec; - - if constexpr (AZP) { - azp_adj_load_vec_t azp_adj_vec(azp_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; - - if constexpr (PerChannel) { - cvt_vec_t b_scale_vec(b_scale + j); - azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; - } - - elems_fp32 = elems_fp32 - azp_adj_fp32; - } - - if constexpr (Bias) { - load_vec_t bias_vec(bias + j); - cvt_vec_t bias_vec_fp32(bias_vec); - elems_fp32 = elems_fp32 + bias_vec_fp32; - } - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j, hidden_size - j); - } -} -#else -template -void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - const float* scale, const int32_t* azp, - const int num_tokens, - const int hidden_size) { - TORCH_CHECK(false, - "static_scaled_int8_quant_impl requires AVX512/powerpc64/AArch64 " - "support.") -} - -template -void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - float* scale, int32_t* azp, - const int num_tokens, - const int hidden_size) { - TORCH_CHECK(false, - "dynamic_scaled_int8_quant_impl requires " - "AVX512/powerpc64/AArch64 support.") -} - -template -void static_quant_epilogue(const float* input, scalar_t* output, - const float a_scale, const float* b_scale, - const int32_t* azp_with_adj, const int num_tokens, - const int hidden_size) { - TORCH_CHECK( - false, "static_quant_epilogue requires AVX512/powerpc64/AArch64 support.") -} - -template -void dynamic_quant_epilogue(const float* input, scalar_t* output, - const float* a_scale, const float* b_scale, - const int32_t* azp, const int32_t* azp_with_adj, - const scalar_t* bias, const int num_tokens, - const int 
hidden_size) { - TORCH_CHECK( - false, - "dynamic_quant_epilogue requires AVX512/powerpc64/AArch64 support.") -} -#endif -} // namespace - -void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major - const torch::Tensor& a, // [M, IC], row-major - const torch::Tensor& b, // [IC, OC], column-major - const torch::Tensor& a_scales, // [1] or [M] - const torch::Tensor& b_scales, // [1] or [OC] - const std::optional& bias // [OC] -) { - CPU_KERNEL_GUARD_IN(cutlass_scaled_mm) - // Checks for conformality - TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8, - "int8_scaled_mm only supports INT8 inputs.") - TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); - TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && - b.size(1) == c.size(1)); - TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0)); - TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); - - // Check for strides and alignment - TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major - TORCH_CHECK(b.stride(0) == 1); // Column-major - TORCH_CHECK(c.stride(0) % 16 == 0 && - b.stride(1) % 16 == 0); // 16 Byte Alignment - TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); - - if (bias) { - TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() && - bias->dim() == 1); - } - - VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm", [&] { - if (a_scales.numel() != 1) { - // per-token - // Note: oneDNN doesn't support per-token activation quantization - // Ideally we want to fuse the GEMM and the scale procedure with oneDNN - // JIT, the intermediate data is cached in registers or L1. But for now - // the oneDNN GEMM code generation only supports two quantization - // patterns: per-tensor or per-output-channel of weight. - // So we have to apply the per-token scale with a 'epilogue'. In C=s_a * - // s_b * (A@B) + bias, the C_inter = s_b * (A@B) is computed by oneDNN - // GEMM, then the per-token scale (and bias) is applied with the epilogue - // C=s_a * C_inter + bias. 
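For reference, the per-token epilogue decomposition described in the comment above (C = s_a * s_b * (A@B) + bias computed as C_inter = s_b * (A@B) followed by C = s_a * C_inter + bias) can be checked with a few lines of PyTorch. This is an illustrative sketch with made-up shapes, not vLLM code:

```python
import torch

M, K, N = 4, 8, 16
A = torch.randint(-128, 128, (M, K)).float()   # int8 activations (held as fp32 here)
B = torch.randint(-128, 128, (K, N)).float()   # int8 weights (held as fp32 here)
s_a = torch.rand(M, 1)                         # per-token activation scales
s_b = torch.rand(1, N)                         # per-output-channel weight scales
bias = torch.rand(N)

# Direct formula: C = s_a * s_b * (A @ B) + bias
direct = s_a * s_b * (A @ B) + bias

# Two-step form used here: oneDNN computes C_inter = s_b * (A @ B),
# then the epilogue applies the per-token scale and the bias.
c_inter = s_b * (A @ B)
epilogue = s_a * c_inter + bias

assert torch.allclose(direct, epilogue)
```

The two forms differ only in the order the scales are applied, which is why the per-token factor can be deferred to the epilogue.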
- torch::Tensor tmp_fp32_out = - torch::empty_like(c, ::at::ScalarType::Float); - // Compute C_inter=s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), - a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel()); - if (bias.has_value()) { - // Compute C=s_a * C_inter + bias - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), nullptr, nullptr, nullptr, - bias->data_ptr(), c.size(0), c.size(1)); - } else { - // Compute C=s_a * C_inter - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), nullptr, nullptr, nullptr, nullptr, - c.size(0), c.size(1)); - } - } else { - // per-tensor - if (bias.has_value()) { - // Compute C=s_a * s_b * (A@B) + bias - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), c.data_ptr(), - bias->data_ptr(), a.size(0), b.size(1), a.size(1), - a_scales.data_ptr(), b_scales.data_ptr(), - a_scales.numel(), b_scales.numel()); - } else { - // Compute C=s_a * s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), c.data_ptr(), - nullptr, a.size(0), b.size(1), a.size(1), - a_scales.data_ptr(), b_scales.data_ptr(), - a_scales.numel(), b_scales.numel()); - } - } - }); -} - -void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major - const torch::Tensor& a, // [M, IC], row-major - const torch::Tensor& b, // [IC, OC], column-major - const torch::Tensor& a_scales, // [1] or [M] - const torch::Tensor& b_scales, // [1] or [OC] - const torch::Tensor& azp_adj, // [OC] - const std::optional& azp, // [1] or [M] - const std::optional& bias // [OC] -) { - CPU_KERNEL_GUARD_IN(cutlass_scaled_mm_azp) - // Checks for conformality - TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8, - "int8_scaled_mm_azp only supports INT8 inputs.") - TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); - TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && - b.size(1) == c.size(1)); - TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0)); - TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); - - // Check for strides and alignment - TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major - TORCH_CHECK(b.stride(0) == 1); // Column-major - TORCH_CHECK(c.stride(0) % 16 == 0 && - b.stride(1) % 16 == 0); // 16 Byte Alignment - TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); - - if (bias) { - TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous()); - } - if (azp) { - TORCH_CHECK(azp->numel() == a.size(0) && azp->is_contiguous()); - } - TORCH_CHECK(azp_adj.numel() == b.size(1) && azp_adj.is_contiguous()); - - // azp & bias types - TORCH_CHECK(azp_adj.dtype() == torch::kInt32); - TORCH_CHECK(!azp || azp->dtype() == torch::kInt32); - TORCH_CHECK(!bias || bias->dtype() == c.dtype(), - "currently bias dtype must match output dtype ", c.dtype()); - - VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm_azp", [&] { - torch::Tensor tmp_fp32_out = torch::empty_like(c, ::at::ScalarType::Float); - if (a_scales.numel() != 1) { - // per-token - // Note: oneDNN doesn't support per-token activation quantization - // Compute C_inter=s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), - a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel()); - if (bias.has_value()) { - // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj + bias 
- if (b_scales.numel() != 1) { - // Per-Channel - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), b_scales.data_ptr(), - azp->data_ptr(), azp_adj.data_ptr(), - bias->data_ptr(), c.size(0), c.size(1)); - } else { - // Per-Tensor - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), b_scales.data_ptr(), - azp->data_ptr(), azp_adj.data_ptr(), - bias->data_ptr(), c.size(0), c.size(1)); - } - } else { - // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj - if (b_scales.numel() != 1) { - // Per-Channel - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), b_scales.data_ptr(), - azp->data_ptr(), azp_adj.data_ptr(), nullptr, - c.size(0), c.size(1)); - } else { - // Per-Tensor - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), b_scales.data_ptr(), - azp->data_ptr(), azp_adj.data_ptr(), nullptr, - c.size(0), c.size(1)); - } - } - } else { - // per-tensor - if (bias.has_value()) { - // Compute C_inter=s_a * s_b * (A@B) + bias - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), bias->data_ptr(), - a.size(0), b.size(1), a.size(1), a_scales.data_ptr(), - b_scales.data_ptr(), a_scales.numel(), b_scales.numel()); - } else { - // Compute C_inter=s_a * s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), - a.size(1), a_scales.data_ptr(), b_scales.data_ptr(), - a_scales.numel(), b_scales.numel()); - } - - // Compute C=C_inter - s_a * s_b * azp_adj - if (b_scales.numel() != 1) { - // Per-Channel - static_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - *a_scales.data_ptr(), b_scales.data_ptr(), - azp_adj.data_ptr(), a.size(0), b.size(1)); - } else { - // Per-Tensor - static_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - *a_scales.data_ptr(), b_scales.data_ptr(), - azp_adj.data_ptr(), a.size(0), b.size(1)); - } - } - }); -} - -// static-per-tensor quantization. -void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] - const torch::Tensor& input, // [..., hidden_size] - const torch::Tensor& scale, - std::optional const& azp) { - CPU_KERNEL_GUARD_IN(static_scaled_int8_quant) - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(out.is_contiguous()); - TORCH_CHECK(scale.numel() == 1); - TORCH_CHECK(!azp.has_value() || azp->numel() == 1); - - const int hidden_size = input.size(-1); - const int num_tokens = input.numel() / hidden_size; - VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), "static_scaled_int8_quant_impl", [&] { - if (azp.has_value()) { - static_scaled_int8_quant_impl( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), azp->data_ptr(), num_tokens, - hidden_size); - } else { - static_scaled_int8_quant_impl( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), nullptr, num_tokens, hidden_size); - } - }); -} - -// dynamic-per-token quantization. 
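For readers following the vectorized kernels above, the per-token scale/zero-point computation performed by the dynamic quantization path can be summarized in PyTorch. This is a simplified reference sketch only; the kernel's exact rounding in the final int8 conversion may differ slightly:

```python
import torch

def dynamic_int8_quant(x: torch.Tensor, asymmetric: bool = False):
    """Per-token (last-dim) int8 quantization, mirroring the kernel's math."""
    x = x.float()
    if asymmetric:
        x_max = x.amax(dim=-1, keepdim=True)
        x_min = x.amin(dim=-1, keepdim=True)
        scale = (x_max - x_min) / 255.0
        azp = torch.round(-128.0 - x_min / scale).to(torch.int32)
        q = torch.clamp(torch.round(x / scale + azp), -128, 127).to(torch.int8)
        return q, scale, azp
    # Symmetric: scale from the per-token absolute maximum.
    scale = x.abs().amax(dim=-1, keepdim=True) / 127.0
    q = torch.clamp(torch.round(x / scale), -128, 127).to(torch.int8)
    return q, scale, None
```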
-void dynamic_scaled_int8_quant( - torch::Tensor& out, // [..., hidden_size] - const torch::Tensor& input, // [..., hidden_size] - torch::Tensor& scale, // [..., 1] - std::optional const& azp) { - CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant) - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(out.is_contiguous()); - - int const hidden_size = input.size(-1); - int const num_tokens = input.numel() / hidden_size; - VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] { - if (azp.has_value()) { - dynamic_scaled_int8_quant_impl( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), azp->data_ptr(), num_tokens, - hidden_size); - } else { - dynamic_scaled_int8_quant_impl( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), nullptr, num_tokens, hidden_size); - } - }); -} - -#if defined(__powerpc64__) -void int8_scaled_mm_ppc64le(torch::Tensor& c, // [M, OC], row-major - const torch::Tensor& a, // [M, IC], row-major - const torch::Tensor& b, // [IC, OC], column-major - const torch::Tensor& a_scales, - const torch::Tensor& b_scales, - const std::optional& bias // [OC] -) { - CPU_KERNEL_GUARD_IN(cutlass_scaled_mm) - // Checks for conformality - TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8, - "int8_scaled_mm_ppc64le only supports INT8 inputs."); - TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); - TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && - b.size(1) == c.size(1)); - // We dont need this - TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0)); - TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); - - // Check for strides and alignment - TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major - TORCH_CHECK(b.stride(0) == 1); // Column-major - TORCH_CHECK(c.stride(0) % 16 == 0 && - b.stride(1) % 16 == 0); // 16 Byte Alignment - TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); - - if (bias) { - TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() && - bias->dim() == 1); - } - VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm_ppc64le", [&] { - torch::Tensor tmp_fp32_out = torch::empty_like(c, ::at::ScalarType::Float); - // Compute C_inter=s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), - a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel()); - if (bias.has_value()) { - // Compute C=s_a * C_inter + bias - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), nullptr, nullptr, nullptr, - bias->data_ptr(), c.size(0), c.size(1)); - } else { - // Compute C=s_a * C_inter - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), nullptr, nullptr, nullptr, nullptr, - c.size(0), c.size(1)); - } - }); -} - -#endif diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index b20a05464842..c9f426bdf618 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -6,25 +6,20 @@ std::string init_cpu_threads_env(const std::string& cpu_ids); -void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a, - const torch::Tensor& b, const torch::Tensor& a_scales, - const torch::Tensor& b_scales, - const std::optional& bias); - -void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a, - const torch::Tensor& b, const torch::Tensor& a_scales, - const torch::Tensor& b_scales, - const torch::Tensor& azp_adj, - const std::optional& azp, - const 
std::optional& bias); - -#if defined(__powerpc64__) -void int8_scaled_mm_ppc64le(torch::Tensor& c, const torch::Tensor& a, - const torch::Tensor& b, - const torch::Tensor& a_scales, - const torch::Tensor& b_scales, - const std::optional& bias); -#endif +void release_dnnl_matmul_handler(int64_t handler); + +int64_t create_onednn_scaled_mm_handler(const torch::Tensor& b, + const torch::Tensor& b_scales, + at::ScalarType output_type, + bool dynamic_act_quant, bool use_azp, + int64_t primitive_cache_size); + +void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a, + const torch::Tensor& a_scales, + const std::optional& azp, + const std::optional& azp_adj, + const std::optional& bias, + int64_t handler); void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query, torch::Tensor& kv_cache, double scale, @@ -151,8 +146,25 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding); // Quantization -#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) +#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) || \ + defined(__powerpc64__) at::Tag stride_tag = at::Tag::needs_fixed_stride_order; + // Helper function to release oneDNN handlers + ops.def("release_dnnl_matmul_handler(int handler) -> ()", + &release_dnnl_matmul_handler); + + // Create oneDNN W8A8 handler + ops.def( + "create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType " + "output_type, bool dynamic_act_quant, bool use_azp, int " + "primitive_cache_size) -> int", + &create_onednn_scaled_mm_handler); + + // oneDNN scaled_mm for W8A8 with static per-tensor activation quantization + ops.def( + "onednn_scaled_mm(Tensor! c, Tensor a, Tensor a_scales, Tensor? azp, " + "Tensor? azp_adj, Tensor? bias, int handler) -> ()"); + ops.impl("onednn_scaled_mm", torch::kCPU, &onednn_scaled_mm); // Compute int8 quantized tensor for given scaling factor. ops.def( @@ -168,50 +180,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { {stride_tag}); ops.impl("dynamic_scaled_int8_quant", torch::kCPU, &dynamic_scaled_int8_quant); - // W8A8 GEMM, supporting symmetric per-tensor or per-row/column - // quantization. - ops.def( - "cutlass_scaled_mm(Tensor! out, Tensor a," - " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor? bias) -> ()", - {stride_tag}); - ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm); - // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column - // quantization. - ops.def( - "cutlass_scaled_mm_azp(Tensor! out, Tensor a," - " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor azp_adj," - " Tensor? azp, Tensor? bias) -> ()", - {stride_tag}); - ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp); -#elif defined(__powerpc64__) - // Compute int8 quantized tensor for given scaling factor. - ops.def( - "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale," - "Tensor? azp) -> ()"); - ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant); - - // Compute int8 quantized tensor and scaling factor - ops.def( - "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, " - "Tensor!? azp) -> ()"); - ops.impl("dynamic_scaled_int8_quant", torch::kCPU, - &dynamic_scaled_int8_quant); - // W8A8 GEMM, supporting symmetric quantization. - ops.def( - "cutlass_scaled_mm(Tensor! out, Tensor a," - " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor? 
bias) -> ()"); - ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm_ppc64le); - // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column - // quantization. - ops.def( - "cutlass_scaled_mm_azp(Tensor! out, Tensor a," - " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor azp_adj," - " Tensor? azp, Tensor? bias) -> ()"); - ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp); #endif // SHM CCL diff --git a/tests/kernels/test_onednn.py b/tests/kernels/test_onednn.py new file mode 100644 index 000000000000..17692384ac9a --- /dev/null +++ b/tests/kernels/test_onednn.py @@ -0,0 +1,144 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Integration tests for FlexAttention backend vs default backend""" + +from typing import Optional + +import pytest +import torch + +from tests.kernels.utils import to_int8 +from vllm import _custom_ops as ops +from vllm.platforms import current_platform + +if not current_platform.is_cpu(): + pytest.skip("skipping CPU-only tests", allow_module_level=True) + +NK_FACTORS = [ + (256, 128), + (4096, 4096), + (16384, 4096), + (1023, 491), + (1001, 15), +] +M_FACTORS = [ + (16, 1, 32, 128, 64), + (1, 17, 1, 31, 17), +] +CACHE_SIZES = [2] +DTYPE = [torch.bfloat16] + + +def rand_int8(shape: tuple, device: str = "cpu"): + return to_int8(torch.rand(shape, device=device) * 255 - 128) + + +def ref_int8_scaled_mm( + a: torch.Tensor, + b: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + azp: Optional[torch.Tensor], + bias: Optional[torch.Tensor], + output_type: torch.dtype, +): + if azp is not None: + a = a.to(dtype=torch.float32) - azp.to(dtype=torch.float32) + output = torch.mm((scale_a * a.to(dtype=torch.float32)), + (scale_b * b.to(dtype=torch.float32))) + if bias is not None: + output += bias.float() + + return output.to(dtype=output_type) + + +def onednn_int8_gemm_test_helper(primitive_cache_size: int, + m: int, + n: int, + k: int, + per_tensor_a_quant: bool, + per_tensor_b_quant: bool, + use_azp: bool, + use_bias: bool, + out_dtype: torch.dtype = torch.bfloat16, + device: str = "cpu"): + # Test for a oneDNN kernel with per-tensor / per-token activation + # quantization and per-tensor / per-output channel weight quantization. 
+ a = to_int8(torch.randn((m, k), device=device) * 5) + b = to_int8(torch.randn((n, k), device=device).t() * 5) + + a_scales_shape = (1, 1) if per_tensor_a_quant else (m, 1) + b_scales_shape = (1, 1) if per_tensor_b_quant else (1, n) + + scale_a = (torch.randn(a_scales_shape, device=device, dtype=torch.float32)) + scale_b = (torch.randn(b_scales_shape, device=device, dtype=torch.float32)) + + if use_azp: + azp = torch.rand(a_scales_shape, dtype=torch.float32) * 10 + 1.5 + azp = (azp / scale_a).round().to(dtype=torch.int32) + azp_adj = scale_b * b.sum(dim=0, keepdim=True, dtype=torch.float32) + else: + azp = None + azp_adj = None + + if use_bias: + bias = torch.rand((n, ), device=device, dtype=out_dtype) * 10 + else: + bias = None + + handler = ops.create_onednn_scaled_mm( + b, + scale_b, + out_dtype, + not per_tensor_a_quant, + use_azp, + primitive_cache_size, + ) + + out = torch.zeros((m, n), dtype=out_dtype) + ops.onednn_scaled_mm(handler, a, out, scale_a, azp, azp_adj, bias) + baseline = ref_int8_scaled_mm(a, b, scale_a, scale_b, azp, bias, out_dtype) + + torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0) + + if use_bias: + # To test runtime bias setting + out = torch.zeros((m, n), dtype=out_dtype) + ops.onednn_scaled_mm(handler, a, out, scale_a, azp, azp_adj, None) + baseline = ref_int8_scaled_mm(a, b, scale_a, scale_b, azp, None, + out_dtype) + + torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0) + + +@pytest.mark.parametrize("n,k", NK_FACTORS) +@pytest.mark.parametrize("m_list", M_FACTORS) +@pytest.mark.parametrize("per_tensor_a_scale", [True, False]) +@pytest.mark.parametrize("per_tensor_b_scale", [True, False]) +@pytest.mark.parametrize("use_bias", [True, False]) +@pytest.mark.parametrize("use_azp", [True, False]) +@pytest.mark.parametrize("output_type", DTYPE) +@pytest.mark.parametrize("primitive_cache_size", CACHE_SIZES) +def test_onednn_int8_scaled_gemm( + n: int, + k: int, + m_list: tuple[int], + per_tensor_a_scale: bool, + per_tensor_b_scale: bool, + use_bias: bool, + use_azp: bool, + output_type: torch.dtype, + primitive_cache_size: int, +): + for m in m_list: + onednn_int8_gemm_test_helper( + primitive_cache_size=primitive_cache_size, + m=m, + n=n, + k=k, + per_tensor_a_quant=per_tensor_a_scale, + per_tensor_b_quant=per_tensor_b_scale, + use_bias=use_bias, + use_azp=use_azp, + out_dtype=output_type, + ) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 59f2d7737f19..3081aff114fc 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1827,3 +1827,86 @@ def int8_scaled_mm_with_quant_fake( M = mat1.size(0) N = mat2.size(0) return torch.empty((M, N), dtype=out_dtype) + + +class CPUDNNLGEMMHandler: + + def __init__(self) -> None: + self.handler: Optional[int] = None + self.n = -1 + self.k = -1 + + def __del__(self): + if self.handler is not None: + torch.ops._C.release_dnnl_matmul_handler(self.handler) + + +def create_onednn_scaled_mm( + weight: torch.Tensor, # [K, N] + weight_scales: torch.Tensor, + output_type: torch.dtype, + dynamic_quant: bool, + use_azp: bool, + primitive_cache_size: int = 128, +) -> CPUDNNLGEMMHandler: + handler = CPUDNNLGEMMHandler() + handler.k, handler.n = weight.size() + handler.handler = torch.ops._C.create_onednn_scaled_mm_handler( + weight, weight_scales, output_type, dynamic_quant, use_azp, + primitive_cache_size) + return handler + + +def onednn_scaled_int8_quant(input: torch.Tensor, + scale: Optional[torch.Tensor] = None, + azp: Optional[torch.Tensor] = None, + symmetric: bool = True): + """ + Quantize 
the input tensor to int8 and return the quantized tensor and scale, and maybe azp. + + Args: + input: The input tensor to be quantized to int8. + scale: Optional scaling factor for the int8 quantization. + When not provided, we invoke dynamic-per-token quantization. + azp: Optional zero-point for the int8 quantization. + Must be provided for asymmetric quantization if `scale` is provided. + symmetric: Whether to use symmetric quantization (scale only, azp ignored). + + Returns: + tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : Output int8 tensor, scales, and optionally azp. + """ + output = torch.empty_like(input, dtype=torch.int8) + token_num = input.numel() // input.shape[-1] + input = input.view((token_num, input.shape[-1])) + if scale is not None: + # static-per-tensor quantization. + assert symmetric == ( + azp + is None), "azp must only be provided for asymmetric quantization." + torch.ops._C.static_scaled_int8_quant(output, input, scale, azp) + return output, scale, azp + + # dynamic-per-token quantization. + input_scales = torch.empty((token_num, 1), + device=input.device, + dtype=torch.float32) + input_azp = None if symmetric else torch.empty_like(input_scales, + dtype=torch.int32) + torch.ops._C.dynamic_scaled_int8_quant(output, input, input_scales, + input_azp) + return output, input_scales, input_azp + + +def onednn_scaled_mm( + dnnl_handler: CPUDNNLGEMMHandler, + x: torch.Tensor, + output: torch.Tensor, + input_scale: Optional[torch.Tensor], + input_zp: Optional[torch.Tensor], + input_zp_adj: Optional[torch.Tensor], + bias: Optional[torch.Tensor], +) -> torch.Tensor: + torch.ops._C.onednn_scaled_mm(output, x, input_scale, input_zp, + input_zp_adj, bias, dnnl_handler.handler) + + return output diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index b16c21b7013a..fcc6987d26bb 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -360,10 +360,15 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: elif current_platform.is_cpu(): if current_platform.get_cpu_architecture() == CpuArchEnum.X86: from vllm.model_executor.layers.fused_moe import cpu_fused_moe - dtype = layer.w13_weight.dtype + from vllm.model_executor.layers.utils import ( + check_cpu_sgl_kernel) + dtype_w13 = layer.w13_weight.dtype + _, n_w13, k_w13 = layer.w13_weight.size() + dtype_w2 = layer.w2_weight.dtype + _, n_w2, k_w2 = layer.w2_weight.size() if (envs.VLLM_CPU_SGL_KERNEL - and torch._C._cpu._is_amx_tile_supported() - and dtype == torch.bfloat16): + and check_cpu_sgl_kernel(n_w13, k_w13, dtype_w13) + and check_cpu_sgl_kernel(n_w2, k_w2, dtype_w2)): packed_w13_weight = torch.ops._C.convert_weight_packed( layer.w13_weight) assert packed_w13_weight.size() == layer.w13_weight.size() diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 654e2ec7b2fa..9b1ab7af0ac8 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -199,11 +199,10 @@ def create_weights(self, layer: torch.nn.Module, def process_weights_after_loading(self, layer: torch.nn.Module) -> None: if current_platform.is_cpu() and envs.VLLM_CPU_SGL_KERNEL: + from vllm.model_executor.layers.utils import check_cpu_sgl_kernel N, K = layer.weight.size() dtype = layer.weight.dtype - if (torch._C._cpu._is_amx_tile_supported() - and dtype == torch.bfloat16 and N % 32 == 0 - and K % 32 == 0): + if check_cpu_sgl_kernel(N, K, dtype): 
packed_weight = torch.ops._C.convert_weight_packed( layer.weight) assert packed_weight.size() == layer.weight.size() @@ -215,7 +214,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: else: logger.warning( "CPU SGL kernels require Intel AMX support," - " bfloat16 weight, IC and OC are divisible by 32.") + " bf16/fp16/int8 weight, IC and OC are divisible by " + "32 and 16.") layer.use_cpu_sgl = False def apply(self, diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py index 18f5ce04fd35..2bc68ab3ebd1 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py @@ -6,6 +6,8 @@ from vllm.model_executor.layers.quantization.kernels.scaled_mm.aiter import ( AiterScaledMMLinearKernel) +from vllm.model_executor.layers.quantization.kernels.scaled_mm.cpu import ( + CPUScaledMMLinearKernel) from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import ( CutlassScaledMMLinearKernel) from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import ( # noqa: E501 @@ -18,7 +20,7 @@ # in priority/performance order (when available) _POSSIBLE_KERNELS: dict[PlatformEnum, list[type[ScaledMMLinearKernel]]] = { - PlatformEnum.CPU: [CutlassScaledMMLinearKernel], + PlatformEnum.CPU: [CPUScaledMMLinearKernel], PlatformEnum.CUDA: [CutlassScaledMMLinearKernel], PlatformEnum.ROCM: [AiterScaledMMLinearKernel, TritonScaledMMLinearKernel], PlatformEnum.TPU: [XLAScaledMMLinearKernel], diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py new file mode 100644 index 000000000000..59d2b5bce962 --- /dev/null +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py @@ -0,0 +1,206 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Optional + +import torch + +from vllm import _custom_ops as ops +from vllm import envs +from vllm.model_executor.layers.quantization.utils import replace_parameter +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + convert_to_channelwise) +from vllm.model_executor.layers.utils import check_cpu_sgl_kernel +from vllm.platforms import current_platform +from vllm.platforms.interface import CpuArchEnum + +from .ScaledMMLinearKernel import (ScaledMMLinearKernel, + ScaledMMLinearLayerConfig) + + +class CPUScaledMMLinearKernel(ScaledMMLinearKernel): + + @classmethod + def get_min_capability(cls) -> int: + return 75 + + @classmethod + def can_implement( + cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]: + if not current_platform.is_cpu(): + return False, "CPUScaledMM requires running on CPU." 
+ + return True, None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + weight = getattr(layer, self.w_q_name) + dtype = weight.dtype + N, K = weight.size() + if (current_platform.get_cpu_architecture() == CpuArchEnum.X86 + and envs.VLLM_CPU_SGL_KERNEL and self.config.input_symmetric + and check_cpu_sgl_kernel(N, K, dtype)): + self.linear_method = self._apply_weights_sgl + self.process_weights_for_sgl(layer) + else: + self.linear_method = self._apply_weights_onednn + self.process_weights_for_onednn(layer) + + def process_weights_for_onednn(self, layer: torch.nn.Module) -> None: + # WEIGHT + # Transpose to [K, N] for convenience + weight = getattr(layer, self.w_q_name) + replace_parameter( + layer, self.w_q_name, + torch.nn.Parameter(weight.t().data, requires_grad=False)) + + # WEIGHT SCALE + # oneDNN kernels support only per-tensor and per-channel. + # If we have a fused module (QKV, MLP) with per tensor scales (thus N + # scales being passed to the kernel), convert to the per-channel case. + is_fused_module = len(layer.logical_widths) > 1 + weight_scale = getattr(layer, self.w_s_name) + if is_fused_module and not self.config.is_channelwise: + weight_scale = convert_to_channelwise(weight_scale, + layer.logical_widths) + replace_parameter( + layer, self.w_s_name, + torch.nn.Parameter(weight_scale.data, requires_grad=False)) + + # INPUT SCALE + if self.config.is_static_input_scheme: + input_scale = getattr(layer, self.i_s_name) + + if self.config.input_symmetric: + replace_parameter( + layer, self.i_s_name, + torch.nn.Parameter(input_scale.max(), requires_grad=False)) + setattr(layer, self.i_zp_name, None) + else: + input_zero_point = getattr(layer, self.i_zp_name) + + # reconstruct the ranges + int8_traits = torch.iinfo(torch.int8) + azps = input_zero_point.to(dtype=torch.int32) + range_max = (input_scale * (int8_traits.max - azps)).max() + range_min = (input_scale * (int8_traits.min - azps)).min() + + scale = (range_max - range_min) / (int8_traits.max - + int8_traits.min) + replace_parameter( + layer, self.i_s_name, + torch.nn.Parameter(scale, requires_grad=False)) + + azp = (int8_traits.min - + range_min / scale).round().to(dtype=torch.int32) + replace_parameter(layer, self.i_zp_name, + torch.nn.Parameter(azp, requires_grad=False)) + + else: + setattr(layer, self.i_s_name, None) + setattr(layer, self.i_zp_name, None) + + # Different from cutlass, oneDNN kernels only need the AZP adjustment + # term for dynamic quantization. And s_b should be folded into the + # term. 
Such as: + # s_a * s_b * [(A - zp_a)B] + bias = + # s_a * (s_b * AB) - s_a * s_b * zp_a * B + bias = + # s_a * GEMM_output - s_a * zp_a * adj + bias + if not (self.config.input_symmetric + and self.config.is_static_input_scheme): + weight = getattr(layer, self.w_q_name) + weight_scale = getattr(layer, self.w_s_name) + azp_adj = weight.sum(dim=0, keepdim=True, dtype=torch.float32) + azp_adj = azp_adj * weight_scale.squeeze() + setattr(layer, self.azp_adj_name, + torch.nn.Parameter(azp_adj, requires_grad=False)) + else: + setattr(layer, self.azp_adj_name, None) + + weight = getattr(layer, self.w_q_name) + self.dnnl_handler = ops.create_onednn_scaled_mm( + weight, + getattr(layer, self.w_s_name), + torch.get_default_dtype(), + getattr(layer, self.i_s_name) is None, + not self.config.input_symmetric, + 32, + ) + # weight is prepacked and maintained by the dnnl_handler, + # release the original weight + setattr(layer, self.w_q_name, None) + del weight + + def process_weights_for_sgl(self, layer: torch.nn.Module) -> None: + # WEIGHT + weight = getattr(layer, self.w_q_name) + packed_weight = torch.ops._C.convert_weight_packed(weight) + replace_parameter( + layer, self.w_q_name, + torch.nn.Parameter(packed_weight, requires_grad=False)) + + if layer.bias is not None: + bias = layer.bias + layer.register_parameter( + "bias_fp32", + torch.nn.Parameter(bias.float().data, requires_grad=False)) + + # WEIGHT SCALE + # CPU SGL kernels only support per-channel. + # For per-tensor quant, convert to the per-channel case. + weight_scale = getattr(layer, self.w_s_name) + if not self.config.is_channelwise: + weight_scale = convert_to_channelwise(weight_scale, + layer.logical_widths) + replace_parameter( + layer, self.w_s_name, + torch.nn.Parameter(weight_scale.data, requires_grad=False)) + + setattr(layer, self.i_s_name, None) + setattr(layer, self.i_zp_name, None) + setattr(layer, self.azp_adj_name, None) + + def apply_weights(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + return self.linear_method( + layer, + x, + bias, + ) + + def _apply_weights_onednn( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer) + + # ops.scaled_int8_quant supports both dynamic and static quant: + # * dynamic, i_s is None and x_s computed from x. + # * static, i_s is scalar and x_s is i_s. 
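The AZP-adjustment identity spelled out in the comment inside process_weights_for_onednn above ((A - zp_a) @ B = A @ B - zp_a * B.sum(dim=0), with s_b folded into the adjustment) is easy to sanity-check numerically. A small PyTorch sketch with illustrative shapes:

```python
import torch

M, K, N = 3, 5, 7
A = torch.randn(M, K)
B = torch.randn(K, N)
zp_a = torch.randint(-10, 10, (M, 1)).float()   # per-token zero points
s_a = torch.rand(M, 1)                          # per-token activation scales
s_b = torch.rand(1, N)                          # per-channel weight scales
bias = torch.randn(N)

lhs = s_a * s_b * ((A - zp_a) @ B) + bias

adj = s_b * B.sum(dim=0, keepdim=True)          # s_b folded into the adjustment
gemm_output = s_b * (A @ B)
rhs = s_a * gemm_output - s_a * zp_a * adj + bias

torch.testing.assert_close(lhs, rhs)
```

This matches the azp_adj computed above as weight.sum(dim=0, keepdim=True) * weight_scale once the weight has been transposed to [K, N].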
+ x_q, x_s, x_zp = ops.onednn_scaled_int8_quant( + x, i_s, i_zp, self.config.input_symmetric) + + m = x.size(0) + n = self.dnnl_handler.n + out = torch.empty((m, n), dtype=x.dtype) + ops.onednn_scaled_mm(self.dnnl_handler, x_q, out, x_s, x_zp, azp_adj, + bias) + + return out + + def _apply_weights_sgl( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + w_q, w_s, _, _, _ = self._get_weight_params(layer) + return torch.ops._C.int8_scaled_mm_with_quant( + x, + w_q, + w_s, + layer.bias_fp32 if bias is not None else None, + x.dtype, + True, + ) diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py index 6ddd4a9ec423..2f982f96b0d0 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py @@ -25,8 +25,8 @@ def get_min_capability(cls) -> int: def can_implement( cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]: - if (not current_platform.is_cuda() and not current_platform.is_cpu()): - return False, "CutlassScaledMM requires running on CUDA or CPU." + if not current_platform.is_cuda(): + return False, "CutlassScaledMM requires running on CUDA." return True, None diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index 48a347a8f561..2897f75b3129 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -142,6 +142,12 @@ def rocm_unquantized_gemm(layer: torch.nn.Module, ) +def check_cpu_sgl_kernel(n: int, k: int, dtype: torch.dtype): + return (torch._C._cpu._is_amx_tile_supported() + and (dtype in (torch.bfloat16, torch.int8)) and k % 32 == 0 + and n % 16 == 0) + + def cpu_unquantized_gemm(layer: torch.nn.Module, x: torch.Tensor, weight: torch.Tensor, From 03825212f4e22047bf456c3a4d959c32c66d3319 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 21 Aug 2025 11:05:20 +0800 Subject: [PATCH 207/231] [CI/Build] Split out mm processor tests (#23260) Signed-off-by: DarkLight1337 Signed-off-by: Duncan Moss --- .buildkite/test-pipeline.yaml | 15 +++++++++++---- .../{ => processing}/test_tensor_schema.py | 7 +++---- vllm/model_executor/models/cohere2_vision.py | 2 ++ 3 files changed, 16 insertions(+), 8 deletions(-) rename tests/models/multimodal/{ => processing}/test_tensor_schema.py (98%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 745420664010..5869ae21d5c7 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -545,6 +545,15 @@ steps: commands: - pytest -v -s models/language/pooling -m 'not core_model' +- label: Multi-Modal Processor Test + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py + - pytest -v -s models/multimodal/processing/test_tensor_schema.py + - label: Multi-Modal Models Test (Standard) mirror_hardwares: [amdexperimental] torch_nightly: true @@ -554,9 +563,7 @@ steps: commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip freeze | grep -E 'torch' - - pytest -v -s models/multimodal/processing - - pytest -v -s --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/test_tensor_schema.py models/multimodal -m core_model - - pytest -v -s 
models/multimodal/test_tensor_schema.py -m core_model # Needs mp_method="spawn" + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - label: Multi-Modal Models Test (Extended) 1 @@ -567,7 +574,7 @@ steps: - tests/models/multimodal commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model' + - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing - label: Multi-Modal Models Test (Extended) 2 mirror_hardwares: [amdexperimental] diff --git a/tests/models/multimodal/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py similarity index 98% rename from tests/models/multimodal/test_tensor_schema.py rename to tests/models/multimodal/processing/test_tensor_schema.py index 143b4c8fc8c4..79164f02c339 100644 --- a/tests/models/multimodal/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -24,9 +24,9 @@ from vllm.v1.core.kv_cache_utils import get_kv_cache_config from vllm.v1.engine.core import EngineCore as V1EngineCore -from ...conftest import VllmRunner -from ..registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS -from ..utils import dummy_hf_overrides +from ....conftest import VllmRunner +from ...registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS +from ...utils import dummy_hf_overrides ARCH_TO_SKIP = { "MolmoForCausalLM": "incompatible requirements", @@ -147,7 +147,6 @@ def get_model_id_to_test( return filtered_results -@pytest.mark.core_model @pytest.mark.parametrize( "model_arch, model_id", get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys())) diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index fca1aee835b8..179cc2af8eb3 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -170,6 +170,8 @@ def get_num_patches( # The current implementation of get_number_of_image_patches # is incorrect, so we patch it here. + # TODO: Revert once + # https://github.com/huggingface/transformers/pull/40312 is released. 
# return image_processor.get_number_of_image_patches(image_height, # image_width, {}) From 93c5489eac237c55e125ecbd942d03bdfd16c655 Mon Sep 17 00:00:00 2001 From: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com> Date: Thu, 21 Aug 2025 06:08:51 +0300 Subject: [PATCH 208/231] [V1][Mamba1] - Full CUDA and Piecewise CUDA Graphs Support (#23035) Signed-off-by: asafg Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com> Co-authored-by: asafg Signed-off-by: Duncan Moss --- docs/usage/v1_guide.md | 2 +- .../models/language/generation/test_hybrid.py | 20 ++---- vllm/config/compilation.py | 1 + .../layers/mamba/mamba_mixer.py | 66 ++++++++++++++++--- vllm/model_executor/models/jamba.py | 8 ++- vllm/model_executor/models/mamba.py | 7 +- vllm/v1/attention/backends/mamba1_attn.py | 37 +++++------ vllm/v1/attention/backends/mamba2_attn.py | 45 ++----------- vllm/v1/attention/backends/mamba_attn.py | 55 ++++++++++++++++ 9 files changed, 154 insertions(+), 87 deletions(-) create mode 100644 vllm/v1/attention/backends/mamba_attn.py diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 54af970ea842..9bf0c5842c6b 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -107,7 +107,7 @@ to enable simultaneous generation and embedding using the same engine instance i #### Mamba Models Models using selective state-space mechanisms instead of standard transformer attention are supported. -Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. Please note that these models currently require disabling prefix caching in V1. Additionally, Mamba-1 models require `enforce_eager=True`. +Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. Please note that these models currently require disabling prefix caching in V1. Models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`, `Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). 
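As an illustration of what this change enables, launching one of these hybrid models under V1 no longer needs enforce_eager. The settings below mirror tests/models/language/generation/test_hybrid.py and are examples rather than recommendations; prefix caching still has to be disabled:

```python
from vllm import LLM, SamplingParams

llm = LLM(
    model="ai21labs/Jamba-tiny-dev",
    enable_prefix_caching=False,  # still required for Mamba layers under V1
    max_num_seqs=4,               # kept small in the tests to avoid OOM
)
outputs = llm.generate(["The capital of France is"],
                       SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```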
Please note that diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index aee0a50336c0..f8c0eaa8cf3a 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -54,16 +54,14 @@ "tiiuae/Falcon-H1-0.5B-Base", ] -# Avoid OOM -MAX_NUM_SEQS = 4 - -# Once we add support for FCG in Mamba1, this list will be removed and tests -# all test cases will use enforce_eager=False -ENFORCE_EAGER_MODELS_V1 = [ - "state-spaces/mamba-130m-hf", +FULL_CUDA_GRAPH_MODELS = [ "ai21labs/Jamba-tiny-dev", + "Zyphra/Zamba2-1.2B-instruct", ] +# Avoid OOM +MAX_NUM_SEQS = 4 + @pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS) @pytest.mark.parametrize("max_tokens", [64]) @@ -101,19 +99,13 @@ def test_models( example_prompts, max_tokens, num_logprobs) if model in V1_SUPPORTED_MODELS: - enforce_eager = False with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") if model in HYBRID_MODELS: # required due to reorder_batch behaviour m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") - - if model in ENFORCE_EAGER_MODELS_V1: - enforce_eager = True - with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS, - enforce_eager=enforce_eager, enable_prefix_caching=False) as vllm_model: vllm_v1_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) @@ -373,7 +365,7 @@ def test_distributed_correctness( ) -@pytest.mark.parametrize("model", ["Zyphra/Zamba2-1.2B-instruct"]) +@pytest.mark.parametrize("model", FULL_CUDA_GRAPH_MODELS) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) def test_full_cuda_graph( diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 56a2183f8e2c..c654485f4fe9 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -336,6 +336,7 @@ class CompilationConfig: "vllm.unified_attention", "vllm.unified_attention_with_output", "vllm.mamba_mixer2", + "vllm.mamba_mixer", ] def compute_hash(self) -> str: diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index 3c7322260df4..a24e72778b34 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -27,6 +27,8 @@ selective_scan_fn, selective_state_update) from vllm.model_executor.models.mamba_cache import MambaCacheParams from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform +from vllm.utils import direct_register_custom_op from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionMetadata @@ -183,22 +185,26 @@ def _ssm_transform( def forward(self, hidden_states: torch.Tensor, + output: torch.Tensor, mamba_cache_params: Optional[MambaCacheParams] = None): if not envs.VLLM_USE_V1: - return CustomOp.forward(self, hidden_states, mamba_cache_params) + CustomOp.forward(self, hidden_states, output, mamba_cache_params) else: - return self.forward_cuda( + torch.ops.vllm.mamba_mixer( hidden_states, - mamba_cache_params, + output, + self.prefix, ) def forward_native(self, hidden_states: torch.Tensor, + output: torch.Tensor, mamba_cache_params: Optional[MambaCacheParams] = None): pass def forward_cuda(self, hidden_states: torch.Tensor, + output: torch.Tensor, mamba_cache_params: Optional[MambaCacheParams] = None): """ Run the Mamba-1 SSM pipeline. 
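The new forward signature takes a caller-provided output buffer because, under V1, the mixer is dispatched through the torch.ops.vllm.mamba_mixer custom op registered further down with mutates_args=["output"] and added to CompilationConfig.splitting_ops, so piecewise CUDA graphs can treat it as an opaque boundary. A minimal sketch of the calling convention (mirroring the jamba.py and mamba.py hunks below):

```python
import torch

def call_mixer(mixer, hidden_states: torch.Tensor) -> torch.Tensor:
    # The mixer writes into a preallocated buffer instead of returning a new
    # tensor; under V1 this dispatches to torch.ops.vllm.mamba_mixer.
    output = torch.empty_like(hidden_states)
    mixer(hidden_states, output)
    return output
```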
@@ -237,6 +243,7 @@ def forward_cuda(self, conv_state = self_kv_cache[0].transpose(-1, -2) ssm_state = self_kv_cache[1] has_initial_states = mamba1_metadata.has_initial_states + num_padded_decodes = mamba1_metadata.num_padded_decodes else: assert isinstance(attn_metadata, AttentionMetadata) assert mamba_cache_params is not None @@ -248,6 +255,7 @@ def forward_cuda(self, has_initial_states = None if context_lens_tensor is not None: has_initial_states = context_lens_tensor > 0 + num_padded_decodes = attn_metadata.num_decode_tokens # 1. Gated MLP's linear projection projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1) @@ -267,6 +275,7 @@ def forward_cuda(self, num_decodes = attn_metadata.num_decode_tokens # token count (=request) has_prefill = num_prefill_tokens > 0 has_decode = num_decode_tokens > 0 + num_actual_tokens = num_prefill_tokens + num_decode_tokens prefill_decode_split = split_batch_to_prefill_and_decode( hidden_states_BC, @@ -278,6 +287,7 @@ def forward_cuda(self, num_decode_tokens, num_prefills, num_decodes, + num_padded_decodes, ) hidden_states_BC_p = prefill_decode_split.hidden_states_BC_p hidden_states_BC_d = prefill_decode_split.hidden_states_BC_d @@ -371,7 +381,7 @@ def forward_cuda(self, else: out = self.out_proj(scan_outputs_combined.transpose(-2, -1))[0] - return out + output[:num_actual_tokens] = out def get_state_dtype(self) -> tuple[torch.dtype]: assert self.model_config is not None @@ -421,18 +431,27 @@ def split_batch_to_prefill_and_decode( num_decode_tokens: int, num_prefills: int, num_decodes: int, + num_padded_decodes: int, ) -> PrefillDecodeSplit: + num_actual_tokens = num_prefill_tokens + num_padded_decodes + if envs.VLLM_USE_V1: # In v1, decode tokens come first, then prefill tokens. hidden_states_BC_d, hidden_states_BC_p = torch.split( - hidden_states_BC, [num_decode_tokens, num_prefill_tokens], dim=-1) - gate_d, gate_p = torch.split(gate, - [num_decode_tokens, num_prefill_tokens], + hidden_states_BC[..., :num_actual_tokens], + [num_padded_decodes, num_prefill_tokens], + dim=-1) + gate_d, gate_p = torch.split(gate[..., :num_actual_tokens], + [num_padded_decodes, num_prefill_tokens], dim=-1) + + # num_padded_decodes accounts for CUDA graph padding when applicable state_indices_tensor_d, state_indices_tensor_p = torch.split( - state_indices_tensor, [num_decodes, num_prefills], dim=0) + state_indices_tensor[:num_padded_decodes + num_prefills], + [num_padded_decodes, num_prefills], + dim=0) query_start_loc_p = (query_start_loc[-num_prefills - 1:] - - num_decodes if num_prefills > 0 else None) + num_padded_decodes if num_prefills > 0 else None) has_initial_states_p = has_initial_states[-num_prefills:] if ( has_initial_states is not None and num_prefills > 0) else None else: @@ -459,3 +478,32 @@ def split_batch_to_prefill_and_decode( query_start_loc_p=query_start_loc_p, has_initial_states_p=has_initial_states_p, ) + + +def mamba_mixer( + hidden_states: torch.Tensor, + output: torch.Tensor, + layer_name: str, +) -> None: + forward_context: ForwardContext = get_forward_context() + self = forward_context.no_compile_layers[layer_name] + self.forward_cuda(hidden_states=hidden_states, + output=output, + mamba_cache_params=None) + + +def mamba_mixer_fake( + hidden_states: torch.Tensor, + output: torch.Tensor, + layer_name: str, +) -> None: + return + + +direct_register_custom_op( + op_name="mamba_mixer", + op_func=mamba_mixer, + mutates_args=["output"], + fake_impl=mamba_mixer_fake, + dispatch_key=current_platform.dispatch_key, +) diff --git 
a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 0b32d6f25659..3c1a0b68df56 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -10,6 +10,7 @@ from vllm import envs from vllm.attention.layer import Attention +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group @@ -154,10 +155,10 @@ def forward( hidden_states, residual = self.input_layernorm( hidden_states, residual) - hidden_states = self.mamba(hidden_states, mamba_cache_params) + output = torch.empty_like(hidden_states) + self.mamba(hidden_states, output, mamba_cache_params) # Fully Connected - hidden_states, residual = self.pre_ff_layernorm( - hidden_states, residual) + hidden_states, residual = self.pre_ff_layernorm(output, residual) hidden_states = self.feed_forward(hidden_states) return hidden_states, residual @@ -278,6 +279,7 @@ def forward( } +@support_torch_compile class JambaModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index f4aaf0c6f467..f02499a4f96b 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -9,6 +9,7 @@ from transformers import MambaConfig from vllm import envs +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed.parallel_state import get_pp_group from vllm.model_executor.layers.layernorm import RMSNorm @@ -81,10 +82,12 @@ def forward( else: hidden_states, residual = self.norm(hidden_states, residual) - hidden_states = self.mixer(hidden_states, mamba_cache_params) - return hidden_states, residual + output = torch.empty_like(hidden_states) + self.mixer(hidden_states, output, mamba_cache_params) + return output, residual +@support_torch_compile class MambaModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/v1/attention/backends/mamba1_attn.py b/vllm/v1/attention/backends/mamba1_attn.py index 6cdc509083ae..97a1aa86dda0 100644 --- a/vllm/v1/attention/backends/mamba1_attn.py +++ b/vllm/v1/attention/backends/mamba1_attn.py @@ -2,16 +2,16 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import ClassVar, Optional +from typing import Optional import torch from vllm.attention.backends.abstract import AttentionBackend -from vllm.config import VllmConfig -from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, - CommonAttentionMetadata, +from vllm.attention.backends.utils import PAD_SLOT_ID +from vllm.v1.attention.backends.mamba_attn import ( + BaseMambaAttentionMetadataBuilder) +from vllm.v1.attention.backends.utils import (CommonAttentionMetadata, split_decodes_and_prefills) -from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec class Mamba1AttentionBackend(AttentionBackend): @@ -31,24 +31,11 @@ class Mamba1AttentionMetadata: num_prefill_tokens: int num_decodes: int num_decode_tokens: int + num_padded_decodes: int class Mamba1AttentionMetadataBuilder( - AttentionMetadataBuilder[Mamba1AttentionMetadata]): - reorder_batch_threshold: ClassVar[int] = 1 - - def __init__( - self, - kv_cache_spec: AttentionSpec, - vllm_config: VllmConfig, - device: torch.device, - 
layer_names: list[str], - ): - assert isinstance(kv_cache_spec, MambaSpec) - self.kv_cache_spec = kv_cache_spec - self.device = device - self.vllm_config = vllm_config - self.layer_names = layer_names + BaseMambaAttentionMetadataBuilder[Mamba1AttentionMetadata]): def build( self, @@ -67,9 +54,18 @@ def build( decode_threshold=1)) has_initial_states = None + padded_decodes = num_decodes if num_prefills > 0: has_initial_states = context_lens_tensor > 0 + elif (num_decodes > 0 and num_decodes <= self.decode_cudagraph_max_bs + and self.compilation_config.full_cuda_graph): + state_indices_for_decode = state_indices_tensor[:num_decodes] + padded_decodes = self.vllm_config.pad_for_cudagraph(num_decodes) + self.state_indices_tensor[:num_decodes].copy_( + state_indices_for_decode, non_blocking=True) + state_indices_tensor = self.state_indices_tensor[:padded_decodes] + state_indices_tensor[num_decodes:] = PAD_SLOT_ID return Mamba1AttentionMetadata( query_start_loc=query_start_loc, @@ -80,4 +76,5 @@ def build( num_prefill_tokens=num_prefill_tokens, num_decodes=num_decodes, num_decode_tokens=num_decode_tokens, + num_padded_decodes=padded_decodes, ) diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py index ace078e2b27c..ed30884fdbc9 100644 --- a/vllm/v1/attention/backends/mamba2_attn.py +++ b/vllm/v1/attention/backends/mamba2_attn.py @@ -2,18 +2,18 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from dataclasses import dataclass -from typing import ClassVar, Optional +from typing import Optional import torch from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.config import VllmConfig -from vllm.v1.attention.backends.utils import (AttentionCGSupport, - AttentionMetadataBuilder, - CommonAttentionMetadata, +from vllm.v1.attention.backends.mamba_attn import ( + BaseMambaAttentionMetadataBuilder) +from vllm.v1.attention.backends.utils import (CommonAttentionMetadata, split_decodes_and_prefills) -from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec +from vllm.v1.kv_cache_interface import AttentionSpec def _query_start_loc_to_chunk_indices_offsets(query_start_loc: torch.Tensor, @@ -88,29 +88,14 @@ class Mamba2AttentionMetadata: class Mamba2AttentionMetadataBuilder( - AttentionMetadataBuilder[Mamba2AttentionMetadata]): - cudagraph_support: ClassVar[AttentionCGSupport] = \ - AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE - - reorder_batch_threshold: ClassVar[int] = 1 + BaseMambaAttentionMetadataBuilder[Mamba2AttentionMetadata]): def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): - assert isinstance(kv_cache_spec, MambaSpec) - self.kv_cache_spec = kv_cache_spec + super().__init__(kv_cache_spec, layer_names, vllm_config, device) self.chunk_size = vllm_config.model_config.get_mamba_chunk_size() - self.vllm_config = vllm_config - self.compilation_config = vllm_config.compilation_config assert self.chunk_size is not None, ( "chunk_size needs to be set in the model config for Mamba2 models") - self.decode_cudagraph_max_bs = min( - self.vllm_config.scheduler_config.max_num_seqs, - self.compilation_config.max_capture_size) - self.state_indices_tensor = torch.empty( - (self.decode_cudagraph_max_bs, ), - dtype=torch.int32, - device=device, - ) def build(self, common_prefix_len: int, @@ -187,19 +172,3 @@ def build(self, state_indices_tensor=state_indices_tensor, ) return attn_metadata - - 
def build_for_cudagraph_capture( - self, common_attn_metadata: CommonAttentionMetadata): - """ - This method builds the metadata for full cudagraph capture. - Currently, only decode is supported for full cudagraphs with Mamba. - """ - m = common_attn_metadata - - assert m.num_reqs == m.num_actual_tokens, \ - "Mamba only supports decode-only full CUDAGraph capture. " \ - "Make sure all cudagraph capture sizes <= max_num_seq." - - m.max_query_len = 1 # decode-only - - return self.build(0, m) diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py new file mode 100644 index 000000000000..07ef7cb69a16 --- /dev/null +++ b/vllm/v1/attention/backends/mamba_attn.py @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import abc +from typing import ClassVar, TypeVar + +import torch + +from vllm.config import VllmConfig +from vllm.v1.attention.backends.utils import (AttentionCGSupport, + AttentionMetadataBuilder, + CommonAttentionMetadata) +from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec + +M = TypeVar("M") + + +class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC): + reorder_batch_threshold: ClassVar[int] = 1 + cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE + + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], + vllm_config: VllmConfig, device: torch.device): + assert isinstance(kv_cache_spec, MambaSpec) + self.kv_cache_spec = kv_cache_spec + self.device = device + self.vllm_config = vllm_config + self.layer_names = layer_names + + self.compilation_config = vllm_config.compilation_config + self.decode_cudagraph_max_bs = min( + self.vllm_config.scheduler_config.max_num_seqs, + self.compilation_config.max_capture_size) + self.state_indices_tensor = torch.empty( + (self.decode_cudagraph_max_bs, ), + dtype=torch.int32, + device=device, + ) + + def build_for_cudagraph_capture( + self, common_attn_metadata: CommonAttentionMetadata) -> M: + """ + This method builds the metadata for full cudagraph capture. + Currently, only decode is supported for full cudagraphs with Mamba. + """ + m = common_attn_metadata + + assert m.num_reqs == m.num_actual_tokens, \ + "Mamba only supports decode-only full CUDAGraph capture. " \ + "Make sure all cudagraph capture sizes <= max_num_seq." + + m.max_query_len = 1 # decode-only + + return self.build(0, m) \ No newline at end of file From 83fb982964646164e6acfcc7579bd6849e4d0a30 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 20 Aug 2025 23:09:39 -0400 Subject: [PATCH 209/231] [Compile] Fix Compile Warning SM100 Cutlass MLA (#23287) Signed-off-by: yewentao256 Signed-off-by: Duncan Moss --- csrc/attention/mla/sm100_cutlass_mla_kernel.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu index e0e95d06290d..6dd6f269f3dc 100644 --- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu +++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu @@ -167,7 +167,7 @@ typename T::Fmha::Arguments args_from_options( // TODO(trevor-m): Change split_kv back to -1 when // https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will // perform worse with larger context length and smaller batch sizes. 
- num_kv_splits, // split_kv + static_cast(num_kv_splits), // split_kv nullptr, // is_var_split_kv }; // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute @@ -264,7 +264,7 @@ int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_ba // Assumes device 0 when getting sm_count. arguments.hw_info.sm_count = sm_count <= 0 ? cutlass::KernelHardwareInfo::query_device_multiprocessor_count(/*device_id=*/0) : sm_count; - arguments.split_kv = num_kv_splits; + arguments.split_kv = static_cast(num_kv_splits); MlaSm100Type::Fmha::set_split_kv(arguments); return MlaSm100Type::Fmha::get_workspace_size(arguments); From 048330f8cdb9fe4247b746342c0aa2d8b63356b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=A8=E5=A5=87=28yann=20qi=29?= <51905299+yannqi@users.noreply.github.com> Date: Thu, 21 Aug 2025 12:08:52 +0800 Subject: [PATCH 210/231] [Model][VLM] Support R-4B Model (#23246) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: yannqi Signed-off-by: 杨奇(yann qi) <51905299+yannqi@users.noreply.github.com> Signed-off-by: Cyrus Leung Co-authored-by: yannqiyang Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung Signed-off-by: Duncan Moss --- docs/models/supported_models.md | 1 + examples/offline_inference/vision_language.py | 23 ++++ .../vision_language_multi_image.py | 34 ++++++ .../multimodal/processing/test_common.py | 1 + tests/models/registry.py | 2 + vllm/model_executor/models/registry.py | 1 + vllm/model_executor/models/rvl.py | 103 ++++++++++++++++++ 7 files changed, 165 insertions(+) create mode 100644 vllm/model_executor/models/rvl.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 7308d0010690..831bfb1e939e 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -652,6 +652,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + IE+ + VE+ | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + IE+ + VE+ | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-7B` | | ✅︎ | ✅︎ | +| `RForConditionalGeneration` | R-VL-4B | T + IE+ | `YannQi/R-4B` | | ✅︎ | ✅︎ | | `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ | | `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ | | `Step3VLForConditionalGeneration` | Step3-VL | T + I+ | `stepfun-ai/step3` | | ✅︎ | ✅︎ | diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 88bbbfdfbd18..e7a7a30dd31a 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1436,6 +1436,28 @@ def run_qwen2_5_omni(questions: list[str], modality: str): ) +# R-4B +def run_r_vl(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + model_name = "YannQi/R-4B" + + prompts = [ + f"<|im_start|>user \n{question}<|im_end|><|im_start|>assistant\n" + for question in questions + ] + + engine_args = EngineArgs( + model=model_name, + max_model_len=16384, + limit_mm_per_prompt={modality: 1}, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # SkyworkR1V def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1622,6 +1644,7 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData: "qwen2_vl": run_qwen2_vl, "qwen2_5_vl": run_qwen2_5_vl, "qwen2_5_omni": run_qwen2_5_omni, + "rvl": run_r_vl, "skywork_chat": run_skyworkr1v, "smolvlm": run_smolvlm, "step3": run_step3, diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index eabd9453f3c5..d9242efa8547 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -992,6 +992,39 @@ def post_process_image(image: Image) -> Image: ) +def load_r_vl(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "YannQi/R-4B" + engine_args = EngineArgs( + model=model_name, + max_model_len=16384, + max_num_seqs=16, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + } + ] + + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) + + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" @@ -1193,6 +1226,7 @@ def load_glm4_5v_fp8(question: str, image_urls: list[str]) -> ModelRequestData: "qwen_vl_chat": load_qwen_vl_chat, "qwen2_vl": load_qwen2_vl, "qwen2_5_vl": load_qwen2_5_vl, + "rvl": load_r_vl, "smolvlm": load_smolvlm, "step3": load_step3, "tarsier": load_tarsier, diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 02aecfad8281..adc8b2510d67 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -316,6 +316,7 @@ def _test_processing_correctness_one( 
"Qwen/Qwen2.5-VL-3B-Instruct", "Qwen/Qwen2-Audio-7B-Instruct", "Qwen/Qwen2.5-Omni-3B", + "YannQi/R-4B", "Skywork/Skywork-R1V-38B", "HuggingFaceTB/SmolVLM2-2.2B-Instruct", "stepfun-ai/step3", diff --git a/tests/models/registry.py b/tests/models/registry.py index 6e6acfb8cd22..4f69f90b6aae 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -489,6 +489,8 @@ def check_available_online( max_model_len=4096), "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B"), "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"), # noqa: E501 + "RForConditionalGeneration": _HfExamplesInfo("YannQi/R-4B", + trust_remote_code=True), "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B", trust_remote_code=True), "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct", # noqa: E501 diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 78ef270598b8..39a3e425a46d 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -217,6 +217,7 @@ "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"), "SmolVLMForConditionalGeneration": ("smolvlm","SmolVLMForConditionalGeneration"), # noqa: E501 "KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"), + "RForConditionalGeneration": ("rvl", "RForConditionalGeneration"), "KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501 "Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"), "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"), diff --git a/vllm/model_executor/models/rvl.py b/vllm/model_executor/models/rvl.py new file mode 100644 index 000000000000..efdb01004663 --- /dev/null +++ b/vllm/model_executor/models/rvl.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Mapping + +import torch +import torch.nn as nn +from transformers.activations import GELUActivation + +from vllm.config import VllmConfig +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalDataDict + +from .llava_next import (LlavaDummyInputsBuilder, LlavaNextMultiModalProcessor, + LlavaNextProcessingInfo) +from .llava_onevision import LlavaOnevisionForConditionalGeneration +from .utils import WeightsMapper + + +class RVLProcessingInfo(LlavaNextProcessingInfo): + + def get_hf_config(self): + return self.ctx.get_hf_config() + + def get_hf_processor(self, **kwargs: object): + return self.ctx.get_hf_processor(**kwargs) + + +class RVLDummyInputsBuilder(LlavaDummyInputsBuilder[RVLProcessingInfo]): + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + image_token = "" + + return image_token * num_images + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + + target_width, target_height = ( + self.info.get_image_size_with_most_features()) + + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + } + + +class RVLMultiModalProjector(nn.Module): + + def __init__(self, config): + super().__init__() + self.pre_norm = nn.LayerNorm(config.vision_config.hidden_size, + eps=1e-06) + self.linear_1 = nn.Linear( + config.vision_config.hidden_size, + 
config.text_config.hidden_size, + bias=True, + ) + self.act = GELUActivation() + self.linear_2 = nn.Linear( + config.text_config.hidden_size, + config.text_config.hidden_size, + bias=True, + ) + + def forward(self, image_feature: torch.Tensor) -> torch.Tensor: + image_feature = self.pre_norm(image_feature) + hidden_states = self.linear_1(image_feature) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + + return hidden_states + + +@MULTIMODAL_REGISTRY.register_processor( + LlavaNextMultiModalProcessor, + info=RVLProcessingInfo, + dummy_inputs=RVLDummyInputsBuilder, +) +class RForConditionalGeneration(LlavaOnevisionForConditionalGeneration): + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + # mapping for new names in checkpoint saved after transformers + # v4.52 + "model.language_model.": "language_model.model.", + "model.vision_tower.": "vision_tower.", + "model.multi_modal_projector.": "multi_modal_projector.", + "model.image_newline": "image_newline", + "lm_head.": "language_model.lm_head.", + }) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__(vllm_config=vllm_config, prefix=prefix) + config = vllm_config.model_config.hf_config + self.multi_modal_projector = RVLMultiModalProjector(config) From 6ae6cf15199968183b67f645d0e401dd3659b646 Mon Sep 17 00:00:00 2001 From: QiliangCui Date: Thu, 21 Aug 2025 04:15:20 +0000 Subject: [PATCH 211/231] [CI] Delete images older than 24h. (#23291) Signed-off-by: Qiliang Cui Signed-off-by: Duncan Moss --- .buildkite/scripts/tpu/cleanup_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/scripts/tpu/cleanup_docker.sh b/.buildkite/scripts/tpu/cleanup_docker.sh index 209d9c4341cd..740d81fb39bb 100755 --- a/.buildkite/scripts/tpu/cleanup_docker.sh +++ b/.buildkite/scripts/tpu/cleanup_docker.sh @@ -17,7 +17,7 @@ if [ "$disk_usage" -gt "$threshold" ]; then # Remove dangling images (those that are not tagged and not used by any container) docker image prune -f # Remove unused volumes / force the system prune for old images as well. - docker volume prune -f && docker system prune --force --filter "until=72h" --all + docker volume prune -f && docker system prune --force --filter "until=24h" --all echo "Docker images and volumes cleanup completed." else echo "Disk usage is below $threshold%. No cleanup needed." 
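A minimal usage sketch for the R-4B support registered earlier in this series. Treat it as illustrative only: the image URL, question, and sampling settings are placeholders, and it assumes a vLLM build that already contains the `RForConditionalGeneration` registration; the model id and the `max_model_len`/`limit_mm_per_prompt` values mirror the `run_r_vl` example added above.

from vllm import LLM, SamplingParams

# Sketch only: model id and engine settings taken from the run_r_vl example in this series.
llm = LLM(
    model="YannQi/R-4B",
    max_model_len=16384,
    limit_mm_per_prompt={"image": 1},
)

# OpenAI-style chat message with a single image; the URL and question are placeholders.
messages = [{
    "role": "user",
    "content": [
        {"type": "image_url", "image_url": {"url": "https://example.com/sample.jpg"}},
        {"type": "text", "text": "Describe this image."},
    ],
}]

outputs = llm.chat(messages, SamplingParams(max_tokens=128))
print(outputs[0].outputs[0].text)
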
From 453d8987bdc8f974fe987f9a17e591dd396c3866 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 21 Aug 2025 00:21:05 -0400 Subject: [PATCH 212/231] [CI] Block the cu126 wheel build while broken (#23285) Signed-off-by: mgoin Signed-off-by: Duncan Moss --- .buildkite/release-pipeline.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index e20ce54ca795..f96c38bf57db 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -27,7 +27,12 @@ steps: env: DOCKER_BUILDKIT: "1" + - block: "Build CUDA 12.6 wheel" + key: block-build-cu126-wheel + depends_on: ~ + - label: "Build wheel - CUDA 12.6" + depends_on: block-build-cu126-wheel id: build-wheel-cuda-12-6 agents: queue: cpu_queue_postmerge From 583387641a78a9394641dc6df06a7b754850290f Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Wed, 20 Aug 2025 21:28:32 -0700 Subject: [PATCH 213/231] [Sampler] Support returning final logprobs (#22387) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> Co-authored-by: Nick Hill Co-authored-by: Woosuk Kwon Signed-off-by: Duncan Moss --- docs/usage/v1_guide.md | 7 ++- tests/v1/sample/test_logprobs.py | 10 +-- vllm/config/__init__.py | 30 +++++---- vllm/engine/arg_utils.py | 1 + vllm/v1/sample/ops/topk_topp_sampler.py | 65 +++++++++++--------- vllm/v1/sample/sampler.py | 81 +++++++++++++++++++------ vllm/v1/sample/tpu/sampler.py | 2 +- 7 files changed, 126 insertions(+), 70 deletions(-) diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 9bf0c5842c6b..b89768913681 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -154,12 +154,15 @@ differences compared to V0: ##### Logprobs Calculation -Logprobs in V1 are now returned immediately once computed from the model’s raw output (i.e. +By default, logprobs in V1 are now returned immediately once computed from the model’s raw output (i.e. before applying any logits post-processing such as temperature scaling or penalty adjustments). As a result, the returned logprobs do not reflect the final adjusted probabilities used during sampling. -Support for logprobs with post-sampling adjustments is in progress and will be added in future updates. +You can adjust this behavior by setting the `--logprobs-mode` flag. +Four modes are supported: `raw_logprobs` (default), `processed_logprobs`, `raw_logits`, `processed_logits`. +Raw means the values before applying any logit processors, like bad words. +Processed means the values after applying all processors, including temperature and top_k/top_p. ##### Prompt Logprobs with Prefix Caching diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 8bd142e87b06..e835c029634c 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -456,9 +456,7 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch): assert len(logprob) == vocab_size -@pytest.mark.parametrize( - "logprobs_mode", - ["raw_logprobs", "raw_logits", "processed_logprobs", "processed_logits"]) +@pytest.mark.parametrize("logprobs_mode", list(LogprobsMode)) def test_logprobs_mode(logprobs_mode: LogprobsMode, monkeypatch: pytest.MonkeyPatch): """Test with LLM engine with different logprobs_mode. 
@@ -487,12 +485,14 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode, for logprobs in output.logprobs: for token_id in logprobs: logprob = logprobs[token_id] - if "logprobs" in logprobs_mode: + if logprobs_mode in (LogprobsMode.RAW_LOGPROBS, + LogprobsMode.PROCESSED_LOGPROBS): assert logprob.logprob <= 0 if logprob.logprob > 0: positive_values = positive_values + 1 total_token_with_logprobs = total_token_with_logprobs + 1 assert total_token_with_logprobs >= len(results[0].outputs) - if "logits" in logprobs_mode: + if logprobs_mode in (LogprobsMode.RAW_LOGITS, + LogprobsMode.PROCESSED_LOGITS): assert positive_values > 0 del llm diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 959f111ced22..2973cb92d195 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -257,11 +257,16 @@ def is_init_field(cls: ConfigType, name: str) -> bool: TokenizerMode = Literal["auto", "slow", "mistral", "custom"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] -LogprobsMode = Literal["raw_logprobs", "raw_logits", "processed_logprobs", - "processed_logits"] MMEncoderTPMode = Literal["weights", "data"] +class LogprobsMode(enum.Enum): + RAW_LOGITS = "raw_logits" + RAW_LOGPROBS = "raw_logprobs" + PROCESSED_LOGITS = "processed_logits" + PROCESSED_LOGPROBS = "processed_logprobs" + + @config @dataclass(config=ConfigDict(arbitrary_types_allowed=True)) class ModelConfig: @@ -363,12 +368,13 @@ class ModelConfig: specified in `SamplingParams`. The default value comes the default for the OpenAI Chat Completions API. -1 means no cap, i.e. all (output_length * vocab_size) logprobs are allowed to be returned and it may cause OOM.""" - logprobs_mode: LogprobsMode = "raw_logprobs" + logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS """Indicates the content returned in the logprobs and prompt_logprobs. Supported mode: 1) raw_logprobs, 2) processed_logprobs, 3) raw_logits, 4) processed_logits. - Raw means the values before applying logit processors, like bad words. - Processed means the values after applying such processors. + Raw means the values before applying any logit processors, like bad words. + Processed means the values after applying all processors, including + temperature and top_k/top_p. """ disable_sliding_window: bool = False """Whether to disable sliding window. If True, we will disable the sliding @@ -2586,7 +2592,7 @@ class MultiModalConfig: skip_mm_profiling: bool = False """ - When enabled, skips multimodal memory profiling and only profiles with + When enabled, skips multimodal memory profiling and only profiles with language backbone model during engine initialization. This reduces engine startup time but shifts the responsibility to users for @@ -2649,24 +2655,24 @@ class PoolerConfig: ## for embeddings models normalize: Optional[bool] = None """ - Whether to normalize the embeddings outputs. + Whether to normalize the embeddings outputs. """ dimensions: Optional[int] = None """ - Reduce the dimensions of embeddings if model + Reduce the dimensions of embeddings if model support matryoshka representation. """ ## for classification models activation: Optional[bool] = None """ - Whether to apply activation function to the classification outputs. + Whether to apply activation function to the classification outputs. """ ## for reward models softmax: Optional[bool] = None """ - Whether to apply softmax to the reward outputs. + Whether to apply softmax to the reward outputs. 
""" step_tag_id: Optional[int] = None """ @@ -2692,9 +2698,9 @@ class PoolerConfig: max_embed_len: Optional[int] = None """ - Maximum input length allowed for embedding generation. When set, allows + Maximum input length allowed for embedding generation. When set, allows inputs longer than max_embed_len to be accepted for embedding models. - This parameter enables accepting long inputs without requiring + This parameter enables accepting long inputs without requiring VLLM_ALLOW_LONG_MAX_MODEL_LEN environment variable. When an input exceeds max_embed_len, it will be handled according to the original max_model_len validation logic. Defaults to None (i.e. set to max_model_len). diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f3afc015f669..b0f50b4429a8 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -516,6 +516,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: model_group.add_argument("--max-logprobs", **model_kwargs["max_logprobs"]) model_group.add_argument("--logprobs-mode", + choices=[f.value for f in LogprobsMode], **model_kwargs["logprobs_mode"]) model_group.add_argument("--disable-sliding-window", **model_kwargs["disable_sliding_window"]) diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index e0434c8f3d71..7bd4a5a380ac 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -8,6 +8,7 @@ from packaging import version from vllm import envs +from vllm.config import LogprobsMode from vllm.logger import init_logger from vllm.platforms import current_platform @@ -28,9 +29,16 @@ class TopKTopPSampler(nn.Module): Implementations may update the logits tensor in-place. """ - def __init__(self): + def __init__( + self, + logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS) -> None: super().__init__() - if current_platform.is_cuda(): + self.logprobs_mode = logprobs_mode + # flashinfer optimization does not apply if intermediate + # logprobs/logits after top_k/top_p need to be returned + if logprobs_mode not in (LogprobsMode.PROCESSED_LOGITS, + LogprobsMode.PROCESSED_LOGPROBS + ) and current_platform.is_cuda(): if is_flashinfer_available: flashinfer_version = flashinfer.__version__ if version.parse(flashinfer_version) < version.parse("0.2.3"): @@ -63,10 +71,12 @@ def __init__(self): "native implementation of top-p & top-k sampling. For the " "best performance, please install FlashInfer.") self.forward = self.forward_native - elif current_platform.is_tpu(): - self.forward = self.forward_tpu else: self.forward = self.forward_native + if current_platform.is_tpu(): + self.apply_top_k_top_p = apply_top_k_top_p_tpu + else: + self.apply_top_k_top_p = apply_top_k_top_p def forward_native( self, @@ -74,15 +84,20 @@ def forward_native( generators: dict[int, torch.Generator], k: Optional[torch.Tensor], p: Optional[torch.Tensor], - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """ PyTorch-native implementation of top-k and top-p sampling. The logits tensor may be updated in-place. 
""" - logits = apply_top_k_top_p(logits, k, p) + logits = self.apply_top_k_top_p(logits, k, p) + logits_to_return = None + if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: + logits_to_return = logits + elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS: + logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32) probs = logits.softmax(dim=-1, dtype=torch.float32) - return random_sample(probs, generators) + return random_sample(probs, generators), logits_to_return def forward_cuda( self, @@ -90,34 +105,24 @@ def forward_cuda( generators: dict[int, torch.Generator], k: Optional[torch.Tensor], p: Optional[torch.Tensor], - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """More optimized implementation for top-k and top-p sampling.""" - if k is None and p is None: - # We prefer `random_sample` over `flashinfer_sample` when sorting is - # not needed. This is because `random_sample` does not require - # CPU-GPU synchronization while `flashinfer_sample` does. - probs = logits.softmax(dim=-1, dtype=torch.float32) - return random_sample(probs, generators) - if generators: - logger.warning_once("FlashInfer 0.2.3+ does not support " - "per-request generators. Falling back to " - "PyTorch-native implementation.") + # We prefer `random_sample` over `flashinfer_sample` when sorting is + # not needed. This is because `random_sample` does not require + # CPU-GPU synchronization while `flashinfer_sample` does. + if (k is None and p is None) or generators: + if generators: + logger.warning_once("FlashInfer 0.2.3+ does not support " + "per-request generators. Falling back to " + "PyTorch-native implementation.") return self.forward_native(logits, generators, k, p) + assert self.logprobs_mode not in ( + LogprobsMode.PROCESSED_LOGITS, LogprobsMode.PROCESSED_LOGPROBS + ), "FlashInfer does not support returning logits/logprobs" # flashinfer sampling functions expect contiguous logits. # In flex_attn/triton_attn fp32 inference, logits can be non-contiguous # because of slicing operation in logits_processor. - return flashinfer_sample(logits.contiguous(), k, p, generators) - - def forward_tpu( - self, - logits: torch.Tensor, - generators: dict[int, torch.Generator], - k: Optional[torch.Tensor], - p: Optional[torch.Tensor], - ) -> torch.Tensor: - logits = apply_top_k_top_p_tpu(logits, k, p) - probs = logits.softmax(dim=-1, dtype=torch.float32) - return random_sample(probs, generators) + return flashinfer_sample(logits.contiguous(), k, p, generators), None def apply_top_k_top_p_tpu( diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 82f51298f1b5..70ec8a0c26dd 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A layer that samples the next tokens from the model's outputs.""" +from typing import Optional + import torch import torch.nn as nn @@ -18,10 +20,50 @@ class Sampler(nn.Module): - - def __init__(self, logprobs_mode: LogprobsMode = "raw_logprobs"): + """ + A layer that samples the next tokens from the model's outputs + with the following steps in order: + + 1. If logprobs are requested: + a) If `logprobs_mode` is `raw_logprobs`, compute logprobs + as the final logprobs to return. + b) If `logprobs_mode` is `raw_logits`, clone the logits + as the final logprobs to return. + 2. Convert logits to float32. + 3. Apply allowed token ids whitelist. + 4. Apply bad words exclusion. + 5. Apply logit processors which are not argmax-invariant, + i.e. 
that can impact greedy sampling. + a) Min tokens processor + b) Logit bias processor + 6. Apply penalties + a) Repetition penalty + b) Frequency penalty + c) Presence penalty + 7. Sample the next tokens. `sample` method performs the following steps: + a) If not `all_random`, perform greedy sampling. If `all_greedy`, + return the greedily sampled tokens and final logprobs if requested. + b) Apply temperature. + c) Apply logit processors which are argmax-invariant, by default + the min_p processor. + d) Apply top_k and/or top_p. + e) Sample the next tokens with the probability distribution. + f) If `all_random` or temperature >= epsilon (1e-5), return the + randomly sampled tokens and final logprobs if requested. Else, + return the greedily sampled tokens and logprobs if requested. + 8. Gather the logprobs of the top `max_num_logprobs` and sampled token + (if requested). Note that if the sampled token is within the top + `max_num_logprobs`, the logprob will be eventually merged in + `LogprobsProcessor` during output processing. Therefore, the + final output may contain either `max_num_logprobs + 1` or + `max_num_logprobs` logprobs. + 9. Return the final `SamplerOutput`. + """ + + def __init__(self, + logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS): super().__init__() - self.topk_topp_sampler = TopKTopPSampler() + self.topk_topp_sampler = TopKTopPSampler(logprobs_mode) self.pin_memory = is_pin_memory_available() self.logprobs_mode = logprobs_mode @@ -34,13 +76,11 @@ def forward( # temperature scaling) for the top-k logprobs. # This is different from the V0 sampler, which uses the logits that # is used for sampling (after penalties and temperature scaling). - # TODO(rob): provide option for logprobs post sampling. - # See https://vllm-dev.slack.com/archives/C07UUL8E61Z/p1735907856007919 # noqa: E501 num_logprobs = sampling_metadata.max_num_logprobs if num_logprobs is not None: - if self.logprobs_mode == "raw_logprobs": + if self.logprobs_mode == LogprobsMode.RAW_LOGPROBS: raw_logprobs = self.compute_logprobs(logits) - elif self.logprobs_mode == "raw_logits": + elif self.logprobs_mode == LogprobsMode.RAW_LOGITS: raw_logprobs = logits.clone() # Use float32 for the logits. @@ -57,15 +97,10 @@ def forward( # Apply penalties (e.g., min_tokens, freq_penalties). logits = self.apply_penalties(logits, sampling_metadata) - # Get the process logprobs or logits. - if num_logprobs is not None: - if self.logprobs_mode == "processed_logprobs": - raw_logprobs = self.compute_logprobs(logits) - elif self.logprobs_mode == "processed_logits": - raw_logprobs = logits.clone() - # Sample the next token. - sampled = self.sample(logits, sampling_metadata) + sampled, processed_logprobs = self.sample(logits, sampling_metadata) + if processed_logprobs is not None: + raw_logprobs = processed_logprobs # Convert sampled token ids to int64 (long) type to ensure compatibility # with subsequent operations that may use these values as indices. # This conversion is necessary because FlashInfer sampling operations @@ -105,7 +140,7 @@ def sample( self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """Sample logits based on sampling metadata. 
The various logits processing functions called in this method @@ -119,7 +154,13 @@ def sample( else: greedy_sampled = self.greedy_sample(logits) if sampling_metadata.all_greedy: - return greedy_sampled + processed_logprobs = None + if sampling_metadata.max_num_logprobs is not None: + if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: + processed_logprobs = logits + elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS: + processed_logprobs = self.compute_logprobs(logits) + return greedy_sampled, processed_logprobs assert sampling_metadata.temperature is not None @@ -132,7 +173,7 @@ def sample( logits = processor.apply(logits) # Apply top_k and/or top_p. - random_sampled = self.topk_topp_sampler( + random_sampled, processed_logprobs = self.topk_topp_sampler( logits, sampling_metadata.generators, sampling_metadata.top_k, @@ -140,7 +181,7 @@ def sample( ) if greedy_sampled is None: - return random_sampled + return random_sampled, processed_logprobs sampled = torch.where( sampling_metadata.temperature < _SAMPLING_EPS, @@ -148,7 +189,7 @@ def sample( random_sampled, out=greedy_sampled, # Reuse tensor ) - return sampled + return sampled, processed_logprobs def compute_logprobs(self, logits: torch.Tensor) -> torch.Tensor: return logits.log_softmax(dim=-1, dtype=torch.float32) diff --git a/vllm/v1/sample/tpu/sampler.py b/vllm/v1/sample/tpu/sampler.py index 2c9f4892bc24..04545d587e4a 100644 --- a/vllm/v1/sample/tpu/sampler.py +++ b/vllm/v1/sample/tpu/sampler.py @@ -65,7 +65,7 @@ def sample( logits = self.apply_min_p(logits, sampling_metadata.min_p) # Apply top_k and/or top_p. - random_sampled = self.topk_topp_sampler( + random_sampled, _ = self.topk_topp_sampler( logits, sampling_metadata.generators, sampling_metadata.top_k, From 352d13e539621d79325caade1c09259fb33d99de Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 21 Aug 2025 13:03:00 +0800 Subject: [PATCH 214/231] [Bugfix] Fix extra whitespace in strings caused by newline (#23272) Signed-off-by: DarkLight1337 Signed-off-by: Duncan Moss --- benchmarks/benchmark_dataset.py | 6 ++++-- examples/offline_inference/vision_language.py | 15 +++++++-------- vllm/benchmarks/datasets.py | 6 ++++-- vllm/model_executor/model_loader/tpu.py | 11 ++++++----- vllm/model_executor/models/hyperclovax_vision.py | 9 ++++----- vllm/model_executor/models/phi4mm.py | 6 +++--- vllm/transformers_utils/configs/eagle.py | 4 ++-- 7 files changed, 30 insertions(+), 27 deletions(-) diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index e1a856026c4a..2ea4f9ccaff2 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -958,8 +958,10 @@ def sample( for i, item in enumerate(self.data): if len(sampled_requests) >= num_requests: break - prompt = f"{item['input']}\n\n{item['instruction']} Just output \ - the code, do not include any explanation." + prompt = ( + f"{item['input']}\n\n{item['instruction']} Just output " + "the code, do not include any explanation." 
+ ) # apply template prompt = tokenizer.apply_chat_template( diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index e7a7a30dd31a..8d97ba266826 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -283,8 +283,10 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData: ) prompts = [ - f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\ - {question}<|assistant|>" + ( + "<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>" + f"{question}<|assistant|>" + ) for question in questions ] @@ -767,15 +769,13 @@ def run_llava_next_video(questions: list[str], modality: str) -> ModelRequestDat def run_llava_onevision(questions: list[str], modality: str) -> ModelRequestData: if modality == "video": prompts = [ - f"<|im_start|>user