Commit 0aed031

RC specific changes (commandr compilation, CAR size increase, tags fetch from upstream, config defaults)
1 parent 6c805b9 commit 0aed031

5 files changed: +21 −11 lines changed

docker/Dockerfile.rocm

Lines changed: 5 additions & 3 deletions
@@ -27,9 +27,11 @@ FROM base AS fetch_vllm_1
 ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
 ARG VLLM_BRANCH="main"
 ONBUILD RUN git clone ${VLLM_REPO} \
-    && cd vllm \
-    && git fetch -v --prune -- origin ${VLLM_BRANCH} \
-    && git checkout FETCH_HEAD
+    && cd vllm \
+    && git fetch -v --prune -- origin ${VLLM_BRANCH} \
+    && git checkout FETCH_HEAD \
+    && git remote add upstream "https://github.com/vllm-project/vllm.git" \
+    && git fetch upstream
 FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
 
 # -----------------------
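
The extra `git remote add upstream` / `git fetch upstream` step makes the upstream tags visible inside the cloned checkout (the "tags fetch from upstream" item in the commit message). A minimal sketch, assuming the build has finished and the `vllm` checkout is on disk, to confirm the tags arrived; the `fetched_tags` helper is illustrative, not part of the image:

import subprocess

def fetched_tags(repo_dir: str = "vllm") -> list[str]:
    # List the tags visible in the checkout; after `git fetch upstream`
    # this should include the upstream release tags.
    out = subprocess.run(
        ["git", "-C", repo_dir, "tag", "--list"],
        check=True, capture_output=True, text=True,
    )
    return out.stdout.split()

if __name__ == "__main__":
    print(fetched_tags()[:5])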

vllm/config/compilation.py

Lines changed: 13 additions & 3 deletions
@@ -75,11 +75,11 @@ class PassConfig:
     don't all have access to full configuration - that would create a cycle as
     the `PassManager` is set as a property of config."""
 
-    enable_fusion: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
+    enable_fusion: bool = field(default_factory=lambda: envs.VLLM_USE_V1)
     """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass."""
     enable_attn_fusion: bool = False
     """Whether to enable the custom attention+quant fusion pass."""
-    enable_noop: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
+    enable_noop: bool = field(default_factory=lambda: envs.VLLM_USE_V1)
     """Whether to enable the custom no-op elimination pass."""
     enable_sequence_parallelism: bool = False
     """Whether to enable sequence parallelism."""
@@ -223,7 +223,7 @@ class CompilationConfig:
     constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
 
     # CudaGraph compilation
-    cudagraph_mode: Optional[CUDAGraphMode] = None
+    cudagraph_mode: Optional[CUDAGraphMode] = CUDAGraphMode.FULL
     """
     The mode of the cudagraph:
@@ -408,6 +408,16 @@ def __post_init__(self) -> None:
         count_all = self.custom_ops.count("all")
         assert count_none + count_all <= 1, "Can only specify 'none' or 'all'"
 
+        if "+rms_norm" not in self.custom_ops and \
+                "-rms_norm" not in self.custom_ops:
+            self.custom_ops.append("+rms_norm")
+        if "+silu_and_mul" not in self.custom_ops and \
+                "-silu_and_mul" not in self.custom_ops:
+            self.custom_ops.append("+silu_and_mul")
+        if "+quant_fp8" not in self.custom_ops and \
+                "-quant_fp8" not in self.custom_ops:
+            self.custom_ops.append("+quant_fp8")
+
         # TODO(zou3519/luka): There are 2 issues with auto-functionalization V2:
         # 1. A bug in PyTorch, fixed in 2.7:
         #    https://github.com/pytorch/pytorch/issues/147924
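
The `__post_init__` additions force the `rms_norm`, `silu_and_mul`, and `quant_fp8` custom ops on unless the user has explicitly enabled or disabled them. A standalone sketch of that default-filling logic; the `DEFAULT_OPS` name and `apply_default_custom_ops` helper are illustrative, not part of vLLM:

DEFAULT_OPS = ("rms_norm", "silu_and_mul", "quant_fp8")

def apply_default_custom_ops(custom_ops: list[str]) -> list[str]:
    # Append "+op" only when the user gave no explicit "+op" / "-op" toggle,
    # mirroring the checks added to CompilationConfig.__post_init__ above.
    for op in DEFAULT_OPS:
        if f"+{op}" not in custom_ops and f"-{op}" not in custom_ops:
            custom_ops.append(f"+{op}")
    return custom_ops

print(apply_default_custom_ops([]))             # ['+rms_norm', '+silu_and_mul', '+quant_fp8']
print(apply_default_custom_ops(["-rms_norm"]))  # ['-rms_norm', '+silu_and_mul', '+quant_fp8']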

vllm/distributed/device_communicators/custom_all_reduce.py

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ class CustomAllreduce:
     def __init__(self,
                  group: ProcessGroup,
                  device: Union[int, str, torch.device],
-                 max_size=8192 * 1024) -> None:
+                 max_size=2 * 8192 * 1024) -> None:
         """
         Args:
             group: the process group to work on. If None, it will use the
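
This is the "CAR size increase" from the commit message: the default custom all-reduce buffer size doubles. A quick check of the byte math:

old_max_size = 8192 * 1024        # previous default
new_max_size = 2 * 8192 * 1024    # new default
print(old_max_size // 2**20, "MiB ->", new_max_size // 2**20, "MiB")  # 8 MiB -> 16 MiB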

vllm/envs.py

Lines changed: 2 additions & 2 deletions
@@ -17,7 +17,7 @@
     VLLM_NCCL_SO_PATH: Optional[str] = None
     LD_LIBRARY_PATH: Optional[str] = None
     VLLM_USE_TRITON_FLASH_ATTN: bool = True
-    VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False
+    VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = True
     VLLM_USE_AITER_UNIFIED_ATTENTION: bool = False
     VLLM_FLASH_ATTN_VERSION: Optional[int] = None
     LOCAL_RANK: int = 0
@@ -351,7 +351,7 @@ def get_vllm_port() -> Optional[int]:
     # the unified triton kernel.
     "VLLM_V1_USE_PREFILL_DECODE_ATTENTION":
     lambda:
-    (os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False").lower() in
+    (os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "True").lower() in
     ("true", "1")),
 
     # Use AITER triton unified attention for V1 attention
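
With the default flipped to `True`, the prefill/decode attention path is now on unless the environment variable opts out. A minimal sketch, assuming a vLLM install, of restoring the previous behaviour for a single process:

import os

# Any value outside ("true", "1") disables it, matching the getenv check above.
os.environ["VLLM_V1_USE_PREFILL_DECODE_ATTENTION"] = "0"

import vllm.envs as envs
print(envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION)  # False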

vllm/model_executor/models/commandr.py

Lines changed: 0 additions & 2 deletions
@@ -47,7 +47,6 @@
     row_parallel_weight_loader)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
@@ -57,7 +56,6 @@
     maybe_prefix)
 
 
-@torch.compile(backend=current_platform.simple_compile_backend)
 def layer_norm_func(hidden_states, weight, variance_epsilon):
     input_dtype = hidden_states.dtype
     hidden_states = hidden_states.to(torch.float32)
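
Dropping the `@torch.compile` decorator (the "commandr compilation" item in the commit message) makes `layer_norm_func` run eagerly by default, leaving compilation as an opt-in choice at the call site. A self-contained sketch; only the first three body lines come from the diff context, the rest is a standard mean-subtracting layer norm assumed here for illustration:

import torch

def layer_norm_func(hidden_states, weight, variance_epsilon):
    # First three lines match the diff context above.
    input_dtype = hidden_states.dtype
    hidden_states = hidden_states.to(torch.float32)
    mean = hidden_states.mean(-1, keepdim=True)
    variance = ((hidden_states - mean) ** 2).mean(-1, keepdim=True)
    hidden_states = (hidden_states - mean) * torch.rsqrt(variance + variance_epsilon)
    return (weight.to(torch.float32) * hidden_states).to(input_dtype)

# Compilation is now opt-in at the call site rather than forced by a decorator.
compiled = torch.compile(layer_norm_func)
x, w = torch.randn(2, 8), torch.ones(8)
print(compiled(x, w, 1e-5).shape)  # torch.Size([2, 8])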
