
[Bug]: MoeDistributeCombineV2_116(MoeDistributeCombineV2) Verify failed. reason: contains negative or zero dimension. #4213

@shaojun0


Your current environment

services:
  vllm-ascend-head:
    image: quay.io/ascend/vllm-ascend:v0.11.0-dev
    container_name: vllm-ascend-head
    network_mode: host
    privileged: true
    restart: always
    shm_size: 500g
    environment:
      - HCCL_IF_IP=10.48.205.243
      - GLOO_SOCKET_IFNAME=bond4
      - TP_SOCKET_IFNAME=bond4
      - HCCL_SOCKET_IFNAME=bond4
      - RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
      - ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
      - OMP_PROC_BIND=false
      - OMP_NUM_THREADS=100
      - VLLM_USE_V1=1
      - HCCL_BUFFSIZE=1024
    devices:
      - /dev/davinci0
      - /dev/davinci1
      - /dev/davinci2
      - /dev/davinci3
      - /dev/davinci4
      - /dev/davinci5
      - /dev/davinci6
      - /dev/davinci7
      - /dev/davinci_manager
      - /dev/devmm_svm
      - /dev/hisi_hdc
    volumes:
      - /usr/local/dcmi:/usr/local/dcmi
      - /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi
      - /usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64
      - /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info
      - /etc/ascend_install.info:/etc/ascend_install.info
      - /data:/data
    command: >
      vllm serve /data/deepseek-V3.1-w8a8
      --host 0.0.0.0
      --port 1025
      --headless
      --data-parallel-size 4
      --data-parallel-size-local 2
      --data-parallel-start-rank 2
      --data-parallel-address 10.48.205.242
      --data-parallel-rpc-port 13389
      --tensor-parallel-size 4
      --seed 1024
      --quantization ascend
      --served-model-name DeepSeek-r1-32k_token
      --enable-expert-parallel
      --max-num-seqs 16
      --max-model-len 32768
      --max-num-batched-tokens 32768
      --trust-remote-code
      --no-enable-prefix-caching
      --gpu-memory-utilization 0.99
      --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
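For reference, the parallel layout implied by the flags above can be checked with a quick sketch (plain arithmetic from the CLI values; the EP rank formula dp_rank * tp_size + tp_rank is an assumption inferred from worker names such as Worker_DP1_TP1_EP5 in the traceback below, not taken from vLLM-Ascend internals, and the variable names are illustrative only):

# Sketch: derive the parallel layout from the serve flags above.
# Assumption: with --enable-expert-parallel, EP world size = DP size * TP size,
# and EP rank = dp_rank * tp_size + tp_rank (matches worker names like
# Worker_DP1_TP1_EP5 in the log, but is not taken from vLLM-Ascend source).

dp_size = 4      # --data-parallel-size
dp_local = 2     # --data-parallel-size-local (DP ranks hosted on this node)
dp_start = 2     # --data-parallel-start-rank
tp_size = 4      # --tensor-parallel-size

ep_size = dp_size * tp_size
local_npus = dp_local * tp_size

print(f"EP world size: {ep_size}")         # 16 expert-parallel ranks in total
print(f"NPUs on this node: {local_npus}")  # 8 -> matches ASCEND_RT_VISIBLE_DEVICES=0-7

for dp in range(dp_start, dp_start + dp_local):
    for tp in range(tp_size):
        print(f"DP{dp} TP{tp} -> EP{dp * tp_size + tp}")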

The error occurs after the server has been in use for some time.

🐛 Describe the bug

(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671] WorkerProc hit an exception.
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671] Traceback (most recent call last):
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]   File "/usr/local/python3.11.13/lib/python3.11/site-packages/torch_npu/dynamo/torchair/_utils/error_code.py", line 43, in wapper
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]     return func(*args, **kwargs)
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]            ^^^^^^^^^^^^^^^^^^^^^
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]   File "/usr/local/python3.11.13/lib/python3.11/site-packages/torch_npu/dynamo/torchair/core/_backend.py", line 125, in compile
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]     return super(TorchNpuGraph, self).compile()
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671] RuntimeError: E19014: [PID: 1085] 2025-11-15-05:36:42.238.323 Value [input assist_info_for_combine shape] for Op [MoeDistributeCombineV2_116] is invalid. Reason: contains negative or zero dimension.
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]         Solution: Invalid operator information. Check the operator information in the error message.
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]         TraceBack (most recent call last):
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]         MoeDistributeCombineV2_116(MoeDistributeCombineV2) Verify failed.[FUNC:Verify][FILE:node_utils_ex.cc][LINE:165]
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]         Verify ing MoeDistributeCombineV2_116 failed.[FUNC:InferShapeAndType][FILE:infershape_pass.cc][LINE:131]
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]         Call InferShapeAndType for node:MoeDistributeCombineV2_116(MoeDistributeCombineV2) failed[FUNC:Infer][FILE:infershape_pass.cc][LINE:118]
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]         process pass InferShapePass on node:MoeDistributeCombineV2_116 failed, ret:4294967295[FUNC:RunPassesOnNode][FILE:base_pass.cc][LINE:565]
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]         [Call][PreRun] Failed, graph_id:2, session_id:2.[FUNC:CompileGraph][FILE:graph_manager.cc][LINE:4654]
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]         [Compile][Graph]Compile graph failed, error code:1343225857, session_id:2, graph_id:2, isEnableSliceSchedule:0.[FUNC:CompileGraph][FILE:ge_api.cc][LINE:1365]
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671] During handling of the above exception, another exception occurred:
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671] Traceback (most recent call last):
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]   File "/vllm-workspace/vllm/vllm/v1/executor/multiproc_executor.py", line 666, in worker_busy_loop
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]     output = func(*args, **kwargs)
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]              ^^^^^^^^^^^^^^^^^^^^^
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]   File "/vllm-workspace/vllm-ascend/vllm_ascend/worker/worker_v1.py", line 375, in execute_dummy_batch
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]     self.model_runner._dummy_run(
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]   File "/usr/local/python3.11.13/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]     return func(*args, **kwargs)
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]            ^^^^^^^^^^^^^^^^^^^^^
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]   File "/vllm-workspace/vllm-ascend/vllm_ascend/worker/model_runner_v1.py", line 2506, in _dummy_run
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]     hidden_states = self._generate_dummy_run_hidden_states(
(Worker_DP1_TP1_EP5 pid=1085) ERROR 11-15 05:36:42 [multiproc_executor.py:671]          
