Skip to content

Commit 9eeac24

Browse files
meghaagr13 and tzhouam authored
[Feat] Phase 1 foundation types for multimodal output decoupling (vllm-project#1816)
Signed-off-by: Megha Agarwal <agarwalmegha1308@gmail.com> Signed-off-by: Megha Agarwal <agarwalmegha@microsoft.com> Co-authored-by: Zhou Taichang <tzhouam@connect.ust.hk>
1 parent 77d773a commit 9eeac24

File tree

4 files changed

+346
-0
lines changed

4 files changed

+346
-0
lines changed
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
"""Unit tests for Phase 1 foundation types (RFC #1601).
2+
3+
Note: Uses importlib to load modules directly, bypassing the vllm_omni
4+
package __init__ which requires the vllm base package.
5+
"""
6+
7+
import importlib.util
8+
import sys
9+
from pathlib import Path
10+
11+
import pytest
12+
import torch
13+
14+
# ── Load modules without triggering vllm_omni.__init__ ─────────────
15+
16+
_ENGINE_DIR = Path(__file__).resolve().parents[2] / "vllm_omni" / "engine"
17+
18+
19+
def _load_module(name: str, filepath: Path):
20+
spec = importlib.util.spec_from_file_location(name, filepath)
21+
mod = importlib.util.module_from_spec(spec)
22+
sys.modules[name] = mod
23+
spec.loader.exec_module(mod)
24+
return mod
25+
26+
27+
# Load the two Phase 1 modules straight from their source files so that the
# vllm_omni package __init__ (which needs the real vllm package) never runs.
_om_mod = _load_module(
    "vllm_omni.engine.output_modality", _ENGINE_DIR / "output_modality.py"
)
_mm_mod = _load_module(
    "vllm_omni.engine.mm_outputs", _ENGINE_DIR / "mm_outputs.py"
)

# Re-export the names under test at module scope for readability below.
OutputModality = _om_mod.OutputModality
TensorAccumulationStrategy = _om_mod.TensorAccumulationStrategy
get_accumulation_strategy = _om_mod.get_accumulation_strategy
MultimodalPayload = _mm_mod.MultimodalPayload
MultimodalCompletionOutput = _mm_mod.MultimodalCompletionOutput
41+
42+
43+
def test_output_modality_parsing_and_flags():
    """Test OutputModality enum: from_string, aliases, compounds, properties, and accumulation strategy."""
    # Defaults, case-insensitive names, and aliases — table-driven.
    expected = {
        None: OutputModality.TEXT,
        "": OutputModality.TEXT,
        "image": OutputModality.IMAGE,
        "Audio": OutputModality.AUDIO,
        "speech": OutputModality.AUDIO,
        "latents": OutputModality.LATENT,
        "pixel_values": OutputModality.IMAGE,
    }
    for raw, want in expected.items():
        assert OutputModality.from_string(raw) == want

    # "text+image" parses into a compound flag carrying both kinds.
    compound = OutputModality.from_string("text+image")
    assert compound.has_text
    assert compound.has_multimodal

    # Pure members expose exactly one of the two flag properties.
    assert OutputModality.TEXT.has_text
    assert not OutputModality.TEXT.has_multimodal
    assert OutputModality.IMAGE.has_multimodal
    assert not OutputModality.IMAGE.has_text

    # Audio concatenates on the last dim; images stack along dim 0.
    assert get_accumulation_strategy(OutputModality.AUDIO) == TensorAccumulationStrategy.CONCAT_LAST
    assert get_accumulation_strategy(OutputModality.IMAGE) == TensorAccumulationStrategy.CONCAT_DIM0

    # Unrecognized modality strings raise with a helpful message.
    with pytest.raises(ValueError, match="Unknown modality"):
        OutputModality.from_string("video")
73+
74+
75+
def test_multimodal_payload_and_completion_output():
    """Test MultimodalPayload and MultimodalCompletionOutput wrapper."""
    # from_dict routes tensors into .tensors and everything else into .metadata.
    waveform = torch.ones(1, 16000)
    payload = MultimodalPayload.from_dict({"waveform": waveform, "sample_rate": 16000})
    assert payload is not None
    assert "waveform" in payload.tensors
    assert torch.equal(payload.primary_tensor, waveform)
    assert payload.metadata["sample_rate"] == 16000
    assert not payload.is_empty
    assert len(payload) == 1

    # Missing or empty input yields no payload at all.
    for empty_input in (None, {}):
        assert MultimodalPayload.from_dict(empty_input) is None

    # The wrapper forwards CompletionOutput fields and keeps the payload.
    wrapper = MultimodalCompletionOutput(
        multimodal_output=payload,
        index=0,
        text="hello",
        token_ids=[],
        cumulative_logprob=None,
        logprobs=None,
    )
    assert wrapper.text == "hello"
    assert wrapper.multimodal_output is payload
99+
100+
101+
def test_output_modality_printed_examples(capsys):
    """Printed examples for output modality types."""
    # Human-readable demo of the parsing table: defaults, aliases, compounds.
    print("\n=== OutputModality Parsing ===")
    for s in [None, "", "image", "Audio", "speech", "latents", "pixel_values", "text+image"]:
        print(f"  from_string({s!r:20s}) -> {OutputModality.from_string(s)}")

    print("\n=== Flag Properties ===")
    for m in [
        OutputModality.TEXT,
        OutputModality.IMAGE,
        OutputModality.AUDIO,
        OutputModality.TEXT | OutputModality.IMAGE,
    ]:
        print(f"  {str(m):40s} has_text={m.has_text} has_multimodal={m.has_multimodal}")

    print("\n=== Accumulation Strategies ===")
    for m in [OutputModality.AUDIO, OutputModality.IMAGE, OutputModality.LATENT]:
        print(f"  {str(m):30s} -> {get_accumulation_strategy(m)}")

    # Show the tensor/metadata split and the None-for-empty contract.
    print("\n=== MultimodalPayload ===")
    data = {"waveform": torch.ones(1, 16000), "sample_rate": 16000}
    p = MultimodalPayload.from_dict(data)
    print("  from_dict({waveform: tensor, sample_rate: 16000})")
    print(f"  tensors keys  : {list(p.tensors.keys())}")
    print(f"  primary_tensor: shape={p.primary_tensor.shape}, dtype={p.primary_tensor.dtype}")
    print(f"  metadata      : {p.metadata}")
    print(f"  is_empty={p.is_empty}, len={len(p)}")
    print(f"  from_dict(None) -> {MultimodalPayload.from_dict(None)}")
    print(f"  from_dict({{}}) -> {MultimodalPayload.from_dict({})}")

    print("\n=== MultimodalCompletionOutput ===")
    wrapper = MultimodalCompletionOutput(
        multimodal_output=p,
        index=0,
        text="hello",
        token_ids=[],
        cumulative_logprob=None,
        logprobs=None,
    )
    print(f"  text             : {wrapper.text}")
    print(f"  index            : {wrapper.index}")
    print(f"  multimodal_output: {wrapper.multimodal_output}")
    print(f"  repr             : {wrapper!r}")

    print("\n=== Unknown Modality ===")
    try:
        OutputModality.from_string("video")
    except ValueError as e:
        print(f'  from_string("video") raised ValueError: {e}')

    # Sanity-check the captured stdout so the demo cannot silently go blank.
    captured = capsys.readouterr()
    assert "OutputModality Parsing" in captured.out
    assert "MultimodalPayload" in captured.out

vllm_omni/engine/arg_utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from vllm.logger import init_logger
88

99
from vllm_omni.config import OmniModelConfig
10+
from vllm_omni.engine.output_modality import OutputModality
1011
from vllm_omni.plugins import load_omni_general_plugins
1112

1213
logger = init_logger(__name__)
@@ -143,3 +144,8 @@ def create_model_config(self) -> OmniModelConfig:
143144
task_type=self.task_type,
144145
)
145146
return omni_config
147+
148+
@property
149+
def output_modality(self) -> OutputModality:
150+
"""Parse engine_output_type into a type-safe OutputModality flag."""
151+
return OutputModality.from_string(self.engine_output_type)

vllm_omni/engine/mm_outputs.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
"""Multimodal output data structures for vLLM-Omni.
2+
3+
This module defines structured types for multimodal outputs.
4+
5+
"""
6+
7+
from __future__ import annotations
8+
9+
from dataclasses import dataclass, field
10+
from typing import Any
11+
12+
import torch
13+
from vllm.outputs import CompletionOutput
14+
15+
16+
@dataclass
class MultimodalPayload:
    """Structured multimodal output payload.

    Attributes:
        tensors: Dictionary mapping modality/key names to their tensors.
        metadata: Optional dictionary for non-tensor metadata
            (e.g., sample rate for audio, image dimensions).
    """

    tensors: dict[str, torch.Tensor] = field(default_factory=dict)
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def primary_tensor(self) -> torch.Tensor | None:
        """Return the first tensor in the payload, or None if empty.

        "First" follows dict insertion order, i.e. the first tensor-valued
        key encountered by :meth:`from_dict`.
        """
        if self.tensors:
            return next(iter(self.tensors.values()))
        return None

    @property
    def is_empty(self) -> bool:
        """Return True if the payload has no tensors (metadata may still exist)."""
        return len(self.tensors) == 0

    def get(self, key: str) -> torch.Tensor | None:
        """Get a tensor by key, returning None if not found."""
        return self.tensors.get(key)

    def __contains__(self, key: str) -> bool:
        # Membership is defined over tensor keys only, not metadata.
        return key in self.tensors

    def __len__(self) -> int:
        # Length counts tensors only, consistent with __contains__/is_empty.
        return len(self.tensors)

    @classmethod
    def from_dict(cls, data: dict[str, Any] | None) -> MultimodalPayload | None:
        """Create a MultimodalPayload from a raw dictionary.

        Separates ``torch.Tensor`` values into ``tensors`` and everything
        else into ``metadata``. Returns None for None/empty input so callers
        can use the result in a plain truthiness check.
        """
        if not data:
            return None
        tensors: dict[str, torch.Tensor] = {}
        metadata: dict[str, Any] = {}
        for key, value in data.items():
            if isinstance(value, torch.Tensor):
                tensors[key] = value
            else:
                metadata[key] = value
        # `data` is non-empty here, so at least one of the two dicts is
        # populated; the former "both empty -> None" branch was unreachable
        # and has been removed.
        return cls(tensors=tensors, metadata=metadata)
70+
71+
72+
@dataclass
class MultimodalCompletionOutput(CompletionOutput):
    """CompletionOutput with multimodal support.

    Inherits all CompletionOutput fields and adds multimodal_output.
    As a CompletionOutput subclass, compatible with all existing vLLM consumers.
    """

    # NOTE(review): @dataclass will not replace __init__/__repr__ here because
    # both are defined explicitly below, but it may still synthesize __eq__
    # from the inherited fields, which would ignore multimodal_output —
    # confirm that equality semantics are intentional.
    def __init__(
        self,
        multimodal_output: MultimodalPayload | None = None,
        **kwargs: Any,
    ):
        # All CompletionOutput fields (index, text, token_ids, ...) are
        # forwarded untouched to the base constructor via **kwargs.
        super().__init__(**kwargs)
        self.multimodal_output = multimodal_output

    def __repr__(self) -> str:
        # Splice multimodal_output into the parent repr; assumes the parent
        # repr ends with ")" — TODO confirm for CompletionOutput.
        base = super().__repr__()
        return f"{base[:-1]}, multimodal_output={self.multimodal_output!r})"
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
"""Output modality types for vLLM-Omni.
2+
3+
This module defines the OutputModality enum and TensorAccumulationStrategy
4+
for type-safe multimodal output routing and tensor merging.
5+
6+
"""
7+
8+
from __future__ import annotations
9+
10+
import re
11+
from enum import Enum, Flag, auto
12+
13+
# Free-text spellings callers commonly use, mapped onto canonical member
# names (lower-case) before the enum lookup in OutputModality.from_string.
_MODALITY_ALIASES: dict[str, str] = {
    "speech": "audio",
    "images": "image",
    "latents": "latent",
    "wav": "audio",
    "waveform": "audio",
    "pixel_values": "image",
    "pixels": "image",
}


class OutputModality(Flag):
    """Bit-flag enum for output modalities.

    Compose freely with ``|`` — no need to enumerate every combination.

    Single: ``OutputModality.TEXT``, ``OutputModality.IMAGE``, ...
    Compound: ``OutputModality.TEXT | OutputModality.IMAGE`` (text+image)

    Note: POOLING is intentionally excluded. Pooling/embedding is vLLM's
    native path (pooling_output → PoolingRequestOutput), handled entirely
    by the base OutputProcessor. vLLM-Omni's layer does not participate.
    """

    TEXT = auto()
    IMAGE = auto()
    AUDIO = auto()
    LATENT = auto()

    @classmethod
    def from_string(cls, s: str | None) -> OutputModality:
        """Parse a free-text modality string into an OutputModality flag.

        Handles common aliases and compound strings separated by + or ,.
        None/blank input defaults to TEXT.

        Examples::

            OutputModality.from_string("text+image")
            # → OutputModality.TEXT | OutputModality.IMAGE

        Raises:
            ValueError: If any component is not a known modality or alias.
        """
        if not s or not s.strip():
            return cls.TEXT

        parts = [p.strip().lower() for p in re.split(r"[+,]", s.strip())]
        result = cls(0)
        for p in parts:
            p = _MODALITY_ALIASES.get(p, p)
            try:
                result |= cls[p.upper()]
            except KeyError:
                # `from None`: the KeyError is an implementation detail of the
                # enum lookup and only adds noise to the caller's traceback.
                raise ValueError(
                    f"Unknown modality: {p!r}. "
                    f"Supported: {[m.name.lower() for m in cls]}"
                ) from None
        return result

    @property
    def has_text(self) -> bool:
        """True if the TEXT flag is set."""
        return OutputModality.TEXT in self

    @property
    def has_multimodal(self) -> bool:
        """True if any non-TEXT flag (image/audio/latent) is set."""
        return bool(self & ~OutputModality.TEXT)
73+
74+
75+
class TensorAccumulationStrategy(Enum):
    """Strategy for merging incremental multimodal tensors."""

    # Concatenate along dimension 0. Used for image/latent tensors.
    CONCAT_DIM0 = "concat_dim0"

    # Concatenate along the last dimension. Used for audio waveforms.
    CONCAT_LAST = "concat_last"

    # Append to a list (no tensor concatenation).
    APPEND_LIST = "append_list"

    # Replace previous tensor entirely with the latest one.
    REPLACE = "replace"
89+
90+
91+
def get_accumulation_strategy(modality: OutputModality) -> TensorAccumulationStrategy:
    """Determine tensor merge strategy from the multimodal flags."""
    # Audio streams grow along the time axis (last dim); everything else —
    # images, latents, and any other modality — stacks along dim 0, which is
    # also the default for flags with no dedicated strategy.
    if OutputModality.AUDIO in modality:
        return TensorAccumulationStrategy.CONCAT_LAST
    return TensorAccumulationStrategy.CONCAT_DIM0

0 commit comments

Comments
 (0)