|
8 | 8 | import os |
9 | 9 | import glob |
10 | 10 | from safetensors.torch import load_file |
| 11 | +from transformers import PretrainedConfig |
| 12 | +from typing import Optional, Tuple, Any, List, Union, Iterable, cast |
| 13 | +import math |
| 14 | +import inspect |
| 15 | +from torch import nn |
| 16 | + |
| 17 | +def _is_moe(config: PretrainedConfig) -> bool: |
| 18 | + num_experts = getattr(config, "num_experts", None) |
| 19 | + if isinstance(num_experts, int): |
| 20 | + return num_experts > 1 |
| 21 | + if isinstance(num_experts, list) and num_experts: |
| 22 | + # Ensure all elements are integers before calling max. |
| 23 | + if all(isinstance(e, int) for e in num_experts): |
| 24 | + return max(num_experts) > 1 |
| 25 | + else: |
| 26 | + return False |
| 27 | + return False |
| 28 | + |
| 29 | + |
| 30 | +def _get_cla_factor(config: PretrainedConfig) -> int: |
| 31 | + if not getattr(config, "use_cla", False): |
| 32 | + return 1 |
| 33 | + return getattr(config, "cla_share_factor", 1) |
| 34 | + |
| 35 | + |
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    """
    Call the scheduler's `set_timesteps` and return the resulting schedule.

    Supports overriding the scheduler's spacing strategy with explicit
    `timesteps` or `sigmas` (mutually exclusive). Extra kwargs are forwarded
    to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            Number of diffusion steps; must be `None` when `timesteps` is given.
        device (`str` or `torch.device`, *optional*):
            Device for the timesteps; `None` leaves them where they are.
        timesteps (`List[int]`, *optional*):
            Custom timestep schedule; `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigma schedule; `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: the timestep schedule and the number of
        inference steps.

    Raises:
        ValueError: if both `timesteps` and `sigmas` are given, or the scheduler
            does not accept the requested custom schedule.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")

    def _accepts(param: str) -> bool:
        # Whether scheduler.set_timesteps takes this keyword argument.
        return param in inspect.signature(scheduler.set_timesteps).parameters

    if timesteps is not None:
        if not _accepts("timesteps"):
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
    elif sigmas is not None:
        if not _accepts("sigmas"):
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)

    resolved = scheduler.timesteps
    if timesteps is not None or sigmas is not None:
        # Custom schedules determine the step count themselves.
        num_inference_steps = len(resolved)
    return resolved, num_inference_steps
| 93 | + |
def real_batched_index_select(t, dim, idx):
    """Per-sample index_select: for each batch element i, select idx[i] along dim of t[i].

    `dim` is given relative to the batched tensor, so `dim - 1` is used on each
    unbatched slice. Results are restacked along a new batch dimension.
    """
    assert t.ndim >= 2 and idx.ndim >= 2, f"{t.ndim=} {idx.ndim=}"
    assert len(t) == len(idx), f"{len(t)=} != {len(idx)=}"
    selected = [sample.index_select(dim - 1, rows) for sample, rows in zip(t, idx)]
    return torch.stack(selected)
| 99 | + |
| 100 | + |
def conv_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D convolution module.

    :param dims: spatial dimensionality of the convolution (1, 2, or 3).
    :return: the corresponding nn.ConvNd module built from *args/**kwargs.
    :raises ValueError: if dims is not 1, 2, or 3.
    """
    if dims == 1:
        conv_cls = nn.Conv1d
    elif dims == 2:
        conv_cls = nn.Conv2d
    elif dims == 3:
        conv_cls = nn.Conv3d
    else:
        raise ValueError(f"unsupported dimensions: {dims}")
    return conv_cls(*args, **kwargs)
| 112 | + |
11 | 113 |
|
def normalization(channels, **kwargs):
    """
    Make a standard normalization layer.

    :param channels: number of input channels.
    :return: a nn.Module for normalization (GroupNorm with 32 groups).
    """
    num_groups = 32  # fixed group count used by this factory
    return nn.GroupNorm(num_groups, channels, **kwargs)
| 122 | + |
| 123 | + |
def linear(*args, **kwargs):
    """Create a fully-connected layer (thin wrapper over nn.Linear)."""
    return nn.Linear(*args, **kwargs)
| 129 | + |
| 130 | + |
def zero_module(module):
    """Zero out every parameter of *module* in place and return the module.

    Commonly used to initialize the last layer of a residual branch so the
    block starts as an identity-like mapping.
    """
    for param in module.parameters():
        # detach() avoids tracking the in-place zeroing in autograd.
        param.detach().zero_()
    return module
| 138 | + |
| 139 | + |
| 140 | +def _to_tuple(x, dim=2): |
| 141 | + if isinstance(x, int): |
| 142 | + return (x,) * dim |
| 143 | + elif len(x) == dim: |
| 144 | + return x |
| 145 | + else: |
| 146 | + raise ValueError(f"Expected length {dim} or int, but got {x}") |
| 147 | + |
| 148 | + |
def get_meshgrid_nd(start, *args, dim=2):
    """
    Build a `dim`-dimensional meshgrid and return it stacked as a
    [dim, *sizes] float32 tensor. Each axis samples like
    np.linspace(start, stop, num, endpoint=False).

    Call patterns:
      get_meshgrid_nd(size)              -> grid over [0, size) per axis, step 1
      get_meshgrid_nd(start, stop)       -> step-1 grid over [start, stop)
      get_meshgrid_nd(start, stop, num)  -> num samples over [start, stop)
    """
    if len(args) == 0:
        # Only a grid size was given: span [0, size) on every axis.
        stop = _to_tuple(start, dim=dim)
        start = (0,) * dim
        num = stop
    elif len(args) == 1:
        # (start, stop) with an implicit step of 1.
        start = _to_tuple(start, dim=dim)
        stop = _to_tuple(args[0], dim=dim)
        num = [stop[i] - start[i] for i in range(dim)]
        num_int = [int(x) for x in num]
        # Extents must be whole numbers when the step is implicit.
        assert (torch.tensor(num) == torch.tensor(num_int)).all(), f"num should be int, but got {num}"
        num = num_int
    elif len(args) == 2:
        # (start, stop, num): explicit per-axis sample counts.
        start = _to_tuple(start, dim=dim)   # Left-Top, e.g. (12, 0)
        stop = _to_tuple(args[0], dim=dim)  # Right-Bottom, e.g. (20, 32)
        num = _to_tuple(args[1], dim=dim)   # Target size, e.g. (32, 124)
    else:
        raise ValueError(f"len(args) should be 0, 1 or 2, but got {len(args)}")

    # torch.linspace has no endpoint=False: sample num+1 points and drop the last.
    axes = [
        torch.linspace(start[i], stop[i], num[i] + 1, dtype=torch.float32)[: num[i]]
        for i in range(dim)
    ]
    mesh = torch.meshgrid(*axes, indexing="ij")  # dim tensors, each of shape sizes
    return torch.stack(mesh, dim=0)  # [dim, *sizes]
| 182 | + |
def build_2d_rope(
    seq_len: int, n_elem: int, image_infos: Optional[List[Tuple[slice, Tuple[int, int]]]] = None,
    device: Optional[torch.device] = None, base: int = 10000, base_rescale_factor: float = 1.0,
    return_all_pos: bool = False,
):
    """Build 2D rotary position embedding tables for a mixed text/image sequence.

    Text tokens get identical (y, x) positions along the diagonal; each image
    span in ``image_infos`` — a ``(slice, (h, w))`` pair — instead gets an
    ``h x w`` grid of positions centered within the ``w * h`` token span.

    :param seq_len: total sequence length; positions are truncated to this.
    :param n_elem: rotary dimension; must be divisible by 4 (split across y/x).
    :param image_infos: per-image ``(slice, (h, w))`` entries; ``None`` = text only.
    :param device: device for the returned tensors.
    :param base: RoPE frequency base.
    :param base_rescale_factor: rescales ``base`` when != 1.0 (context extension).
    :param return_all_pos: also return the ``[seq_len, 1, 2]`` position tensor.
    :return: ``(cos, sin)`` each of shape ``[seq_len, n_elem]``, plus
        ``all_pos`` when ``return_all_pos`` is True.
    """

    assert n_elem % 4 == 0, f"n_elem must be divisible by 4, but got {n_elem}."

    # theta: inverse frequencies, grouped in pairs so each (y, x) axis gets
    # n_elem // 4 frequencies.
    if base_rescale_factor != 1.0:
        base *= base_rescale_factor ** (n_elem / (n_elem - 2))
    theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, device=device).float() / n_elem))
    theta = theta.reshape(1, n_elem // 4, 2)  # [1, half_d, 2]

    # position indices
    if image_infos is None:
        image_infos = []

    # Single-sample path: wrapped in lists so the loop below stays uniform.
    image_infos_list = [image_infos]
    sample_seq_lens = [seq_len]

    # Prepare position indices for each sample
    x_sections = []
    y_sections = []
    for sample_id, sample_image_infos in enumerate(image_infos_list):
        last_pos = 0
        for sec_slice, (h, w) in sample_image_infos:
            L = sec_slice.start  # start from 0, so image_slice.start is just L
            # previous text
            if last_pos < L:
                # Diagonal positions for the text run preceding this image.
                y_sections.append(torch.arange(last_pos, L))
                x_sections.append(torch.arange(last_pos, L))
            elif h is None:
                # Interleave data has overlapped positions for <boi> <size> <ratio> <timestep> <eoi> tokens.
                y_sections.append(torch.arange(sec_slice.start, sec_slice.stop))
                x_sections.append(torch.arange(sec_slice.start, sec_slice.stop))
                continue
            else:
                # Interleave data has overlapped positions for noised image and the successive clean image,
                # leading to last_pos (= last text end L + noise w * h) > L (last text end L).
                pass
            # current image
            # NOTE(review): if last_pos < L while h is None, control reaches the
            # grid math below with h=None and would fail on w * h — presumably
            # that combination never occurs in the data; confirm upstream.
            # beta_* center the h x w grid inside the w * h flat token span.
            beta_y = L + (w * h - h) / 2
            beta_x = L + (w * h - w) / 2
            grid = get_meshgrid_nd((beta_y, beta_x), (beta_y + h, beta_x + w))  # [2, h, w]
            grid = grid.reshape(2, -1)  # (y, x)
            y_sections.append(grid[0])
            x_sections.append(grid[1])
            # step
            last_pos = L + w * h
        # final text
        y_sections.append(torch.arange(last_pos, sample_seq_lens[sample_id]))
        x_sections.append(torch.arange(last_pos, sample_seq_lens[sample_id]))

    x_pos = torch.cat(x_sections).long()
    y_pos = torch.cat(y_sections).long()
    # If there are overlap positions, we need to remove them.
    x_pos = x_pos[:seq_len]
    y_pos = y_pos[:seq_len]
    all_pos = torch.stack((y_pos, x_pos), dim=1).unsqueeze(1).to(device)  # [seq_len, 1, 2]

    # calc rope: [seq_len, 1, 2] * [1, n_elem // 4, 2] -> flatten -> duplicate
    # to the full n_elem width (rotate-half layout).
    idx_theta = (all_pos * theta).reshape(all_pos.shape[0], n_elem // 2).repeat(1, 2)

    cos = torch.cos(idx_theta)
    sin = torch.sin(idx_theta)

    if return_all_pos:
        return cos, sin, all_pos

    return cos, sin
| 254 | + |
| 255 | + |
def build_batch_2d_rope(
    seq_len: int, n_elem: int, image_infos: Optional[List[List[Tuple[slice, Tuple[int, int]]]]] = None,
    device: Optional[torch.device] = None, base: int = 10000, base_rescale_factor: float = 1.0,
    return_all_pos: bool = False,
):
    """Batched wrapper around ``build_2d_rope``.

    Builds one cos/sin table per sample in ``image_infos`` and stacks them on a
    new leading batch dimension. When ``return_all_pos`` is True, the per-sample
    position tensors are returned as a list (not stacked).
    """
    if image_infos is None:
        # Single text-only sample.
        image_infos = [None]

    cos_per_sample, sin_per_sample, pos_per_sample = [], [], []
    for sample_infos in image_infos:
        result = build_2d_rope(
            seq_len, n_elem, image_infos=sample_infos, device=device,
            base=base, base_rescale_factor=base_rescale_factor,
            return_all_pos=return_all_pos,
        )
        if not isinstance(result, tuple) or len(result) not in (2, 3):
            raise ValueError(
                "build_2d_rope must return a tuple of length 2 or 3 "
                f"when return_all_pos={return_all_pos}, got: {type(result)} with length "
                f"{len(result) if isinstance(result, tuple) else 'N/A'}"
            )
        if len(result) == 3:
            cos, sin, pos = result
        else:
            cos, sin = result
            pos = None
        cos_per_sample.append(cos)
        sin_per_sample.append(sin)
        pos_per_sample.append(pos)

    cos_batch = torch.stack(cos_per_sample, dim=0)
    sin_batch = torch.stack(sin_per_sample, dim=0)
    if return_all_pos:
        return cos_batch, sin_batch, pos_per_sample

    return cos_batch, sin_batch
12 | 290 |
|
13 | 291 | def get_full_state_dict(model_path): |
14 | 292 | files = glob.glob(os.path.join(model_path, "*.safetensors")) |
|
0 commit comments