|
31 | 31 | ) |
32 | 32 |
|
33 | 33 |
|
| 34 | +import logging |
| 35 | +from typing import Any, Callable, Dict, List, Optional, Union |
| 36 | + |
| 37 | +import torch |
| 38 | + |
| 39 | + |
| 40 | +def _find_image_token_runs( |
| 41 | + input_ids: torch.Tensor, image_token_id: Optional[int] |
| 42 | +) -> List[tuple[int, int, int]]: |
| 43 | + """Return contiguous runs (start, end, length) of image_token_id in input_ids. |
| 44 | +
|
| 45 | + input_ids must be a 1D torch.Tensor. If image_token_id is None, returns an empty list. |
| 46 | + """ |
| 47 | + if image_token_id is None: |
| 48 | + return [] |
| 49 | + |
| 50 | + ids_list = input_ids.tolist() |
| 51 | + runs: List[tuple[int, int, int]] = [] |
| 52 | + i = 0 |
| 53 | + L = len(ids_list) |
| 54 | + while i < L: |
| 55 | + if ids_list[i] == image_token_id: |
| 56 | + j = i |
| 57 | + while j < L and ids_list[j] == image_token_id: |
| 58 | + j += 1 |
| 59 | + runs.append((i, j - 1, j - i)) |
| 60 | + i = j |
| 61 | + else: |
| 62 | + i += 1 |
| 63 | + |
| 64 | + return runs |
| 65 | + |
| 66 | + |
def _hf_to_multimodal_inputs(  # noqa: C901
    inputs: Dict[str, Any], image_token_id: Optional[int] = None
) -> List[MultimodalInput]:
    """Convert a HuggingFace AutoProcessor dict to ExecuTorch MultimodalInputs.

    Currently only 1 image inside the input is supported.

    Args:
        inputs: A dictionary containing the input data.
        image_token_id: The token ID for the image, if present.

    `inputs` expected keys:
        - 'input_ids': torch.Tensor of shape (L,) or (1, L)
        - Optional 'pixel_values': torch.Tensor; if present, must also provide
          'image_token_id' and there must be exactly one contiguous run of
          image tokens in input_ids.

    Returns:
        A list of MultimodalInput objects: [tokens-before?, image, tokens-after?]
        when an image is present, or a single token input otherwise.

    Raises:
        RuntimeError: missing keys, invalid shapes/dtypes, or unsupported cases.
    """
    if "input_ids" not in inputs:
        raise RuntimeError("HF inputs dict must contain 'input_ids' (torch.Tensor)")

    input_ids = inputs["input_ids"]
    if not isinstance(input_ids, torch.Tensor):
        raise RuntimeError("'input_ids' must be a torch.Tensor")

    # Accept (1, L) and flatten it to (L,); reject any other batched shape.
    if input_ids.dim() == 2:
        if input_ids.size(0) != 1:
            raise RuntimeError(
                "Expected 'input_ids' with batch size 1 when 2D (shape (1, L))"
            )
        input_ids = input_ids.squeeze(0)
    if input_ids.dim() != 1:
        raise RuntimeError("'input_ids' must be 1D (L) or 2D with batch size 1")

    has_pixel_values = "pixel_values" in inputs

    # If pixel_values in dict, require image_token_id so the image's position
    # in the token stream can be located.
    if has_pixel_values and image_token_id is None:
        raise RuntimeError("'pixel_values' provided but missing 'image_token_id'")

    # If there are image token ids but no pixel_values, it's an error.
    # (Check the cheap flag first so the tensor scan only runs when needed.)
    if (
        image_token_id is not None
        and not has_pixel_values
        and (input_ids == image_token_id).any().item()
    ):
        raise RuntimeError(
            "Found image token(s) in input_ids but 'pixel_values' not provided"
        )

    # No images: return a single tokens input.
    if not has_pixel_values:
        return [make_token_input(input_ids.to(torch.long).tolist())]

    # Determine number of images from pixel_values shape.
    pv = inputs["pixel_values"]
    if not isinstance(pv, torch.Tensor):
        raise RuntimeError(
            "'pixel_values' must be a torch.Tensor, run with `return_tensors='pt'` in HF processor"
        )
    if pv.dim() == 4:
        num_images = int(pv.size(0))
    elif pv.dim() == 3:
        num_images = 1
    else:
        raise RuntimeError(
            f"'pixel_values' must be 3D (C,H,W) or 4D (N,C,H,W)/(N,H,W,C), got shape {pv.shape}"
        )

    # Only support batch size 1 for now:
    if num_images != 1:
        raise RuntimeError("Only 1 image is supported for now")

    # Find contiguous runs of image_token_id in input_ids
    runs = _find_image_token_runs(input_ids, image_token_id)

    if len(runs) == 0:
        raise RuntimeError(
            "'pixel_values' provided but no occurrence of 'image_token_id' in input_ids"
        )

    # Support only one image/run for now; num_images is known to be 1 at this
    # point (checked above), so only the run count can still mismatch.
    if len(runs) != 1:
        raise RuntimeError(
            f"Mismatch between images and image token runs: images={num_images}, runs={len(runs)} (only batch=1 and a single contiguous run are supported)"
        )

    first, last, _ = runs[0]

    combined: List[MultimodalInput] = []
    # Text preceding the image placeholder run, if any.
    if first > 0:
        combined.append(make_token_input(input_ids[:first].to(torch.long).tolist()))

    # Use C++ checked creator for images (handles 3D/4D, CHW/HWC, uint8/float32)
    combined.append(make_image_input(inputs["pixel_values"]))

    # Text following the image placeholder run, if any.
    if (last + 1) < input_ids.numel():
        combined.append(make_token_input(input_ids[last + 1 :].to(torch.long).tolist()))

    return combined
| 167 | + |
| 168 | + |
# Capture the native implementations BEFORE patching the class below.
# The wrappers must delegate to these saved references: after
# `setattr(MultimodalRunner, "generate", generate)`, an internal call to
# `runner.generate(...)` would resolve to the wrapper itself and recurse
# forever instead of reaching the underlying runner implementation.
_native_generate = MultimodalRunner.generate
_native_generate_text = MultimodalRunner.generate_text


def _coerce_to_multimodal_inputs(
    inputs: Union[Dict[str, Any], List[MultimodalInput]],
    image_token_id: Optional[int],
) -> List[MultimodalInput]:
    """Pass a list of MultimodalInput through unchanged; convert an HF dict."""
    if isinstance(inputs, dict):
        logging.info(
            "Input is a dict, assuming it's coming from HF AutoProcessor.apply_chat_template(). Converting to multimodal inputs."
        )
        return _hf_to_multimodal_inputs(inputs, image_token_id=image_token_id)
    return inputs


def generate(
    runner: MultimodalRunner,
    inputs: Union[Dict[str, Any], List[MultimodalInput]],
    config: GenerationConfig,
    image_token_id: Optional[int] = None,
    token_callback: Optional[Callable[[str], None]] = None,
    stats_callback: Optional[Callable[[Stats], None]] = None,
) -> None:
    """Generate using an HF dict by converting to multimodal inputs internally, or using a list of MultimodalInput."""
    converted = _coerce_to_multimodal_inputs(inputs, image_token_id)
    # Call the saved native method, not runner.generate (patched to this wrapper).
    _native_generate(runner, converted, config, token_callback, stats_callback)


def generate_text(
    runner: MultimodalRunner,
    inputs: Union[Dict[str, Any], List[MultimodalInput]],
    config: GenerationConfig,
    image_token_id: Optional[int] = None,
) -> str:
    """Generate using an HF dict by converting to multimodal inputs internally, or using a list of MultimodalInput."""
    converted = _coerce_to_multimodal_inputs(inputs, image_token_id)
    # Call the saved native method, not runner.generate_text (patched below).
    return _native_generate_text(runner, converted, config)


setattr(MultimodalRunner, "generate", generate)  # noqa B010
setattr(MultimodalRunner, "generate_text", generate_text)  # noqa B010
| 209 | + |
| 210 | + |
34 | 211 | __all__ = [ |
35 | 212 | "GenerationConfig", |
36 | 213 | "Image", |
|
0 commit comments