@@ -47,7 +47,8 @@ class LlavaNextImagePixelInputs(TypedDict):
     """
     Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`

-    Note that `num_patches` may be different for each batch.
+    Note that `num_patches` may be different for each batch, in which case
+    the data is passed as a list instead of a batched tensor.
     """

     image_sizes: NotRequired[torch.Tensor]
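As a note on the shape convention above: when every image in a batch yields the same number of patches, the data can stay a single 5-D tensor; otherwise it has to become a list of 4-D tensors. A minimal sketch (the 336x336 patch size is an assumption; the real value comes from `config.vision_config.image_size`):

import torch

H = W = 336  # assumed patch resolution; the actual value is config-driven

# Same patch count for every image -> one batched 5-D tensor,
# shape (batch_size, 1 + num_patches, num_channels, height, width).
batched = torch.rand(2, 1 + 4, 3, H, W)

# Different patch counts per image -> a list of 4-D tensors instead.
as_list = [
    torch.rand(1 + 4, 3, H, W),  # image with 4 grid patches + base image
    torch.rand(1 + 9, 3, H, W),  # image with 9 grid patches + base image
]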
@@ -255,40 +256,20 @@ def _validate_pixel_values(
         self, data: Union[torch.Tensor, List[torch.Tensor]]
     ) -> Union[torch.Tensor, List[torch.Tensor]]:

-        def _validate_shape(data: torch.Tensor):
-
-            dim = data.dim()
-            height = width = self.config.vision_config.image_size
-            # All 4d image tensors have the same number of patches,
-            # so data is a 5d batch of these tensors
-            if dim == 5:
-                if list(data.shape)[2:] != [
-                        3, self.config.vision_config.image_size,
-                        self.config.vision_config.image_size
-                ]:
-                    raise ValueError(
-                        "Expected pixel value tensor in shape of: (batch size, "
-                        f"patch number, 3, {height}, {width}), got {data.shape}"
-                    )
-
-            # 4d image tensors have different number of patches,
-            # so data is each individual tensor.
-            elif dim == 4:
-                if list(data.shape)[1:] != [
-                        3, self.config.vision_config.image_size,
-                        self.config.vision_config.image_size
-                ]:
-                    raise ValueError(
-                        "Expected pixel value tensor in shape of: (patch "
-                        f"number, 3, {height}, {width}), got {data.shape}")
-            else:
+        h = w = self.config.vision_config.image_size
+        expected_dims = (3, h, w)
+
+        def _validate_shape(d: torch.Tensor):
+            actual_dims = tuple(d.shape[1:])
+
+            if actual_dims != expected_dims:
+                expected_expr = ("num_patches", *map(str, expected_dims))
                 raise ValueError(
-                    f"Invalid pixel value tensor of shape {data.shape}")
+                    "The expected shape of pixel values in each batch element "
+                    f"is {expected_expr}. You supplied {tuple(d.shape)}.")

-        if isinstance(data, torch.Tensor):
-            _validate_shape(data)
-        else:
-            [_validate_shape(d) for d in data]
+        for d in data:
+            _validate_shape(d)

         return data

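For reference, the new validation logic can be exercised outside the model class with a small standalone sketch (the 336x336 size and the function name are stand-ins for `config.vision_config.image_size` and the private method):

from typing import List

import torch


def validate_pixel_values(data: List[torch.Tensor],
                          image_size: int = 336) -> List[torch.Tensor]:
    # Mirrors the refactored _validate_pixel_values: every batch element
    # must have shape (num_patches, 3, image_size, image_size).
    expected_dims = (3, image_size, image_size)

    for d in data:
        actual_dims = tuple(d.shape[1:])
        if actual_dims != expected_dims:
            expected_expr = ("num_patches", *map(str, expected_dims))
            raise ValueError(
                "The expected shape of pixel values in each batch element "
                f"is {expected_expr}. You supplied {tuple(d.shape)}.")

    return data


validate_pixel_values([torch.rand(5, 3, 336, 336)])    # passes
# validate_pixel_values([torch.rand(5, 3, 224, 224)])  # raises ValueError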
@@ -464,18 +445,33 @@ def forward(

         One key thing to understand is the `input_ids` already accounts for the
         positions of the to-be-inserted image embeddings.
+
         Concretely, consider a text prompt:
-        "<image>\nUSER: What's the content of the image?\nASSISTANT:".
+        `"A chat between a curious human and an artificial intelligence
+        assistant. The assistant gives helpful, detailed, and polite answers to
+        the human's questions.
+        USER: <image>\\nWhat is shown in this image? ASSISTANT:"`.
+
         Tokenizer outputs:
-        [1, 32000, 29871, 13, 11889, 29901, 1724, 29915, 29879, 278,
-        2793, 310, 278, 1967, 29973, 13, 22933, 9047, 13566, 29901].
-        The to-be-inserted image has a size of 576 (24 * 24) along the context
-        length dimension.
-        `input_ids` is thus [1, 32000, ..., 32000, 29871, 13, 11889, 29901,
-        1724, 29915, 29879, 278, 2793, 310, 278, 1967, 29973, 13, 22933,
-        9047, 13566, 29901].
-        There will be 576 `32000` in the `input_ids`.
-        (32000 is the token id for `<image>`.)
+        `[1, 319, 13563, 1546, 263, 12758, 5199, 322, 385, 23116, 21082, 20255,
+        29889, 450, 20255, 4076, 8444, 29892, 13173, 29892, 322, 1248, 568,
+        6089, 304, 278, 5199, 29915, 29879, 5155, 29889, 3148, 1001, 29901,
+        29871, 32000, 13, 5618, 338, 4318, 297, 445, 1967, 29973, 319, 1799,
+        9047, 13566, 29901]`.
+
+        To reserve space in KV cache, we have to insert placeholder tokens
+        before they are inputted to the model, so the input processor prepends
+        additional image tokens (denoted as `32000`), resulting in:
+        `[1, 319, 13563, 1546, 263, 12758, 5199, 322, 385, 23116, 21082, 20255,
+        29889, 450, 20255, 4076, 8444, 29892, 13173, 29892, 322, 1248, 568,
+        6089, 304, 278, 5199, 29915, 29879, 5155, 29889, 3148, 1001, 29901,
+        29871, 32000, ..., 32000, 13, 5618, 338, 4318, 297, 445, 1967, 29973,
+        319, 1799, 9047, 13566, 29901]`.
+
+        Unlike in LLaVA-1.5, the number of image tokens inputted to the language
+        model depends on the original size of the input image. Including the
+        original image token in the input, the required number of image tokens
+        is given by :func:`get_llava_next_image_feature_size`.

         This way, the `positions` and `attn_metadata` are consistent
         with the `input_ids`.
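A toy illustration of the placeholder expansion described above (the helper and the count of 1176 tokens are hypothetical; in the actual code the count comes from :func:`get_llava_next_image_feature_size` and the expansion is performed by the input processor):

from typing import List

IMAGE_TOKEN_ID = 32000  # `<image>` token id used in the example above


def expand_image_tokens(input_ids: List[int],
                        num_image_tokens: int) -> List[int]:
    # Repeat each image token so enough KV-cache slots are reserved
    # for the image embeddings that will later replace them.
    expanded: List[int] = []
    for tok in input_ids:
        if tok == IMAGE_TOKEN_ID:
            expanded.extend([IMAGE_TOKEN_ID] * num_image_tokens)
        else:
            expanded.append(tok)
    return expanded


# Abbreviated slice of the tokenized prompt shown above.
prompt_ids = [3148, 1001, 29901, 29871, 32000, 13, 5618, 338, 4318, 297]
print(len(expand_image_tokens(prompt_ids, 1176)))  # 10 - 1 + 1176 = 1185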
@@ -484,15 +480,10 @@ def forward(
             input_ids: Flattened (concatenated) input_ids corresponding to a
                 batch.
             pixel_values: The pixels in each grid patch for each input image.
-                Expects a batch with shape `[1, num_patches, 3, h, w]`.
             image_sizes: The original `(height, width)` for each input image.
-                Expects a batch with shape `[1, 2]`.
-
+
         See also:
-            Each input maps to huggingface implementation, as follows:
-
-            - `pixel_values`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava_next/modeling_llava_next.py#L690
-            - `image_sizes`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava_next/modeling_llava_next.py#L691
+            :class:`LlavaNextImageInputs`
         """
         image_input = self._parse_and_validate_image_input(**kwargs)

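To make the documented argument shapes concrete, a dummy set of multi-modal keyword arguments matching the docstring might look like this (illustrative sizes only; in vLLM these tensors come from the multi-modal input pipeline rather than being built by hand):

import torch

# One image split into a base tile plus 4 grid patches of 3x336x336 each.
pixel_values = [torch.rand(1 + 4, 3, 336, 336)]

# Original (height, width) of each input image, one row per image.
image_sizes = torch.tensor([[683, 1024]])

mm_kwargs = {"pixel_values": pixel_values, "image_sizes": image_sizes}
# These are what _parse_and_validate_image_input(**kwargs) eventually consumes.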