Skip to content

Commit 87015b1

Browse files
zucchini-nlp authored and BernardZach committed
LLaVA OV: fix unpadding precision (huggingface#34779)
* fix
* propagate
* type check
1 parent ae1497a commit 87015b1

File tree

3 files changed

+10
-3
lines changed

3 files changed

+10
-3
lines changed

src/transformers/models/llava_next/processing_llava_next.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,9 @@ def __call__(
163163
for sample in text:
164164
while self.image_token in sample:
165165
image_size = next(image_sizes)
166-
orig_height, orig_width = image_size
166+
if not isinstance(image_size, (list, tuple)):
167+
# cast to list to avoid numerical precision errors when calculating unpadding
168+
orig_height, orig_width = image_size.tolist()
167169
num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
168170
if self.vision_feature_select_strategy == "default":
169171
num_image_tokens -= self.num_additional_image_tokens

src/transformers/models/llava_next_video/processing_llava_next_video.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,9 @@ def __call__(
190190
for sample in text:
191191
while self.image_token in sample:
192192
image_size = next(image_sizes)
193-
orig_height, orig_width = image_size
193+
if not isinstance(image_size, (list, tuple)):
194+
# cast to list to avoid numerical precision errors when calculating unpadding
195+
orig_height, orig_width = image_size.tolist()
194196
num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
195197
if self.vision_feature_select_strategy == "default":
196198
num_image_tokens -= self.num_additional_image_tokens

src/transformers/models/llava_onevision/processing_llava_onevision.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,10 @@ def _expand_image_tokens(
188188
for sample in text:
189189
while special_token in sample:
190190
image_size_list = next(image_sizes)
191-
orig_height, orig_width = image_size_list[0] if num_frames != 1 else image_size_list
191+
original_size = image_size_list[0] if num_frames != 1 else image_size_list
192+
if not isinstance(original_size, (list, tuple)):
193+
# cast to list to avoid numerical precision errors when calculating unpadding
194+
orig_height, orig_width = original_size.tolist()
192195
num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
193196
if self.vision_feature_select_strategy == "default":
194197
num_image_tokens -= 1

0 commit comments

Comments
 (0)