
Commit 252364f

[Cohere2Vision] remove unused arg (#40103)
* remove unused arg
* remove the arg from test as well
1 parent e446372 commit 252364f

File tree

3 files changed: +10 −56 lines changed


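The commit is a pure signature cleanup: `image_num_patches` was accepted but never used, so it is removed from `get_image_features` and `forward`, and from the test inputs. Below is a minimal caller-side sketch of the change; the checkpoint id is a placeholder and the tensor shape simply follows the docstring in the diff, so treat everything except the dropped keyword argument as illustrative.

import torch
from transformers import AutoModelForImageTextToText

# Placeholder checkpoint id -- substitute a real Cohere2Vision checkpoint.
model = AutoModelForImageTextToText.from_pretrained("<cohere2-vision-checkpoint>")

# Dummy input; shape per the docstring: (batch_size, num_patches, channels, height, width).
pixel_values = torch.randn(1, 4, 3, 364, 364).to(model.device, model.dtype)

# Before this commit:  model.get_image_features(pixel_values, image_num_patches=torch.tensor([4]))
# After this commit, the unused argument is gone:
image_features = model.get_image_features(pixel_values)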
src/transformers/models/cohere2_vision/modeling_cohere2_vision.py

Lines changed: 4 additions & 26 deletions
@@ -174,19 +174,13 @@ def set_decoder(self, decoder):
    def get_decoder(self):
        return self.language_model

-    def get_image_features(
-        self,
-        pixel_values: torch.FloatTensor,
-        image_num_patches: torch.Tensor,
-    ):
+    def get_image_features(self, pixel_values: torch.FloatTensor):
        """
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`)
                The tensors corresponding to the input images.
-            image_num_patches (`torch.Tensor` of shape `(num_images)`)
-                Number of patches for each image.
        Returns:
            image_features (List[`torch.Tensor`]): List of image feature tensor, each contains all the visual feature of all patches
            and are of shape `(num_patches, image_length, embed_dim)`).
@@ -227,7 +221,6 @@ def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
-        image_num_patches: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
@@ -236,18 +229,14 @@ def forward(
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, Cohere2VisionModelOutputWithPast]:
-        r"""
-        image_num_patches (`torch.Tensor` of shape `(num_images,)`):
-            Number of patches per input image.
-        """
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
-            image_features = self.get_image_features(pixel_values, image_num_patches=image_num_patches)
+            image_features = self.get_image_features(pixel_values)
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
@@ -303,15 +292,8 @@ def set_decoder(self, decoder):
    def get_decoder(self):
        return self.model.get_decoder()

-    def get_image_features(
-        self,
-        pixel_values: torch.FloatTensor,
-        image_num_patches: torch.Tensor,
-    ):
-        return self.model.get_image_features(
-            pixel_values=pixel_values,
-            image_num_patches=image_num_patches,
-        )
+    def get_image_features(self, pixel_values: torch.FloatTensor):
+        return self.model.get_image_features(pixel_values=pixel_values)

    # Make modules available throught conditional class for BC
    @property
@@ -332,7 +314,6 @@ def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
-        image_num_patches: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
@@ -345,8 +326,6 @@ def forward(
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, Cohere2VisionCausalLMOutputWithPast]:
        r"""
-        image_num_patches (`torch.Tensor` of shape `(num_images,)`):
-            Number of patches per input image.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
@@ -384,7 +363,6 @@ def forward(
        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
-            image_num_patches=image_num_patches,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
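One practical consequence for downstream code, inferred from the new signature rather than stated in the commit: `get_image_features` no longer declares `image_num_patches` and takes no `**kwargs`, so keyword calls that still pass it will raise a `TypeError` and must be updated.

# `model` and `pixel_values` as in the illustrative sketch above.
try:
    model.get_image_features(pixel_values, image_num_patches=torch.tensor([4]))
except TypeError as err:
    print(err)  # unexpected keyword argument 'image_num_patches'

image_features = model.get_image_features(pixel_values)  # the only supported call now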

src/transformers/models/cohere2_vision/modular_cohere2_vision.py

Lines changed: 4 additions & 26 deletions
@@ -94,19 +94,13 @@ class Cohere2VisionCausalLMOutputWithPast(AyaVisionCausalLMOutputWithPast):
class Cohere2VisionModel(AyaVisionModel):
    _checkpoint_conversion_mapping = {}

-    def get_image_features(
-        self,
-        pixel_values: torch.FloatTensor,
-        image_num_patches: torch.Tensor,
-    ):
+    def get_image_features(self, pixel_values: torch.FloatTensor):
        """
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`)
                The tensors corresponding to the input images.
-            image_num_patches (`torch.Tensor` of shape `(num_images)`)
-                Number of patches for each image.
        Returns:
            image_features (List[`torch.Tensor`]): List of image feature tensor, each contains all the visual feature of all patches
            and are of shape `(num_patches, image_length, embed_dim)`).
@@ -123,7 +117,6 @@ def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
-        image_num_patches: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
@@ -132,18 +125,14 @@ def forward(
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, Cohere2VisionModelOutputWithPast]:
-        r"""
-        image_num_patches (`torch.Tensor` of shape `(num_images,)`):
-            Number of patches per input image.
-        """
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
-            image_features = self.get_image_features(pixel_values, image_num_patches=image_num_patches)
+            image_features = self.get_image_features(pixel_values)
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
@@ -172,23 +161,15 @@ def forward(
class Cohere2VisionForConditionalGeneration(AyaVisionForConditionalGeneration):
    _checkpoint_conversion_mapping = {}

-    def get_image_features(
-        self,
-        pixel_values: torch.FloatTensor,
-        image_num_patches: torch.Tensor,
-    ):
-        return self.model.get_image_features(
-            pixel_values=pixel_values,
-            image_num_patches=image_num_patches,
-        )
+    def get_image_features(self, pixel_values: torch.FloatTensor):
+        return self.model.get_image_features(pixel_values=pixel_values)

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
-        image_num_patches: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
@@ -201,8 +182,6 @@ def forward(
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, Cohere2VisionCausalLMOutputWithPast]:
        r"""
-        image_num_patches (`torch.Tensor` of shape `(num_images,)`):
-            Number of patches per input image.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
@@ -240,7 +219,6 @@ def forward(
        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
-            image_num_patches=image_num_patches,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,

tests/models/cohere2_vision/test_modeling_cohere2_vision.py

Lines changed: 2 additions & 4 deletions
@@ -120,13 +120,12 @@ def get_config(self):
    def prepare_config_and_inputs(self):
        config = self.get_config()
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-        image_num_patches = torch.tensor([1] * self.batch_size).to(torch_device)

-        return config, pixel_values, image_num_patches
+        return config, pixel_values

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
-        config, pixel_values, image_num_patches = config_and_inputs
+        config, pixel_values = config_and_inputs
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
        attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
        input_ids[input_ids == self.image_token_id] = self.pad_token_id
@@ -136,7 +135,6 @@ def prepare_config_and_inputs_for_common(self):
            "pixel_values": pixel_values,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
-            "image_num_patches": image_num_patches,
        }
        return config, inputs_dict

