@@ -229,17 +229,17 @@ def bytes_to_unicode():
229229
230230
231231
232- def get_unsigned_vision_feature_layers (v_hparams ):
232+ def get_non_negative_vision_feature_layers (v_hparams ):
233233 """
234234 Determine the vision feature layer(s) for the llava model, which are indices into the
235235 hidden states of the visual encoder. Note that the hidden states array generally takes the
236236 form:
237237
238238 [<emb input>, <output of enc block 0>, ... <output of enc block num_hidden_layers - 1>]
239239
240- so positive feature indices should be offset as n+1 to get the output of encoder block n.
241- We convert all vision feature layers to unsigned ints so that -1 can be used in the model
242- as an unset value. If no vision feature layer is found, we leave it unset.
240+ so the output of encoder block n is found at feature index n+1. Negative indices count
241+ back from the end of this array. We convert all vision feature layers to non-negative
242+ indices so that -1 can be used in the model as an unset value. If no vision feature layer is found, we leave it unset.
243243 """
244244 num_hidden_layers = v_hparams ["num_hidden_layers" ]
245245 to_uint = lambda layer_idx : layer_idx if layer_idx >= 0 else num_hidden_layers + layer_idx + 1
@@ -257,7 +257,7 @@ def get_unsigned_vision_feature_layers(v_hparams):
257257 return [to_uint (feature_layer ) for feature_layer in feature_layers ]
258258
259259if has_vision_encoder :
260- feature_layers = get_unsigned_vision_feature_layers (v_hparams )
260+ feature_layers = get_non_negative_vision_feature_layers (v_hparams )
261261
262262 # Siglip does not have a visual projector; set projection dim to 0
263263 if args .clip_model_is_siglip :
0 commit comments