Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def __init__(
template=None,
dynamic_image_size=False,
use_thumbnail=False,
ps_version='v1',
pixel_unshuffle_version='v1',
min_dynamic_patch=1,
max_dynamic_patch=6,
**kwargs):
Expand Down Expand Up @@ -69,7 +69,7 @@ def __init__(
self.template = template
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.ps_version = ps_version # pixel shuffle version
self.pixel_unshuffle_version = pixel_unshuffle_version # pixel unshuffle version
self.min_dynamic_patch = min_dynamic_patch
self.max_dynamic_patch = max_dynamic_patch

Expand All @@ -79,7 +79,7 @@ def __init__(
self.llm_config.tie_word_embeddings = self.tie_word_embeddings

logger.info(f'vision_select_layer: {self.select_layer}')
logger.info(f'ps_version: {self.ps_version}')
logger.info(f'pixel_unshuffle_version: {self.pixel_unshuffle_version}')
logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}')
logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}')

Expand All @@ -102,7 +102,7 @@ def to_dict(self):
output['template'] = self.template
output['dynamic_image_size'] = self.dynamic_image_size
output['use_thumbnail'] = self.use_thumbnail
output['ps_version'] = self.ps_version
output['pixel_unshuffle_version'] = self.pixel_unshuffle_version
output['min_dynamic_patch'] = self.min_dynamic_patch
output['max_dynamic_patch'] = self.max_dynamic_patch

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,15 +56,15 @@ def __init__(self, config: InternVLChatConfig, vision_model=None, language_model
self.template = config.template
self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
self.downsample_ratio = config.downsample_ratio
self.ps_version = config.ps_version
self.pixel_unshuffle_version = config.pixel_unshuffle_version
self.llm_arch_name = config.llm_config.architectures[0]
# Enable Flash Attention if supported, otherwise fall back to eager attention.
use_flash_attn = use_flash_attn if has_flash_attn else False
config.vision_config.use_flash_attn = True if use_flash_attn else False
config.llm_config.attn_implementation = 'flash_attention_2' if use_flash_attn else 'eager'

logger.info(f'num_image_token: {self.num_image_token}')
logger.info(f'ps_version: {self.ps_version}')
logger.info(f'pixel_unshuffle_version: {self.pixel_unshuffle_version}')
if vision_model is not None:
self.vision_model = vision_model
else:
Expand Down Expand Up @@ -254,7 +254,7 @@ def forward(
attentions=outputs.attentions,
)

def pixel_shuffle(self, x, scale_factor=0.5):
def pixel_unshuffle(self, x, scale_factor=0.5):
n, w, h, c = x.size()
# N, W, H, C --> N, W, H * scale, C // scale
x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
Expand All @@ -263,8 +263,8 @@ def pixel_shuffle(self, x, scale_factor=0.5):
# N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
x = x.view(n, int(h * scale_factor), int(w * scale_factor),
int(c / (scale_factor * scale_factor)))
if self.ps_version == 'v1':
warnings.warn("In ps_version 'v1', the height and width have not been swapped back, "
if self.pixel_unshuffle_version == 'v1':
warnings.warn("In pixel_unshuffle_version 'v1', the height and width have not been swapped back, "
'which results in a transposed image.')
else:
x = x.permute(0, 2, 1, 3).contiguous()
Expand All @@ -285,7 +285,7 @@ def extract_feature(self, pixel_values):

h = w = int(vit_embeds.shape[1] ** 0.5)
vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
vit_embeds = self.pixel_unshuffle(vit_embeds, scale_factor=self.downsample_ratio)
vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
vit_embeds = self.mlp1(vit_embeds)
return vit_embeds
Expand Down
39 changes: 36 additions & 3 deletions internvl_chat/internvl/train/internvl_chat_finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,10 +145,15 @@ class ModelArguments:
default=0.0,
metadata={'help': 'Set the drop path rate for the ViT. Default is 0.'},
)
ps_version: Literal['v1', 'v2'] = field(
pixel_unshuffle_version: Literal['v1', 'v2'] = field(
default='v2',
metadata={'help': 'Specify the version of pixel shuffle implementation. Default is v2.'}
)
# Deprecated alias for pixel_unshuffle_version; keep temporarily for backward compatibility
ps_version: Optional[Literal['v1', 'v2']] = field(
default=None,
metadata={'help': 'DEPRECATED: use --pixel_unshuffle_version instead.'}
)
use_fast_tokenizer: bool = field(
default=False,
metadata={'help': 'Set to True to use the fast mode of the tokenizer.'}
Expand Down Expand Up @@ -827,6 +832,34 @@ def main():
handlers=[logging.StreamHandler(sys.stdout)],
)

# -------------------- begin: migrate deprecated args (ps_version -> pixel_unshuffle_version) --------------------

# !! `ps_version` IS ALREADY DEPRECATED; ONCE IT IS READY TO BE REMOVED, DELETE THE FOLLOWING BLOCK COMPLETELY !!

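# Look up the default of the new argument; a value equal to that default is treated as "not explicitly set".
# NOTE: this is a heuristic - an explicit `--pixel_unshuffle_version` equal to the default cannot be told apart from an omitted one.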
_default_new = ModelArguments.__dataclass_fields__['pixel_unshuffle_version'].default
new_is_default = (model_args.pixel_unshuffle_version == _default_new)

# If the old argument is provided, issue a deprecation warning and map it
if getattr(model_args, 'ps_version', None) is not None:
warnings.warn(
'`--ps_version` is deprecated and will be removed; use `--pixel_unshuffle_version` instead.',
DeprecationWarning
)
# If the new argument appears to have been explicitly set, and conflicts with the old value → take the new argument
if not new_is_default and model_args.ps_version != model_args.pixel_unshuffle_version:
logger.warning(
f'Both ps_version={model_args.ps_version} (deprecated) and '
f'pixel_unshuffle_version={model_args.pixel_unshuffle_version} are provided; '
f'using pixel_unshuffle_version.'
)
else:
# Else, map the old value to the new argument
model_args.pixel_unshuffle_version = model_args.ps_version

logger.info(f'Pixel unshuffle version = {model_args.pixel_unshuffle_version}')
# -------------------- end: migrate deprecated args ------------------------------------------------------------

if training_args.should_log:
# The default of training_args.log_level is passive, so we set log level at info here to have that default.
transformers.utils.logging.set_verbosity_info()
Expand Down Expand Up @@ -903,7 +936,7 @@ def main():
config.select_layer = model_args.vision_select_layer
config.dynamic_image_size = data_args.dynamic_image_size
config.use_thumbnail = data_args.use_thumbnail
config.ps_version = model_args.ps_version
config.pixel_unshuffle_version = model_args.pixel_unshuffle_version
config.min_dynamic_patch = data_args.min_dynamic_patch
config.max_dynamic_patch = data_args.max_dynamic_patch
model = InternVLChatModel.from_pretrained(
Expand Down Expand Up @@ -932,7 +965,7 @@ def main():
vision_config.to_dict(), llm_config.to_dict(), downsample_ratio=data_args.down_sample_ratio,
pad2square=data_args.pad2square, template=data_args.conv_style,
select_layer=model_args.vision_select_layer, dynamic_image_size=data_args.dynamic_image_size,
use_thumbnail=data_args.use_thumbnail, ps_version=model_args.ps_version,
use_thumbnail=data_args.use_thumbnail, pixel_unshuffle_version=model_args.pixel_unshuffle_version,
min_dynamic_patch=data_args.min_dynamic_patch, max_dynamic_patch=data_args.max_dynamic_patch)
internvl_chat_config.force_image_size = data_args.force_image_size
logger.info('Building InternVLChatModel...')
Expand Down
40 changes: 37 additions & 3 deletions internvl_chat/internvl/train/internvl_chat_mpo.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,10 +146,15 @@ class ModelArguments:
default=0.0,
metadata={'help': 'Set the drop path rate for the ViT. Default is 0.'},
)
ps_version: Literal['v1', 'v2'] = field(
pixel_unshuffle_version: Literal['v1', 'v2'] = field(
default='v2',
metadata={'help': 'Specify the version of pixel shuffle implementation. Default is v2.'}
)
# Deprecated alias for pixel_unshuffle_version; keep temporarily for backward compatibility
ps_version: Optional[Literal['v1', 'v2']] = field(
default=None,
metadata={'help': 'DEPRECATED: use --pixel_unshuffle_version instead.'}
)
use_fast_tokenizer: bool = field(
default=False,
metadata={'help': 'Set to True to use the fast mode of the tokenizer.'}
Expand Down Expand Up @@ -803,6 +808,35 @@ def main():
handlers=[logging.StreamHandler(sys.stdout)],
)

# -------------------- begin: migrate deprecated args (ps_version -> pixel_unshuffle_version) --------------------

# !! `ps_version` IS ALREADY DEPRECATED; ONCE IT IS READY TO BE REMOVED, DELETE THE FOLLOWING BLOCK COMPLETELY !!

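# Look up the default of the new argument; a value equal to that default is treated as "not explicitly set".
# NOTE: this is a heuristic - an explicit `--pixel_unshuffle_version` equal to the default cannot be told apart from an omitted one.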
_default_new = ModelArguments.__dataclass_fields__['pixel_unshuffle_version'].default
new_is_default = (model_args.pixel_unshuffle_version == _default_new)

# If the old argument is provided, issue a deprecation warning and map it
if getattr(model_args, 'ps_version', None) is not None:
warnings.warn(
'`--ps_version` is deprecated and will be removed; use `--pixel_unshuffle_version` instead.',
DeprecationWarning
)
# If the new argument appears to have been explicitly set, and conflicts with the old value → take the new argument
if not new_is_default and model_args.ps_version != model_args.pixel_unshuffle_version:
logger.warning(
f'Both ps_version={model_args.ps_version} (deprecated) and '
f'pixel_unshuffle_version={model_args.pixel_unshuffle_version} are provided; '
f'using pixel_unshuffle_version.'
)
else:
# Else, map the old value to the new argument
model_args.pixel_unshuffle_version = model_args.ps_version

logger.info(f'Pixel unshuffle version = {model_args.pixel_unshuffle_version}')
# -------------------- end: migrate deprecated args ------------------------------------------------------------


if training_args.should_log:
# The default of training_args.log_level is passive, so we set log level at info here to have that default.
transformers.utils.logging.set_verbosity_info()
Expand Down Expand Up @@ -873,7 +907,7 @@ def main():
config.select_layer = model_args.vision_select_layer
config.dynamic_image_size = data_args.dynamic_image_size
config.use_thumbnail = data_args.use_thumbnail
config.ps_version = model_args.ps_version
config.pixel_unshuffle_version = model_args.pixel_unshuffle_version
config.min_dynamic_patch = data_args.min_dynamic_patch
config.max_dynamic_patch = data_args.max_dynamic_patch
model = InternVLChatModel.from_pretrained(
Expand Down Expand Up @@ -904,7 +938,7 @@ def main():
vision_config.to_dict(), llm_config.to_dict(), downsample_ratio=data_args.down_sample_ratio,
pad2square=data_args.pad2square, template=data_args.conv_style,
select_layer=model_args.vision_select_layer, dynamic_image_size=data_args.dynamic_image_size,
use_thumbnail=data_args.use_thumbnail, ps_version=model_args.ps_version,
use_thumbnail=data_args.use_thumbnail, pixel_unshuffle_version=model_args.pixel_unshuffle_version,
min_dynamic_patch=data_args.min_dynamic_patch, max_dynamic_patch=data_args.max_dynamic_patch)
internvl_chat_config.force_image_size = data_args.force_image_size
logger.info('Building InternVLChatModel...')
Expand Down
41 changes: 38 additions & 3 deletions internvl_chat/internvl/train/internvl_chat_pretrain.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,10 +145,15 @@ class ModelArguments:
default=0.0,
metadata={'help': 'Set the drop path rate for the ViT. Default is 0.'},
)
ps_version: Literal['v1', 'v2'] = field(
pixel_unshuffle_version: Literal['v1', 'v2'] = field(
default='v2',
metadata={'help': 'Specify the version of pixel shuffle implementation. Default is v2.'}
)
# Deprecated alias for pixel_unshuffle_version; keep temporarily for backward compatibility
ps_version: Optional[Literal['v1', 'v2']] = field(
default=None,
metadata={'help': 'DEPRECATED: use --pixel_unshuffle_version instead.'}
)
use_fast_tokenizer: bool = field(
default=False,
metadata={'help': 'Set to True to use the fast mode of the tokenizer.'}
Expand Down Expand Up @@ -870,6 +875,36 @@ def main():
datefmt='%m/%d/%Y %H:%M:%S',
handlers=[logging.StreamHandler(sys.stdout)],
)


# -------------------- begin: migrate deprecated args (ps_version -> pixel_unshuffle_version) --------------------

# !! `ps_version` IS ALREADY DEPRECATED; ONCE IT IS READY TO BE REMOVED, DELETE THE FOLLOWING BLOCK COMPLETELY !!

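# Look up the default of the new argument; a value equal to that default is treated as "not explicitly set".
# NOTE: this is a heuristic - an explicit `--pixel_unshuffle_version` equal to the default cannot be told apart from an omitted one.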
_default_new = ModelArguments.__dataclass_fields__['pixel_unshuffle_version'].default
new_is_default = (model_args.pixel_unshuffle_version == _default_new)

# If the old argument is provided, issue a deprecation warning and map it
if getattr(model_args, 'ps_version', None) is not None:
warnings.warn(
'`--ps_version` is deprecated and will be removed; use `--pixel_unshuffle_version` instead.',
DeprecationWarning
)
# If the new argument appears to have been explicitly set, and conflicts with the old value → take the new argument
if not new_is_default and model_args.ps_version != model_args.pixel_unshuffle_version:
logger.warning(
f'Both ps_version={model_args.ps_version} (deprecated) and '
f'pixel_unshuffle_version={model_args.pixel_unshuffle_version} are provided; '
f'using pixel_unshuffle_version.'
)
else:
# Else, map the old value to the new argument
model_args.pixel_unshuffle_version = model_args.ps_version

logger.info(f'Pixel unshuffle version = {model_args.pixel_unshuffle_version}')
# -------------------- end: migrate deprecated args ------------------------------------------------------------


if training_args.should_log:
# The default of training_args.log_level is passive, so we set log level at info here to have that default.
Expand Down Expand Up @@ -947,7 +982,7 @@ def main():
config.select_layer = model_args.vision_select_layer
config.dynamic_image_size = data_args.dynamic_image_size
config.use_thumbnail = data_args.use_thumbnail
config.ps_version = model_args.ps_version
config.pixel_unshuffle_version = model_args.pixel_unshuffle_version
config.min_dynamic_patch = data_args.min_dynamic_patch
config.max_dynamic_patch = data_args.max_dynamic_patch
model = InternVLChatModel.from_pretrained(
Expand Down Expand Up @@ -976,7 +1011,7 @@ def main():
vision_config.to_dict(), llm_config.to_dict(), downsample_ratio=data_args.down_sample_ratio,
pad2square=data_args.pad2square, template=data_args.conv_style,
select_layer=model_args.vision_select_layer, dynamic_image_size=data_args.dynamic_image_size,
use_thumbnail=data_args.use_thumbnail, ps_version=model_args.ps_version,
use_thumbnail=data_args.use_thumbnail, pixel_unshuffle_version=model_args.pixel_unshuffle_version,
min_dynamic_patch=data_args.min_dynamic_patch, max_dynamic_patch=data_args.max_dynamic_patch)
internvl_chat_config.force_image_size = data_args.force_image_size
logger.info('Building InternVLChatModel...')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ srun -p ${PARTITION} \
--group_by_length True \
--dynamic_image_size False \
--use_thumbnail False \
--ps_version 'v1' \
--pixel_unshuffle_version 'v1' \
--deepspeed "zero_stage3_config_34b.json" \
--report_to "tensorboard" \
2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ torchrun \
--group_by_length True \
--dynamic_image_size False \
--use_thumbnail False \
--ps_version 'v1' \
--pixel_unshuffle_version 'v1' \
--deepspeed "zero_stage3_config_34b.json" \
--report_to "tensorboard" \
2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ torchrun \
--group_by_length True \
--dynamic_image_size True \
--use_thumbnail True \
--ps_version 'v2' \
--pixel_unshuffle_version 'v2' \
--deepspeed "zero_stage1_config.json" \
--report_to "tensorboard" \
2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ torchrun \
--group_by_length True \
--dynamic_image_size True \
--use_thumbnail True \
--ps_version 'v2' \
--pixel_unshuffle_version 'v2' \
--deepspeed "zero_stage1_config.json" \
--report_to "tensorboard" \
2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ torchrun \
--group_by_length True \
--dynamic_image_size True \
--use_thumbnail True \
--ps_version 'v2' \
--pixel_unshuffle_version 'v2' \
--deepspeed "zero_stage3_config.json" \
--report_to "tensorboard" \
2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ torchrun \
--group_by_length True \
--dynamic_image_size True \
--use_thumbnail True \
--ps_version 'v2' \
--pixel_unshuffle_version 'v2' \
--deepspeed "zero_stage3_config.json" \
--report_to "tensorboard" \
2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ torchrun \
--group_by_length True \
--dynamic_image_size True \
--use_thumbnail True \
--ps_version 'v2' \
--pixel_unshuffle_version 'v2' \
--deepspeed "zero_stage1_config.json" \
--report_to "tensorboard" \
2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ torchrun \
--group_by_length True \
--dynamic_image_size True \
--use_thumbnail True \
--ps_version 'v2' \
--pixel_unshuffle_version 'v2' \
--deepspeed "zero_stage1_config.json" \
--report_to "tensorboard" \
2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ srun -p ${PARTITION} \
--group_by_length True \
--dynamic_image_size True \
--use_thumbnail True \
--ps_version 'v2' \
--pixel_unshuffle_version 'v2' \
--deepspeed "zero_stage3_config_34b.json" \
--report_to "tensorboard" \
2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ srun -p ${PARTITION} \
--group_by_length False \
--dynamic_image_size True \
--use_thumbnail True \
--ps_version 'v2' \
--pixel_unshuffle_version 'v2' \
--deepspeed "zero_stage3_config_34b.json" \
--report_to "tensorboard" \
2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ srun -p ${PARTITION} \
--group_by_length True \
--dynamic_image_size True \
--use_thumbnail True \
--ps_version 'v2' \
--pixel_unshuffle_version 'v2' \
--deepspeed "zero_stage1_config.json" \
--report_to "tensorboard" \
2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
Loading