Commit 6ac30ff

cheng221, nemonameless, luyao-cv authored
support internvl2.5-mpo (#1008)
Co-authored-by: nifeng <[email protected]>
Co-authored-by: luyao-cv <[email protected]>
1 parent 270d161 commit 6ac30ff

File tree

4 files changed: +115, -38 lines changed

paddlemix/examples/internvl2/README.md

Lines changed: 6 additions & 2 deletions
@@ -27,11 +27,15 @@
 |--------------------|
 | OpenGVLab/InternVL2-1B |
 | OpenGVLab/InternVL2_5-1B |
+| OpenGVLab/InternVL2_5-1B-MPO |
 | OpenGVLab/InternVL2-2B |
 | OpenGVLab/InternVL2_5-2B |
+| OpenGVLab/InternVL2_5-2B-MPO |
 | OpenGVLab/InternVL2_5-4B |
+| OpenGVLab/InternVL2_5-4B-MPO |
 | OpenGVLab/InternVL2-8B |
 | OpenGVLab/InternVL2_5-8B |
+| OpenGVLab/InternVL2_5-8B-MPO |
 | OpenGVLab/InternVL2-26B |
 | OpenGVLab/InternVL2-40B |
 | OpenGVLab/InternVL2-8B-MPO |
@@ -198,7 +202,7 @@ sh paddlemix/examples/internvl2/shell/internvl2.0/2nd_finetune/internvl2_1b_qwen
 
 ## 多卡
 # 2B
-sh paddlemix/examples/internvl2/shell/internvl2.0/2nd_finetune/internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_full.sh
+sh paddlemix/examples/internvl2/shell/internvl2.0/2nd_finetune/internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_full.sh
 
 ## 多卡
 # 8B
@@ -211,7 +215,7 @@ sh paddlemix/examples/internvl2/shell/internvl2.0/2nd_finetune/internvl2_8b_inte
 
 ```bash
 python paddlemix/examples/internvl2/chat_demo.py \
-    --model_name_or_path "your_checkpoints" \
+    --model_name_or_path "your_checkpoint" \
     --image_path 'paddlemix/demo_images/examples_image1.jpg' \
     --text "Please describe this image in detail."
 ```
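
The new `*-MPO` entries are meant to be used like the existing checkpoints: substitute one of the listed names for `"your_checkpoint"` in the README invocation above. A minimal sketch driving that documented command from Python (the checkpoint name is one of the new table entries; automatic weight download under that name is assumed):

```python
# Hedged sketch: run the chat demo from the README with one of the newly added MPO names.
# The flags mirror the README snippet; --dtype is defined in chat_demo.py's argparse and is optional.
import subprocess

subprocess.run(
    [
        "python", "paddlemix/examples/internvl2/chat_demo.py",
        "--model_name_or_path", "OpenGVLab/InternVL2_5-1B-MPO",  # any *-MPO entry from the table above
        "--image_path", "paddlemix/demo_images/examples_image1.jpg",
        "--text", "Please describe this image in detail.",
        "--dtype", "bfloat16",  # optional; choices are float32 / bfloat16 / float16
    ],
    check=True,
)
```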

paddlemix/examples/internvl2/chat_demo.py

Lines changed: 11 additions & 17 deletions
@@ -29,7 +29,6 @@
 IMAGENET_STD = (0.229, 0.224, 0.225)
 
 
-
 def check_dtype_compatibility():
     """
     检查当前环境下可用的数据类型
@@ -44,15 +43,15 @@ def check_dtype_compatibility():
     if gpu_arch is None:
         print("Unable to determine GPU architecture, falling back to float32")
         return paddle.float32
-
+
     major, minor = gpu_arch
-    compute_capability = major + minor/10
+    compute_capability = major + minor / 10
     print(f"GPU compute capability: {compute_capability}")
-
+
     try:
         # 测试bfloat16兼容性
         if compute_capability >= 8.0:  # Ampere及更新架构
-            test_tensor = paddle.zeros([2, 2], dtype='bfloat16')
+            test_tensor = paddle.zeros([2, 2], dtype="bfloat16")
             test_op = paddle.matmul(test_tensor, test_tensor)
             print("bfloat16 is supported and working")
             return paddle.bfloat16
@@ -62,7 +61,7 @@ def check_dtype_compatibility():
     try:
         # 测试float16兼容性
         if compute_capability >= 5.3:  # Maxwell及更新架构
-            test_tensor = paddle.zeros([2, 2], dtype='float16')
+            test_tensor = paddle.zeros([2, 2], dtype="float16")
             test_op = paddle.matmul(test_tensor, test_tensor)
             print("float16 is supported and working")
             return paddle.float16
@@ -99,7 +98,7 @@ def load_tokenizer(model_path):
     import re
 
     match = re.search(r"\d+B", model_path)
-    model2_5 = "InternVL2_5" in model_path 
+    model2_5 = "InternVL2_5" in model_path
     if match:
         model_size = match.group()
     else:
@@ -137,7 +136,7 @@ def main(args):
     print("len(tokenizer): ", len(tokenizer))
 
     model = InternVLChatModel.from_pretrained(MODEL_PATH, dtype=args.dtype).eval()
-    generation_config = dict(max_new_tokens=1024, do_sample=False, top_p=0.01)
+    generation_config = dict(max_new_tokens=1024, do_sample=False)
 
     with paddle.no_grad():
         response, history = model.chat(
@@ -157,11 +156,7 @@
 parser.add_argument("--image_path", type=str, default=None)
 parser.add_argument("--text", type=str, default="Please describe the image shortly.", required=True)
 parser.add_argument(
-    "--dtype",
-    type=str,
-    default="float16",
-    choices=["float32", "bfloat16", "float16"],
-    help="Model dtype"
+    "--dtype", type=str, default="float16", choices=["float32", "bfloat16", "float16"], help="Model dtype"
 )
 args = parser.parse_args()
 
@@ -171,11 +166,10 @@
     args.dtype = paddle.float16
 else:
     args.dtype = paddle.float32
-
 
 # 检查环境支持的dtype并设置
 available_dtype = check_dtype_compatibility()
-
+
 # 如果用户指定了dtype,尝试使用用户指定的类型
 if args.dtype == "bfloat16":
     desired_dtype = paddle.bfloat16
@@ -192,5 +186,5 @@
 args.dtype = desired_dtype
 
 print(f"Using dtype: {args.dtype}")
-
-main(args)
+
+main(args)
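
The changes to `check_dtype_compatibility()` above are formatting-only, but the probe it performs is the interesting part: allocate a tiny tensor in the candidate dtype and run a matmul, falling back if that fails. A standalone sketch of just that probe (the GPU-architecture pre-check and the `gpu_arch` helper live elsewhere in chat_demo.py and are omitted here):

```python
# Minimal sketch of the dtype probe used by check_dtype_compatibility() in chat_demo.py.
import paddle


def probe_dtype(dtype_name: str) -> bool:
    """Return True if a tiny matmul in the given dtype succeeds on the current device."""
    try:
        t = paddle.zeros([2, 2], dtype=dtype_name)
        paddle.matmul(t, t)  # raises if the dtype/kernel is unsupported here
        return True
    except Exception:
        return False


for name in ("bfloat16", "float16", "float32"):
    print(name, "supported:", probe_dtype(name))
```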

paddlemix/models/internvl2/internlm2/tokenizer_internlm2.py

Lines changed: 93 additions & 16 deletions
@@ -18,14 +18,17 @@
 
 """Tokenization classes for InternLM."""
 import os
+import re
 from shutil import copyfile
 from typing import Any, Dict, List, Optional, Tuple
 
 import sentencepiece as spm
 from paddlenlp.transformers.tokenizer_utils import PretrainedTokenizer
+from paddlenlp.transformers.tokenizer_utils_base import AddedToken, TextInput
+
 from paddlemix.utils.log import logger
 
-VOCAB_FILES_NAMES = {'vocab_file': './tokenizer.model'}
+VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
 # VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
 
 # Modified from transformers.model.llama.tokenization_llama.LlamaTokenizer
@@ -38,18 +41,18 @@ class InternLM2Tokenizer(PretrainedTokenizer):
             Path to the vocabulary file.
     """
 
-    resource_files_names = VOCAB_FILES_NAMES # vocab_files_names in torch
-    pretrained_resource_files_map = {} # pretrained_vocab_files_map in torch
-    model_input_names = ['input_ids', 'attention_mask']
-    _auto_class = 'AutoTokenizer'
+    resource_files_names = VOCAB_FILES_NAMES  # vocab_files_names in torch
+    pretrained_resource_files_map = {}  # pretrained_vocab_files_map in torch
+    model_input_names = ["input_ids", "attention_mask"]
+    _auto_class = "AutoTokenizer"
 
     def __init__(
         self,
         vocab_file,
-        unk_token='<unk>',
-        bos_token='<s>',
-        eos_token='</s>',
-        pad_token='</s>',
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token="</s>",
         sp_model_kwargs: Optional[Dict[str, Any]] = None,
         add_bos_token=True,
         add_eos_token=False,
@@ -78,7 +81,7 @@ def __init__(
     def no_prefix_space_tokens(self):
         if self._no_prefix_space_tokens is None:
            vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
-            self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith('▁')}
+            self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
         return self._no_prefix_space_tokens
 
     @property
@@ -115,20 +118,20 @@ def _convert_id_to_token(self, index):
 
     def _maybe_add_prefix_space(self, tokens, decoded):
         if tokens and tokens[0] not in self.no_prefix_space_tokens:
-            return ' ' + decoded
+            return " " + decoded
         else:
             return decoded
 
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (string) in a single string."""
         current_sub_tokens = []
-        out_string = ''
+        out_string = ""
         prev_is_special = False
         for token in tokens:
             # make sure that special tokens are not decoded using sentencepiece model
             if token in self.all_special_tokens:
                 if not prev_is_special:
-                    out_string += ' '
+                    out_string += " "
                 out_string += self.sp_model.decode(current_sub_tokens) + token
                 prev_is_special = True
                 current_sub_tokens = []
@@ -152,16 +155,16 @@ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None)
             `Tuple(str)`: Paths to the files saved.
         """
         if not os.path.isdir(save_directory):
-            logger.error(f'Vocabulary path ({save_directory}) should be a directory')
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
             return
         out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + '-' if filename_prefix else '') + VOCAB_FILES_NAMES['vocab_file']
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
         )
 
         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
         elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, 'wb') as fi:
+            with open(out_vocab_file, "wb") as fi:
                 content_spiece_model = self.sp_model.serialized_model_proto()
                 fi.write(content_spiece_model)
 
@@ -231,3 +234,77 @@ def create_token_type_ids_from_sequences(
         if token_ids_1 is None:
             return len(token_ids_0 + eos) * [0]
         return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
+
+    def tokenize(self, text: TextInput, **kwargs) -> List[str]:
+        """
+        Converts a string into a sequence of tokens, using the tokenizer.
+
+        Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
+        (BPE/SentencePieces/WordPieces). Takes care of added tokens.
+
+        Args:
+            text (`str`):
+                The sequence to be encoded.
+            **kwargs (additional keyword arguments):
+                Passed along to the model-specific `prepare_for_tokenization` preprocessing method.
+
+        Returns:
+            `List[str]`: The list of tokens.
+        """
+        split_special_tokens = kwargs.pop("split_special_tokens", self.split_special_tokens)
+
+        text, kwargs = self.prepare_for_tokenization(text, **kwargs)
+
+        # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
+        all_special_tokens_extended = dict(
+            (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
+        )
+
+        if hasattr(self, "do_lower_case") and self.do_lower_case:
+            # convert non-special tokens to lowercase. Might be super slow as well?
+            escaped_special_toks = [
+                re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens)
+            ]
+            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
+            text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
+
+        if split_special_tokens:
+            no_split_token = []
+            tokens = [text]
+        else:
+            no_split_token = set(self.unique_no_split_tokens)  # don't split on any of the added tokens
+            # "This is something<special_token_1> else"
+            tokens = self.tokens_trie.split(text)
+
+        # ["This is something", "<special_token_1>", " else"]
+        for i, token in enumerate(tokens):
+            if token in no_split_token:
+                tok_extended = all_special_tokens_extended.get(token, None)
+                left = tokens[i - 1] if i > 0 else None
+                right = tokens[i + 1] if i < len(tokens) - 1 else None
+                if isinstance(tok_extended, AddedToken):
+                    if tok_extended.rstrip and right:
+                        # A bit counter-intuitive but we strip the left of the string
+                        # since tok_extended.rstrip means the special token is eating all white spaces on its right
+                        tokens[i + 1] = right.lstrip()
+                    # Strip white spaces on the left
+                    if tok_extended.lstrip and left:
+                        tokens[i - 1] = left.rstrip()  # Opposite here
+                    if tok_extended.single_word and left and left[-1] != " ":
+                        tokens[i - 1] += token
+                        tokens[i] = ""
+                    elif tok_extended.single_word and right and right[0] != " ":
+                        tokens[i + 1] = token + tokens[i + 1]
+                        tokens[i] = ""
+        # ["This is something", "<special_token_1>", "else"]
+        tokenized_text = []
+        for token in tokens:
+            # Need to skip eventual empty (fully stripped) tokens
+            if not token:
+                continue
+            if token in no_split_token:
+                tokenized_text.append(token)
+            else:
+                tokenized_text.extend(self._tokenize(token))
+        # ["This", " is", " something", "<special_token_1>", "else"]
+        return tokenized_text
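
The new `tokenize()` override brings the special-token-aware splitting used by upstream tokenizers into `InternLM2Tokenizer`: special tokens are split out via the token trie and never passed to the SentencePiece model, and an `AddedToken` with `rstrip`/`lstrip` swallows the whitespace next to it. A simplified, self-contained sketch of that behavior (not the real tokenizer, which needs a SentencePiece vocab file; `ToyAddedToken` stands in for `AddedToken`, `str.split` stands in for the subword step, and only `rstrip` is shown, with `lstrip` mirroring it on the left):

```python
# Simplified illustration of the special-token handling added in tokenize() above.
import re
from dataclasses import dataclass


@dataclass
class ToyAddedToken:
    content: str
    rstrip: bool = False  # if True, the token "eats" the whitespace to its right


def toy_tokenize(text, special_tokens):
    # Split the text around special tokens while keeping each special token whole.
    pattern = "(" + "|".join(re.escape(t.content) for t in special_tokens) + ")"
    pieces = re.split(pattern, text)
    by_content = {t.content: t for t in special_tokens}

    out = []
    for i, piece in enumerate(pieces):
        tok = by_content.get(piece)
        if tok is not None:
            if tok.rstrip and i + 1 < len(pieces):
                pieces[i + 1] = pieces[i + 1].lstrip()  # strip the left of the next piece
            out.append(piece)  # special tokens bypass the subword model entirely
        elif piece:
            out.extend(piece.split())  # stand-in for self._tokenize (SentencePiece)
    return out


print(toy_tokenize("Describe <image> please", [ToyAddedToken("<image>", rstrip=True)]))
# ['Describe', '<image>', 'please']
```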

paddlemix/models/internvl2/internvl_chat/modeling_internvl_chat.py

Lines changed: 5 additions & 3 deletions
@@ -307,7 +307,7 @@ def chat(
 
         template = get_conv_template(self.template)
         template.system_message = self.system_message
-        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
+        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
 
         history = [] if history is None else history
         for (old_question, old_answer) in history:
@@ -324,16 +324,18 @@
         for num_patches in num_patches_list:
             image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
             query = query.replace("<image>", image_tokens, 1)
+        model_inputs = tokenizer(query, add_special_tokens=True, return_tensors="pd")
 
-        model_inputs = tokenizer(query, return_tensors="pd")
         input_ids = model_inputs["input_ids"]
         attention_mask = model_inputs["attention_mask"]
         generation_config["eos_token_id"] = eos_token_id
+        generation_config = GenerationConfig(**generation_config)
+
         generation_output = self.generate(
             pixel_values=pixel_values,  # [7, 3, 448, 448]
+            generation_config=generation_config,
             input_ids=input_ids,  # [1, 1847]
             attention_mask=attention_mask,  # [1, 1847]
-            **generation_config,  # {'max_new_tokens': 1024, 'do_sample': False, 'eos_token_id': 92542}
         )
         response = tokenizer.batch_decode(generation_output[0], skip_special_tokens=True)[0]
         response = response.split(template.sep)[0].strip()
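
With this change, `chat()` no longer splats a raw dict into `self.generate()`: the caller's options are wrapped in a `GenerationConfig` once and passed as an explicit `generation_config` argument, and the query is tokenized with `add_special_tokens=True`. A minimal sketch of the packaging step (assuming the `GenerationConfig` referenced above is `paddlenlp.generation.GenerationConfig`; the eos id 92542 is the example value from the removed inline comment):

```python
# Hedged sketch of how chat() now packages generation options before calling self.generate().
from paddlenlp.generation import GenerationConfig  # assumed source of the GenerationConfig used above

options = dict(max_new_tokens=1024, do_sample=False)  # what chat_demo.py now passes in
options["eos_token_id"] = 92542  # chat() derives this from tokenizer.convert_tokens_to_ids(template.sep.strip())
generation_config = GenerationConfig(**options)  # wrapped once, then passed as an object

print(generation_config.max_new_tokens, generation_config.eos_token_id)
# In chat(): self.generate(pixel_values=..., generation_config=generation_config,
#                          input_ids=..., attention_mask=...)
```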
