from megatron.training.initialize import initialize_megatron
from megatron.training.utils import get_ltor_masks_and_position_ids

-from swift.llm import ExportArguments, HfConfigFactory, get_model_tokenizer, get_template, save_checkpoint
+from swift.llm import ExportArguments, HfConfigFactory, get_model_tokenizer, get_template, save_checkpoint, to_device
from swift.utils import get_logger, get_n_params_grads
from ..argument import MegatronArguments
from ..model import get_megatron_model_meta
@@ -87,21 +87,37 @@ def test_convert_precision(hf_model, mg_model, processor, torch_dtype=torch.floa
    _test_params_sum(mg_model)

    template = get_template(hf_model.model_meta.template, processor)
-    input_ids = template.encode({'messages': [{'role': 'user', 'content': 'who are you?'}]})['input_ids']
-    input_ids = torch.tensor(input_ids)[None].to('cuda')
+    template.set_mode('train')
+    inputs = template.encode({
+        'messages': [
+            {
+                'role': 'user',
+                'content': 'Introduction to ms-swift.'
+            },
+            {
+                'role':
+                'assistant',
+                'content':
+                'ms-swift is an official framework provided by the ModelScope community for fine-tuning '
+                'and deploying large language models and multi-modal large models.'
+            },
+        ]
+    })
+    inputs = to_device(template.data_collator([inputs]), 'cuda')

    HfConfigFactory.set_model_config_attr(hf_model, 'use_cache', False)
    share_embedding = mg_model.share_embeddings_and_output_weights
    hf_modules = _find_modules(hf_model)
    with torch.inference_mode(), _model_cpu_forward_context(hf_modules, torch_dtype, share_embedding=share_embedding):
-        hf_logits = hf_model(input_ids).logits
+        hf_logits = hf_model(**inputs).logits
    hf_model = hf_model.to('cpu')

+    input_ids = inputs['input_ids']
    attention_mask, _, position_ids = get_ltor_masks_and_position_ids(input_ids, -100, True, True, True)
    packed_seq_params = None
    mg_torch_dtype = torch_dtype
    # thd
-    # from ..train.utils import get_packed_seq_params
+    # from ..trainers.utils import get_packed_seq_params
    # mg_torch_dtype = None
    # packed_seq_params = get_packed_seq_params(position_ids)
    # attention_mask = None
@@ -115,8 +131,10 @@ def test_convert_precision(hf_model, mg_model, processor, torch_dtype=torch.floa
        position_ids=position_ids,
        packed_seq_params=packed_seq_params)

-    mean_diff = (mg_logits - hf_logits).abs().mean().item()
+    token_mean_diff = (mg_logits - hf_logits).abs().mean(dim=-1)
+    mean_diff = token_mean_diff.mean().item()
    max_diff = (mg_logits - hf_logits).abs().max().item()
+    print(f'token_mean_diff: {token_mean_diff}')
    print(f'mean_diff: {mean_diff}, max_diff: {max_diff} (Please check that mean_diff is less than 0.1).')
    hf_tokens = hf_logits.argmax(-1)
    mg_tokens = mg_logits.argmax(-1)
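For context, the new comparison reduces over the vocabulary dimension first, so each token position gets its own mean absolute difference before the global mean is taken; this makes it easy to see which positions (for example the first assistant tokens) diverge even when the overall mean stays under the 0.1 threshold. A minimal, self-contained sketch of that metric with dummy tensors (shapes and values are illustrative only, not taken from the test):

import torch

torch.manual_seed(0)
# stand-ins for the HF and Megatron forward outputs, shape [batch, seq_len, vocab_size]
hf_logits = torch.randn(1, 8, 32)
mg_logits = hf_logits + 1e-3 * torch.randn_like(hf_logits)

# per-token mean absolute difference, shape [batch, seq_len]
token_mean_diff = (mg_logits - hf_logits).abs().mean(dim=-1)
# scalar summaries used for the "< 0.1" sanity check
mean_diff = token_mean_diff.mean().item()
max_diff = (mg_logits - hf_logits).abs().max().item()
print(f'token_mean_diff: {token_mean_diff}')
print(f'mean_diff: {mean_diff}, max_diff: {max_diff}')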