 class QWenConfig(PretrainedConfig):
     model_type = "qwen"
     keys_to_ignore_at_inference = ["past_key_values"]
-    attribute_map = {
-        "hidden_size": "n_embd",
-        "num_attention_heads": "n_head",
-        "max_position_embeddings": "n_positions",
-        "num_hidden_layers": "n_layer",
-    }

     def __init__(
         self,
-        vocab_size=151851,
-        n_embd=4096,
-        n_layer=32,
-        n_head=32,
-        n_inner=None,
-        embd_pdrop=0.0,
-        attn_pdrop=0.0,
-        layer_norm_epsilon=1e-5,
+        vocab_size=151936,
+        hidden_size=4096,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        emb_dropout_prob=0.0,
+        attn_dropout_prob=0.0,
+        layer_norm_epsilon=1e-6,
         initializer_range=0.02,
+        max_position_embeddings=8192,
         scale_attn_weights=True,
         use_cache=True,
-        eos_token_id=151643,
-        apply_residual_connection_post_layernorm=False,
-        bf16=True,
+        bf16=False,
+        fp16=False,
+        fp32=False,
         kv_channels=128,
         rotary_pct=1.0,
         rotary_emb_base=10000,
-        use_dynamic_ntk=False,
-        use_logn_attn=False,
-        use_flash_attn=True,
-        ffn_hidden_size=22016,
+        use_dynamic_ntk=True,
+        use_logn_attn=True,
+        use_flash_attn="auto",
+        intermediate_size=22016,
         no_bias=True,
         tie_word_embeddings=False,
         **kwargs,
     ):
-        self.eos_token_id = eos_token_id
-        super().__init__(eos_token_id=eos_token_id,
-                         tie_word_embeddings=tie_word_embeddings,
-                         **kwargs)
-
         self.vocab_size = vocab_size
-        self.n_embd = n_embd
-        self.n_layer = n_layer
-        self.n_head = n_head
-        self.n_inner = n_inner
-        self.embd_pdrop = embd_pdrop
-        self.attn_pdrop = attn_pdrop
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.emb_dropout_prob = emb_dropout_prob
+        self.attn_dropout_prob = attn_dropout_prob
         self.layer_norm_epsilon = layer_norm_epsilon
         self.initializer_range = initializer_range
         self.scale_attn_weights = scale_attn_weights
         self.use_cache = use_cache
-        self.apply_residual_connection_post_layernorm = (
-            apply_residual_connection_post_layernorm)
+        self.max_position_embeddings = max_position_embeddings
         self.bf16 = bf16
+        self.fp16 = fp16
+        self.fp32 = fp32
         self.kv_channels = kv_channels
         self.rotary_pct = rotary_pct
         self.rotary_emb_base = rotary_emb_base
         self.use_dynamic_ntk = use_dynamic_ntk
         self.use_logn_attn = use_logn_attn
         self.use_flash_attn = use_flash_attn
-        self.ffn_hidden_size = ffn_hidden_size
         self.no_bias = no_bias
-        self.tie_word_embeddings = tie_word_embeddings
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
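Below is a minimal usage sketch of the configuration after this change, showing the renamed constructor arguments and the new defaults taken from the diff. The configuration_qwen import path is an assumption (the class could live in a differently named module), and nothing here is meant as the official API beyond what the diff itself shows.

# Minimal usage sketch; assumes the updated QWenConfig above is importable
# from a local configuration_qwen.py (the module name is an assumption).
from configuration_qwen import QWenConfig

# Instantiate with the renamed fields; values mirror the new defaults in the diff.
config = QWenConfig(
    vocab_size=151936,
    hidden_size=4096,
    num_hidden_layers=32,
    num_attention_heads=32,
    intermediate_size=22016,
    max_position_embeddings=8192,
    use_dynamic_ntk=True,
    use_logn_attn=True,
    use_flash_attn="auto",   # new "auto" default introduced by this change
)

# With attribute_map removed, these are stored directly rather than aliased
# to the old n_embd/n_layer/n_positions names.
print(config.hidden_size)              # 4096
print(config.max_position_embeddings)  # 8192

Note that super().__init__ is now called last with only tie_word_embeddings and **kwargs, so eos_token_id is no longer a dedicated constructor argument and would be passed through kwargs if needed.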