@@ -507,6 +507,10 @@ class PretrainedConfig:
            If an encoder-decoder model starts decoding with a different token than _bos_, the id of that token.
        sep_token_id (`int`, *optional*): The id of the _separation_ token.

+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether the model's input and output word embeddings should be tied. Note that this is only relevant
+            if the model has an output word embedding layer.
+
        dtype (`str`, *optional*):
            The `dtype` of the weights. This attribute can be used to initialize the model to a non-default `dtype`
            (which is normally `float32`) and thus allow for optimal storage allocation. For example, if the saved
@@ -569,8 +573,10 @@ def __init__(self, **kwargs):
        self.output_hidden_states = kwargs.pop("output_hidden_states", False)
        self.output_attentions = kwargs.pop("output_attentions", False)
        self.use_cache = kwargs.pop("use_cache", False)
+        self.tie_word_embeddings = kwargs.pop("tie_word_embeddings", True)

        # for transformers fuse
+        self.fuse_linear = kwargs.pop("fuse_linear", False)
        self.fuse_attention_qkv = kwargs.pop("fuse_attention_qkv", False)
        self.fuse_attention_ffn = kwargs.pop("fuse_attention_ffn", False)

@@ -623,6 +629,9 @@ def __init__(self, **kwargs):

        self.classifier_dropout = kwargs.pop("classifier_dropout", None)

+        self.dpo_config = kwargs.pop("dpo_config", None)
+        self.kto_config = kwargs.pop("kto_config", None)
+
        # Tokenizer arguments TODO: eventually tokenizer and models should share the same config
        self.tokenizer_class = kwargs.pop("tokenizer_class", None)
        self.prefix = kwargs.pop("prefix", None)
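
For reference, a minimal sketch of how the newly added options are consumed: they are plain constructor keyword arguments popped in `__init__` and stored as attributes on the config. This assumes a PaddleNLP-style `PretrainedConfig`; the import path and the example values below are illustrative assumptions, not part of this PR.

    # Usage sketch (assumed import path; kwargs and values are illustrative only).
    from paddlenlp.transformers.configuration_utils import PretrainedConfig

    config = PretrainedConfig(
        tie_word_embeddings=False,  # untie input and output word embeddings (default is True)
        fuse_linear=True,           # opt in to the new fused-linear path (default is False)
    )
    print(config.tie_word_embeddings, config.fuse_linear)
    # dpo_config / kto_config default to None when not passed
    print(config.dpo_config, config.kto_config)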