 }
-# from transformers
-class Conv1D(nn.Module):
-    """
-    1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2).
-
-    Basically works like a linear layer but the weights are transposed.
-
-    Args:
-        nf (`int`): The number of output features.
-        nx (`int`): The number of input features.
-    """
-
-    def __init__(self, nf, nx):
-        super().__init__()
-        self.nf = nf
-        self.weight = nn.Parameter(torch.empty(nx, nf))
-        self.bias = nn.Parameter(torch.zeros(nf))
-        nn.init.normal_(self.weight, std=0.02)
-
-    def forward(self, x):
-        size_out = x.size()[:-1] + (self.nf,)
-        x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
-        x = x.view(size_out)
-        return x
-
-
 class HeadFFN(nn.Module):  # todo rename
     def __init__(self, dim):
         super().__init__()
-        self.c_fc = Conv1D(dim, config['n_embd'])
-        self.c_proj = Conv1D(config['n_embd'], dim)
+        self.c_fc = nn.Linear(config['n_embd'], dim)
+        self.c_proj = nn.Linear(dim, config['n_embd'])
         self.act = nn.functional.gelu

     def forward(self, hidden_states):
@@ -62,8 +36,8 @@ def __init__(self):
         self.head_dim = self.embed_dim // self.num_heads
         self.split_size = self.embed_dim

-        self.c_att = Conv1D(config['n_embd'] * 3, config['n_embd'])
-        self.c_proj = Conv1D(config['n_embd'], config['n_embd'])
+        self.c_att = nn.Linear(config['n_embd'], config['n_embd'] * 3)
+        self.c_proj = nn.Linear(config['n_embd'], config['n_embd'])

     def _split_heads(self, tensor, num_heads, attn_head_size):
         """
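
The second hunk also flips the argument order: `Conv1D` takes `(nf, nx)` while `nn.Linear` takes `(in_features, out_features)`, so both `c_att` versions project `n_embd` up to a fused width of `3 * n_embd`. A sketch of how such a fused output is typically split into query/key/value, consistent with the `split_size = embed_dim` attribute set in the hunk (the concrete `n_embd` value here is a hypothetical stand-in for `config['n_embd']`):

```python
import torch
import torch.nn as nn

n_embd = 64                            # stand-in for config['n_embd']
c_att = nn.Linear(n_embd, n_embd * 3)  # fused QKV projection, as in the diff

hidden_states = torch.randn(2, 10, n_embd)    # (batch, seq, n_embd)
qkv = c_att(hidden_states)                    # (batch, seq, 3 * n_embd)
query, key, value = qkv.split(n_embd, dim=2)  # each (batch, seq, n_embd)
```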