 
 from LongNet.utils import XPOS, RelativePositionBias
 
-from LongNet.attend import FlashMHA
+from LongNet.attend import FlashAttention
 
 device = "cuda:0"
 dtype = torch.float16
@@ -34,7 +34,6 @@ def __init__(self, d_model, num_heads, dilation_rate, segment_size, dropout=0.0,
         self.num_heads = num_heads  # number of attention heads
         self.dilation_rate = dilation_rate  # dilation rate
         self.segment_size = segment_size  # segment size
-
         self.dropout = nn.Dropout(dropout)
         # If casual attention is used
         self.casual = casual
@@ -44,13 +43,12 @@ def __init__(self, d_model, num_heads, dilation_rate, segment_size, dropout=0.0,
         self.use_rel_pos_bias = use_rel_pos_bias
         self.distributed = Distributed
 
+
         # Initialize attention for each head with dilation
-        # Initialize the attention heads with or without DataParallel based on the value of 'distributed'
         if self.distributed:
-            self.attentions = nn.ModuleList([DataParallel(FlashMHA(embed_dim=d_model, num_heads=num_heads, device=device, dtype=dtype)) for _ in range(self.dilation_rate)])
+            self.attentions = nn.ModuleList([DataParallel(FlashAttention(causal=self.casual, dropout=dropout)) for _ in range(self.dilation_rate)])
         else:
-            self.attentions = nn.ModuleList([FlashMHA(embed_dim=d_model, num_heads=num_heads, device=device, dtype=dtype) for _ in range(self.dilation_rate)])
-
+            self.attentions = nn.ModuleList([FlashAttention(causal=self.casual, dropout=dropout) for _ in range(self.dilation_rate)])
 
         # If using positional encoding, initialize it
         if use_xpos:
@@ -104,8 +102,6 @@ def forward(self, x):
         #option2
         # elements_attns = [attention(element.to(dtype), element.to(dtype), element.to(dtype)) for element in x_]
         # attn_output = torch.cat(elements_attns, dim=1)
-
-
 
         # If using relative positional bias, add it
         if self.use_rel_pos_bias:
@@ -137,6 +133,11 @@ def forward(self, x):
 
 
 
+class MultiHeadDilatedAttention:
+    def __init__(self):
+        pass
+
+
 
 
 
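
A note on the substitution above: FlashMHA was a full multi-head attention module that owned its QKV projections (hence the embed_dim, num_heads, device, and dtype arguments), whereas FlashAttention as constructed here is configured only with causality and dropout, so the surrounding module becomes responsible for projecting the input and splitting heads before calling it. Below is a minimal sketch of driving one of the new attention modules; it assumes FlashAttention's forward accepts (q, k, v) tensors of shape (batch, heads, seq_len, head_dim), and the to_qkv projection and all shapes are illustrative stand-ins rather than the repository's actual forward pass.

import torch
import torch.nn as nn
from LongNet.attend import FlashAttention

device = "cuda:0"
dtype = torch.float16

batch, seq_len, d_model, num_heads = 2, 128, 512, 8
head_dim = d_model // num_heads

x = torch.randn(batch, seq_len, d_model, device=device, dtype=dtype)

# Hypothetical projection: unlike FlashMHA, FlashAttention does not own one,
# so q, k, v must be produced by the caller.
to_qkv = nn.Linear(d_model, 3 * d_model, bias=False).to(device=device, dtype=dtype)
q, k, v = to_qkv(x).chunk(3, dim=-1)

def split_heads(t):
    # (batch, seq_len, d_model) -> (batch, heads, seq_len, head_dim)
    return t.reshape(batch, seq_len, num_heads, head_dim).transpose(1, 2)

attn = FlashAttention(causal=True, dropout=0.0)
out = attn(split_heads(q), split_heads(k), split_heads(v))  # assumed (q, k, v) signature
out = out.transpose(1, 2).reshape(batch, seq_len, d_model)  # merge heads back to (batch, seq_len, d_model)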