
Commit 0941bb8

Author: Kye (committed)
parallel wrapper
1 parent b6bcb8b commit 0941bb8

File tree

5 files changed: +109 −2 lines changed


LongNet/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 
-from LongNet.attention import DilatedAttention
+from LongNet.attention import ParallelWrapper, DilatedAttention
 # from LongNet.model import LongNetTokenizer, LongNet, DecoderConfig, Decoder, DilatedLongNet
 
 # from LongNet.iterations import DynamicDilatedAttention, DilatedAttentionOld, DilatedAttentionOP
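
With this re-export in place, both symbols also resolve at the package root rather than only from the submodule; a minimal sketch (not part of the commit):

# both names are re-exported by LongNet/__init__.py
from LongNet import ParallelWrapper, DilatedAttention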

LongNet/attention.py

Lines changed: 61 additions & 0 deletions
@@ -14,6 +14,12 @@
 dtype=torch.float16
 
 
+
+
+
+
+
+
 def SparsifyIndices(
     x: torch.Tensor, ws: List[int], rs: List[int], head_idx: int
 ) -> Tuple[int, torch.Tensor, Optional[torch.Tensor]]:
@@ -104,6 +110,45 @@ def MixOutputs(
 
 
 
+
+class ParallelWrapper:
+    """
+    A simple wrapper to enable easy usage of data parallelism.
+
+    Arguments:
+        model: The neural network model to be parallelized.
+        device (optional): The device to which the model should be moved. Default: "cuda".
+        use_data_parallel (optional): A boolean flag indicating whether to use data parallelism. Default: True.
+    """
+    def __init__(
+        self,
+        model,
+        device="cuda",
+        use_data_parallel=True
+    ):
+        self.model = model.to(device)
+        self.use_data_parallel = use_data_parallel
+        self.device = device
+
+        if self.use_data_parallel and torch.cuda.device_count() > 1:
+            print(f"Using {torch.cuda.device_count()} GPUs")
+            self.model = nn.DataParallel(self.model)
+
+    def forward(self, *args, **kwargs):
+        return self.model(*args, **kwargs)
+
+    def to(self, device):
+        self.device = device
+        self.model = self.model.to(device)
+        return self
+
+    def __getattr__(self, name):
+        # redirect attribute access to the internal model to allow direct access to its methods and props
+        return getattr(self.model, name)
+
+
+
+
 #add alibi, qk layer norm, one write head, multiway,
 class DilatedAttentionNew(nn.Module):
     """
@@ -319,6 +364,22 @@ def forward(self, x):
 
 
 
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 
 
 class MultiHeadDilatedAttention:
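
For reference, a minimal usage sketch of the new ParallelWrapper; the nn.Linear placeholder module and the tensor sizes are illustrative only, not part of this commit:

import torch
import torch.nn as nn
from LongNet.attention import ParallelWrapper

# any nn.Module can be wrapped; nn.Linear stands in for a real LongNet module here
net = nn.Linear(512, 512)

# moves the module to the chosen device and, when more than one GPU is visible,
# wraps it in nn.DataParallel
device = "cuda" if torch.cuda.is_available() else "cpu"
wrapped = ParallelWrapper(net, device=device)

x = torch.randn(8, 512, device=device)
out = wrapped.forward(x)  # the wrapper is a plain class, not callable, so forward() is invoked explicitly
print(out.shape)          # torch.Size([8, 512])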

mh_example.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+#

parallel_example.py

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+import timeit
+import torch
+from LongNet.attention import ParallelWrapper, DilatedAttention
+
+# model config
+d_model = 512
+num_heads = 8
+dilation_rate = 2
+segment_size = 64
+
+
+device = "cuda:0"
+dtype = torch.float16
+
+# inputs
+batch_size = 32
+seq_len = 8192
+
+
+# create model
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+model = DilatedAttention(
+    d_model,
+    num_heads,
+    dilation_rate,
+    segment_size
+)
+parallel_model = ParallelWrapper(model, device=device)
+
+x = torch.randn((batch_size, seq_len, d_model), device=device, dtype=dtype)
+
+# test forward pass through the wrapper (ParallelWrapper is not callable, so use .forward)
+with torch.no_grad():
+    output = parallel_model.forward(x)
+    print(f"Output shape: {output.shape}")  # expected: (batch_size, seq_len, d_model)
+
+# benchmark model
+num_runs = 1000
+start_time = timeit.default_timer()
+for _ in range(num_runs):
+    parallel_model.forward(x)
+
+
+elapsed_time = timeit.default_timer() - start_time
+print(f"Average forward pass time: {elapsed_time / num_runs:.6f} seconds")

setup.py

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 setup(
   name = 'LongNet',
   packages = find_packages(exclude=[]),
-  version = '0.4.3',
+  version = '0.4.8',
   license='MIT',
   description = 'LongNet - Pytorch',
   author = 'Kye Gomez',
