"""
/* Copyright (c) 2023 Amazon
   Written by Jan Buethe */
/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
"""

import torch

from .common import sparsify_matrix


class GRUSparsifier:
    def __init__(self, task_list, start, stop, interval, exponent=3):
        """ Sparsifier for torch.nn.GRUs

        Parameters:
        -----------
        task_list : list
            task_list contains a list of tuples (gru, sparsify_dict), where gru is an instance
            of torch.nn.GRU and sparsify_dict is a dictionary with keys in {'W_ir', 'W_iz', 'W_in',
            'W_hr', 'W_hz', 'W_hn'} corresponding to the input and recurrent weights for the reset,
            update, and new gates. The values of sparsify_dict are tuples (density, [m, n], keep_diagonal),
            where density is the target density in [0, 1], [m, n] is the shape of the sub-blocks to
            which sparsification is applied, and keep_diagonal is a bool indicating whether the
            diagonal should be kept.

        start : int
            training step after which sparsification will be started.

        stop : int
            training step after which sparsification will be completed.

        interval : int
            sparsification interval for steps between start and stop. After stop, sparsification
            will be carried out after every call to GRUSparsifier.step()

        exponent : float
            interpolation exponent for the sparsification schedule. In step i, sparsification is
            carried out with density (alpha + target_density * (1 - alpha)), where
            alpha = ((stop - i) / (stop - start)) ** exponent
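
            For example, with start=0, stop=100, and exponent=3, step i=50 gives
            alpha = ((100 - 50) / (100 - 0)) ** 3 = 0.125, so a target density of 0.5 is
            relaxed to 0.125 + 0.5 * (1 - 0.125) = 0.5625 at that step.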
|
        Example:
        --------
        >>> import torch
        >>> gru = torch.nn.GRU(10, 20)
        >>> sparsify_dict = {
        ...     'W_ir' : (0.5, [2, 2], False),
        ...     'W_iz' : (0.6, [2, 2], False),
        ...     'W_in' : (0.7, [2, 2], False),
        ...     'W_hr' : (0.1, [4, 4], True),
        ...     'W_hz' : (0.2, [4, 4], True),
        ...     'W_hn' : (0.3, [4, 4], True),
        ... }
        >>> sparsifier = GRUSparsifier([(gru, sparsify_dict)], 0, 100, 50)
        >>> for i in range(100):
        ...     sparsifier.step()
        """
        # just copying parameters...
        self.start = start
        self.stop = stop
        self.interval = interval
        self.exponent = exponent
        self.task_list = task_list

        # ... and setting counter to 0
        self.step_counter = 0

        self.last_masks = {key: None for key in ['W_ir', 'W_iz', 'W_in', 'W_hr', 'W_hz', 'W_hn']}

    def step(self, verbose=False):
        """ Carries out one sparsification step

        Call this function after optimizer.step in your
        training loop.

        Parameters:
        -----------
        verbose : bool
            if true, densities are printed out

        Returns:
        --------
        None

        """
        # compute current interpolation factor
        self.step_counter += 1

        if self.step_counter < self.start:
            return
        elif self.step_counter < self.stop:
            # update only every self.interval-th step
            if self.step_counter % self.interval:
                return

            alpha = ((self.stop - self.step_counter) / (self.stop - self.start)) ** self.exponent
        else:
            alpha = 0

        with torch.no_grad():
            for gru, params in self.task_list:
                hidden_size = gru.hidden_size

                # input weights: weight_ih_l0 stacks the reset, update, and new
                # gate weights along dim 0, in this order
                for i, key in enumerate(['W_ir', 'W_iz', 'W_in']):
                    if key in params:
                        density = alpha + (1 - alpha) * params[key][0]
                        if verbose:
                            print(f"[{self.step_counter}]: {key} density: {density}")

                        gru.weight_ih_l0[i * hidden_size : (i + 1) * hidden_size, :], new_mask = sparsify_matrix(
                            gru.weight_ih_l0[i * hidden_size : (i + 1) * hidden_size, :],
                            density,         # density
                            params[key][1],  # block_size
                            params[key][2],  # keep_diagonal (might want to set this to False)
                            return_mask=True
                        )

                        if self.last_masks[key] is not None:
                            if not torch.all(self.last_masks[key] == new_mask) and self.step_counter > self.stop:
                                print(f"sparsification mask {key} changed for gru {gru}")

                        self.last_masks[key] = new_mask

                # recurrent weights: weight_hh_l0 uses the same gate ordering
                for i, key in enumerate(['W_hr', 'W_hz', 'W_hn']):
                    if key in params:
                        density = alpha + (1 - alpha) * params[key][0]
                        if verbose:
                            print(f"[{self.step_counter}]: {key} density: {density}")

                        gru.weight_hh_l0[i * hidden_size : (i + 1) * hidden_size, :], new_mask = sparsify_matrix(
                            gru.weight_hh_l0[i * hidden_size : (i + 1) * hidden_size, :],
                            density,         # density
                            params[key][1],  # block_size
                            params[key][2],  # keep_diagonal (might want to set this to False)
                            return_mask=True
                        )

                        if self.last_masks[key] is not None:
                            if not torch.all(self.last_masks[key] == new_mask) and self.step_counter > self.stop:
                                print(f"sparsification mask {key} changed for gru {gru}")

                        self.last_masks[key] = new_mask

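
# A minimal sketch of how the sparsifier is meant to be used in training
# (see the step() docstring); `model` and `batches` are hypothetical
# placeholders, not part of this module:
#
#   sparsifier = GRUSparsifier([(model.gru, sparsify_dict)], start=1000, stop=20000, interval=100)
#   optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
#   for features, targets in batches:
#       optimizer.zero_grad()
#       loss = torch.nn.functional.mse_loss(model(features), targets)
#       loss.backward()
#       optimizer.step()
#       sparsifier.step()  # sparsify after the weight update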
if __name__ == "__main__":
    print("Testing sparsifier")

    gru = torch.nn.GRU(10, 20)
    sparsify_dict = {
        'W_ir' : (0.5, [2, 2], False),
        'W_iz' : (0.6, [2, 2], False),
        'W_in' : (0.7, [2, 2], False),
        'W_hr' : (0.1, [4, 4], True),
        'W_hz' : (0.2, [4, 4], True),
        'W_hn' : (0.3, [4, 4], True),
    }

    sparsifier = GRUSparsifier([(gru, sparsify_dict)], 0, 100, 10)

    for i in range(100):
        sparsifier.step(verbose=True)
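
    # Sanity check (not part of the original test): measure the realized density
    # of each recurrent sub-matrix after the schedule has run. Values may sit
    # slightly above the target where keep_diagonal=True.
    with torch.no_grad():
        h = gru.hidden_size
        for i, key in enumerate(['W_hr', 'W_hz', 'W_hn']):
            block = gru.weight_hh_l0[i * h : (i + 1) * h, :]
            density = float((block != 0).sum()) / block.numel()
            print(f"final {key} density: {density:.3f}")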