fastgrnncuda: added low rank support to unrolled version

MJ10 · MJ10 · commit 4bcdf1796dcb · 2019-09-27T12:33:23.000+05:30
diff --git a/pytorch/edgeml_pytorch/cuda/fastgrnn_cuda.cpp b/pytorch/edgeml_pytorch/cuda/fastgrnn_cuda.cpp
@@ -34,7 +34,11 @@ std::vector<torch::Tensor> fastgrnn_unroll_cuda_forward(
   torch::Tensor zeta,
   torch::Tensor nu,
   torch::Tensor initial_h,
-  int z_non_linearity);
+  int z_non_linearity,
+  torch::Tensor w1,
+  torch::Tensor w2,
+  torch::Tensor u1,
+  torch::Tensor u2);
 
 std::vector<torch::Tensor> fastgrnn_unroll_cuda_backward(
   torch::Tensor grad_h,
@@ -47,7 +51,11 @@ std::vector<torch::Tensor> fastgrnn_unroll_cuda_backward(
   torch::Tensor z,
   torch::Tensor h_prime,
   torch::Tensor initial_h,
-  int z_non_linearity);
+  int z_non_linearity,
+  torch::Tensor w1,
+  torch::Tensor w2,
+  torch::Tensor u1,
+  torch::Tensor u2);
 
 #define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
 #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
@@ -108,16 +116,30 @@ std::vector<torch::Tensor> fastgrnn_unroll_forward(
   torch::Tensor zeta,
   torch::Tensor nu,
   torch::Tensor initial_h,
-  int z_non_linearity) {
+  int z_non_linearity,
+  torch::Tensor w1,
+  torch::Tensor w2,
+  torch::Tensor u1,
+  torch::Tensor u2) {
   CHECK_INPUT(input);
-  CHECK_INPUT(w);
-  CHECK_INPUT(u);
+  if(w1.size(0) == 0) {
+    CHECK_INPUT(w);
+  } else {
+    CHECK_INPUT(w1);
+    CHECK_INPUT(w2);
+  }
+  if (u1.size(0) == 0) {
+    CHECK_INPUT(u);
+  } else {
+    CHECK_INPUT(u1);
+    CHECK_INPUT(u2);
+  }
   CHECK_INPUT(bias_z);
   CHECK_INPUT(bias_h_prime);
   CHECK_INPUT(initial_h);
   CHECK_INPUT(zeta);
   CHECK_INPUT(nu);
-  return fastgrnn_unroll_cuda_forward(input, w, u, bias_z, bias_h_prime, zeta, nu, initial_h, z_non_linearity);
+  return fastgrnn_unroll_cuda_forward(input, w, u, bias_z, bias_h_prime, zeta, nu, initial_h, z_non_linearity, w1, w2, u1, u2);
 }
 
 std::vector<torch::Tensor> fastgrnn_unroll_backward(
@@ -131,14 +153,28 @@ std::vector<torch::Tensor> fastgrnn_unroll_backward(
   torch::Tensor z,
   torch::Tensor h_prime,
   torch::Tensor initial_h,
+  torch::Tensor w1,
+  torch::Tensor w2,
+  torch::Tensor u1,
+  torch::Tensor u2,
   int z_non_linearity) {
   CHECK_INPUT(grad_h);
   CHECK_INPUT(input);
   CHECK_INPUT(hidden_states);
   CHECK_INPUT(z);
   CHECK_INPUT(h_prime);
-  CHECK_INPUT(w);
-  CHECK_INPUT(u);
+  if(w1.size(0) == 0) {
+    CHECK_INPUT(w);
+  } else {
+    CHECK_INPUT(w1);
+    CHECK_INPUT(w2);
+  }
+  if (u1.size(0) == 0) {
+    CHECK_INPUT(u);
+  } else {
+    CHECK_INPUT(u1);
+    CHECK_INPUT(u2);
+  }
   CHECK_INPUT(zeta);
   CHECK_INPUT(nu);
   CHECK_INPUT(initial_h);
@@ -154,7 +190,8 @@ std::vector<torch::Tensor> fastgrnn_unroll_backward(
     z,
     h_prime,
     initial_h,
-    z_non_linearity);
+    z_non_linearity,
+    w1, w2, u1, u2);
 }
 
 
diff --git a/pytorch/edgeml_pytorch/cuda/fastgrnn_cuda_kernel.cu b/pytorch/edgeml_pytorch/cuda/fastgrnn_cuda_kernel.cu
@@ -287,7 +287,11 @@ std::vector<torch::Tensor> fastgrnn_unroll_cuda_forward(
   torch::Tensor zeta,
   torch::Tensor nu,
   torch::Tensor initial_h,
-  int z_non_linearity) {
+  int z_non_linearity,
+  torch::Tensor w1,
+  torch::Tensor w2,
+  torch::Tensor u1,
+  torch::Tensor u2) {
     auto options = torch::TensorOptions().dtype(input.dtype()).device(input.device().type());
     const auto timesteps = input.size(0);
     const auto batch_size = initial_h.size(0);
@@ -305,9 +309,19 @@ std::vector<torch::Tensor> fastgrnn_unroll_cuda_forward(
 
     const int threads = 1024;
     const dim3 blocks((state_size + threads - 1) / threads, batch_size);
+    bool w_low_rank = w1.size(0) != 0;
+    bool u_low_rank = u1.size(0) != 0;
+    if (w_low_rank){
+      w = torch::mm(w1.transpose(0, 1), w2.transpose(0, 1));
+    }  else {
+      w = w.transpose(0, 1);
+    }
+    if (u_low_rank){
+      u = torch::mm(u1.transpose(0, 1), u2.transpose(0, 1));
+    } else {
+      u = u.transpose(0, 1);
+    }
 
-    w = w.transpose(0, 1);
-    u = u.transpose(0, 1);
     zeta = torch::sigmoid(zeta);
     nu = torch::sigmoid(nu);
 
@@ -372,16 +386,29 @@ std::vector<torch::Tensor> fastgrnn_unroll_cuda_backward(
   torch::Tensor z,
   torch::Tensor h_prime,
   torch::Tensor initial_h,
-  int z_non_linearity) {
+  int z_non_linearity,
+  torch::Tensor w1,
+  torch::Tensor w2,
+  torch::Tensor u1,
+  torch::Tensor u2) {
   
   auto d_input = torch::zeros_like(input);
-  auto d_w = torch::zeros_like(w);
-  auto d_u = torch::zeros_like(u);
   auto d_zeta = torch::zeros_like(initial_h);
   auto d_nu = torch::zeros_like(initial_h);
   auto d_bias_z = torch::zeros_like(initial_h);
   auto d_bias_h_prime = torch::zeros_like(initial_h);
 
+  bool w_low_rank = w1.size(0) != 0;
+  bool u_low_rank = u1.size(0) != 0;
+  if(w_low_rank) {
+    w = torch::mm(w2, w1);
+  }
+  if (u_low_rank) {
+    u = torch::mm(u2, u1);
+  }
+  auto d_w = torch::zeros_like(w);
+  auto d_u = torch::zeros_like(u);
+  
   zeta = torch::sigmoid(zeta);
   nu = torch::sigmoid(nu);
   auto d_nu_sigmoid = d_sigmoid(nu);
@@ -468,11 +495,26 @@ std::vector<torch::Tensor> fastgrnn_unroll_cuda_backward(
     d_input[t] = torch::mm(d_precomp, w);
     d_w = torch::addmm(d_w, d_precomp.transpose(0, 1), input[t]);
     d_u = torch::addmm(d_u, d_precomp.transpose(0, 1), prev_h_);
-    // grad_curr_h = d_old_h;
   }
   d_bias_z = d_bias_z.sum(0, true);
   d_bias_h_prime = d_bias_h_prime.sum(0, true);
   d_zeta = (d_zeta.sum(0, true)).sum(1, true);
   d_nu = (d_nu.sum(0, true)).sum(1, true);
-  return {d_input, d_w, d_u, d_bias_z, d_bias_h_prime, d_zeta, d_nu, d_old_h};
-}
+  // Factor gradients are declared at function scope so the return statement can
+  // see them; they stay empty tensors for any matrix that is not low rank.
+  torch::Tensor d_w1 = torch::empty(0);
+  torch::Tensor d_w2 = torch::empty(0);
+  torch::Tensor d_u1 = torch::empty(0);
+  torch::Tensor d_u2 = torch::empty(0);
+  if (w_low_rank) {
+    d_w1 = torch::mm(w2.transpose(0, 1), d_w);
+    d_w2 = torch::mm(d_w, w1.transpose(0, 1));
+    d_w = torch::empty(0);
+  }
+  if (u_low_rank) {
+    d_u1 = torch::mm(u2.transpose(0, 1), d_u);
+    d_u2 = torch::mm(d_u, u1.transpose(0, 1));
+    d_u = torch::empty(0);
+  }
+  return {d_input, d_bias_z, d_bias_h_prime, d_zeta, d_nu, d_old_h, d_w, d_u, d_w1, d_w2, d_u1, d_u2};
+}
diff --git a/pytorch/edgeml_pytorch/graph/rnn.py b/pytorch/edgeml_pytorch/graph/rnn.py
@@ -329,7 +329,7 @@ def __init__(self, input_size, hidden_size, gate_non_linearity="sigmoid", zetaIn
         self._nuInit = nuInit
         self._name = name
         self._gate_non_linearity = NON_LINEARITY[gate_non_linearity]
-        self.W = nn.Parameter(0.1 * torch.randn([input_size, hidden_size]))
+        self.W = nn.Parameter(0.1 * torch.randn([hidden_size, input_size]))
         self.U = nn.Parameter(0.1 * torch.randn([hidden_size, hidden_size]))
 
         self.bias_gate = nn.Parameter(torch.ones([1, hidden_size]))
@@ -1065,7 +1065,8 @@ def forward(self, input, hiddenState=None, cellState=None):
 
 class FastGRNNCUDA(nn.Module):
     """Unrolled implementation of the FastGRNNCUDACell"""
-    def __init__(self, input_size, hidden_size, gate_non_linearity="sigmoid", zetaInit=1.0, nuInit=-4.0, name="FastGRNNCUDACell"):
+    def __init__(self, input_size, hidden_size, gate_nonlinearity="sigmoid",
+                 update_nonlinearity="tanh", wRank=None, uRank=None, zetaInit=1.0, nuInit=-4.0, name="FastGRNNCUDACell"):
         super(FastGRNNCUDA, self).__init__()
         if utils.findCUDA() is None:
             raise Exception('FastGRNNCUDA is supported only on GPU devices.')
@@ -1075,7 +1076,34 @@ def __init__(self, input_size, hidden_size, gate_non_linearity="sigmoid", zetaIn
         self._zetaInit = zetaInit
         self._nuInit = nuInit
         self._name = name
-        self._gate_non_linearity = NON_LINEARITY[gate_non_linearity]
+    
+        if wRank is not None:
+            self._num_W_matrices += 1
+            self._num_weight_matrices[0] = self._num_W_matrices
+        if uRank is not None:
+            self._num_U_matrices += 1
+            self._num_weight_matrices[1] = self._num_U_matrices
+        self._name = name
+
+        if wRank is None:
+            self.W = nn.Parameter(0.1 * torch.randn([hidden_size, input_size]))
+            self.W1 = torch.empty(0)
+            self.W2 = torch.empty(0)
+        else:
+            self.W = torch.empty(0)
+            self.W1 = nn.Parameter(0.1 * torch.randn([wRank, input_size]))
+            self.W2 = nn.Parameter(0.1 * torch.randn([hidden_size, wRank]))
+
+        if uRank is None:
+            self.U = nn.Parameter(0.1 * torch.randn([hidden_size, hidden_size]))
+            self.U1 = torch.empty(0)
+            self.U2 = torch.empty(0)
+        else:
+            self.U = torch.empty(0)
+            self.U1 = nn.Parameter(0.1 * torch.randn([uRank, hidden_size]))
+            self.U2 = nn.Parameter(0.1 * torch.randn([hidden_size, uRank]))
+
+        self._gate_non_linearity = NON_LINEARITY[gate_nonlinearity]
         self.W = nn.Parameter(0.1 * torch.randn([input_size, hidden_size]))
         self.U = nn.Parameter(0.1 * torch.randn([hidden_size, hidden_size]))
 
@@ -1086,9 +1114,12 @@ def __init__(self, input_size, hidden_size, gate_non_linearity="sigmoid", zetaIn
 
     def forward(self, input, h_state, cell_state=None):
         # input: [timesteps, batch, features, state_size]
-        return FastGRNNUnrollFunction.apply(input, self.W, self.U, self.bias_gate, self.bias_update, self.zeta, self.nu, h_state, self._gate_non_linearity)
+        return FastGRNNUnrollFunction.apply(input, self.bias_gate, self.bias_update, self.zeta, self.nu, h_state,
+            self.W, self.U, self.W1, self.W2, self.U1, self.U2, self._gate_non_linearity)
 
     def getVars(self):
-        return [self.W, self.U, self.bias_gate, self.bias_update, self.zeta, self.nu]
+        w_vars = [self.W] if self.W1.size(0) == 0 else [self.W1, self.W2]  # empty W1 => full-rank W
+        u_vars = [self.U] if self.U1.size(0) == 0 else [self.U1, self.U2]  # U is chosen independently of W
+        return w_vars + u_vars + [self.bias_gate, self.bias_update, self.zeta, self.nu]
 
 class SRNN2(nn.Module):
@@ -1225,10 +1256,10 @@ def backward(ctx, grad_h):
 
 class FastGRNNUnrollFunction(Function):
     @staticmethod
-    def forward(ctx,  input, w, u, bias_gate, bias_update, zeta, nu, old_h, gate_non_linearity):
-        outputs = fastgrnn_cuda.forward_unroll(input, w, u, bias_gate, bias_update, zeta, nu, old_h, gate_non_linearity)
+    def forward(ctx, input, bias_gate, bias_update, zeta, nu, old_h, w, u, w1, w2, u1, u2, gate_non_linearity):
+        outputs = fastgrnn_cuda.forward_unroll(input, w, u, bias_gate, bias_update, zeta, nu, old_h, gate_non_linearity, w1, w2, u1, u2)
         hidden_states = outputs[0]
-        variables = [input, hidden_states, zeta, nu, w, u] + outputs[1:] + [old_h]
+        variables = [input, hidden_states, zeta, nu, w, u] + outputs[1:] + [old_h, w1, w2, u1, u2]
         ctx.save_for_backward(*variables)
         ctx.gate_non_linearity = gate_non_linearity
         return hidden_states
@@ -1237,5 +1268,4 @@ def forward(ctx,  input, w, u, bias_gate, bias_update, zeta, nu, old_h, gate_non
     def backward(ctx, grad_h):
         outputs = fastgrnn_cuda.backward_unroll(
-            grad_h.contiguous(), *ctx.saved_variables, ctx.gate_non_linearity)
-        d_input, d_w, d_u, d_bias_gate, d_bias_update, d_zeta, d_nu, d_old_h = outputs
-        return d_input, d_w, d_u, d_bias_gate, d_bias_update, d_zeta, d_nu, d_old_h
+            grad_h.contiguous(), *ctx.saved_tensors, ctx.gate_non_linearity)
+        return tuple(outputs + [None])  # trailing None: the integer gate_non_linearity input has no gradient