Commit 8fe84f9

Merge pull request #11 from msubedar/dnn2bnn
Adds a new feature that builds a Bayesian deep neural network from a predefined deterministic model architecture: Convolutional, Linear, and LSTM layers are replaced in place with their Bayesian counterparts, configured through 'const_bnn_prior_parameters' to set the prior parameters and the sampling estimator (Reparameterization or Flipout). This PR enables seamless conversion of existing large-model topologies into Bayesian models, extending them toward uncertainty-aware applications.
2 parents daa7292 + b98ff19 commit 8fe84f9
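In practice, the conversion is meant to be a one-liner around an existing torch.nn model. Below is a minimal sketch of the intended flow; the helper names dnn_to_bnn and get_kl_loss and their import path are assumptions based on the example script added in this PR, not a verbatim quote of its API:

    import torch
    import torchvision.models as models
    from bayesian_torch.models.dnn_to_bnn import dnn_to_bnn, get_kl_loss  # path assumed

    const_bnn_prior_parameters = {
        "prior_mu": 0.0,
        "prior_sigma": 1.0,
        "posterior_mu_init": 0.0,
        "posterior_rho_init": -3.0,
        "type": "Flipout",  # or "Reparameterization"
    }

    model = models.resnet18()
    dnn_to_bnn(model, const_bnn_prior_parameters)  # swap layers in place

    x = torch.randn(2, 3, 224, 224)
    out = model(x)           # forward signature unchanged from the DNN
    kl = get_kl_loss(model)  # KL term gathered from the Bayesian layers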

File tree

9 files changed: +968 −132 lines changed

bayesian_torch/examples/main_bayesian_cifar_dnn2bnn.py

Lines changed: 522 additions & 0 deletions
Large diffs are not rendered by default.

bayesian_torch/layers/base_variational_layer.py

Lines changed: 9 additions & 0 deletions
@@ -34,6 +34,15 @@
 class BaseVariationalLayer_(nn.Module):
     def __init__(self):
         super().__init__()
+        self._dnn_to_bnn_flag = False
+
+    @property
+    def dnn_to_bnn_flag(self):
+        return self._dnn_to_bnn_flag
+
+    @dnn_to_bnn_flag.setter
+    def dnn_to_bnn_flag(self, value):
+        self._dnn_to_bnn_flag = value
 
     def kl_div(self, mu_q, sigma_q, mu_p, sigma_p):
         """

bayesian_torch/layers/flipout_layers/conv_flipout.py

Lines changed: 51 additions & 23 deletions
@@ -154,6 +154,9 @@ def init_parameters(self):
 
     def forward(self, x, return_kl=True):
 
+        if self.dnn_to_bnn_flag:
+            return_kl = False
+
         # linear outputs
         outputs = F.conv1d(x,
                            weight=self.mu_kernel,
@@ -173,16 +176,18 @@ def forward(self, x, return_kl=True):
 
         delta_kernel = (sigma_weight * eps_kernel)
 
-        kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu,
-                         self.prior_weight_sigma)
+        if return_kl:
+            kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu,
+                             self.prior_weight_sigma)
 
         bias = None
         if self.bias:
             sigma_bias = torch.log1p(torch.exp(self.rho_bias))
             eps_bias = self.eps_bias.data.normal_()
             bias = (sigma_bias * eps_bias)
-            kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu,
-                                  self.prior_bias_sigma)
+            if return_kl:
+                kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu,
+                                      self.prior_bias_sigma)
 
         # perturbed feedforward
         perturbed_outputs = F.conv1d(x * sign_input,
@@ -308,6 +313,9 @@ def init_parameters(self):
 
     def forward(self, x, return_kl=True):
 
+        if self.dnn_to_bnn_flag:
+            return_kl = False
+
         # linear outputs
         outputs = F.conv2d(x,
                            weight=self.mu_kernel,
@@ -327,16 +335,18 @@ def forward(self, x, return_kl=True):
 
         delta_kernel = (sigma_weight * eps_kernel)
 
-        kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu,
-                         self.prior_weight_sigma)
+        if return_kl:
+            kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu,
+                             self.prior_weight_sigma)
 
         bias = None
         if self.bias:
             sigma_bias = torch.log1p(torch.exp(self.rho_bias))
             eps_bias = self.eps_bias.data.normal_()
             bias = (sigma_bias * eps_bias)
-            kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu,
-                                  self.prior_bias_sigma)
+            if return_kl:
+                kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu,
+                                      self.prior_bias_sigma)
 
         # perturbed feedforward
         perturbed_outputs = F.conv2d(x * sign_input,
@@ -347,7 +357,6 @@ def forward(self, x, return_kl=True):
                                      dilation=self.dilation,
                                      groups=self.groups) * sign_output
 
-        self.kl = kl
         # returning outputs + perturbations
         if return_kl:
             return outputs + perturbed_outputs, kl
@@ -462,6 +471,9 @@ def init_parameters(self):
 
     def forward(self, x, return_kl=True):
 
+        if self.dnn_to_bnn_flag:
+            return_kl = False
+
         # linear outputs
         outputs = F.conv3d(x,
                            weight=self.mu_kernel,
@@ -481,16 +493,18 @@ def forward(self, x, return_kl=True):
 
         delta_kernel = (sigma_weight * eps_kernel)
 
-        kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu,
-                         self.prior_weight_sigma)
+        if return_kl:
+            kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu,
+                             self.prior_weight_sigma)
 
         bias = None
         if self.bias:
             sigma_bias = torch.log1p(torch.exp(self.rho_bias))
             eps_bias = self.eps_bias.data.normal_()
             bias = (sigma_bias * eps_bias)
-            kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu,
-                                  self.prior_bias_sigma)
+            if return_kl:
+                kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu,
+                                      self.prior_bias_sigma)
 
         # perturbed feedforward
         perturbed_outputs = F.conv3d(x * sign_input,
@@ -612,6 +626,9 @@ def init_parameters(self):
 
     def forward(self, x, return_kl=True):
 
+        if self.dnn_to_bnn_flag:
+            return_kl = False
+
         # linear outputs
         outputs = F.conv_transpose1d(x,
                                      weight=self.mu_kernel,
@@ -631,16 +648,18 @@ def forward(self, x, return_kl=True):
 
         delta_kernel = (sigma_weight * eps_kernel)
 
-        kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu,
-                         self.prior_weight_sigma)
+        if return_kl:
+            kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu,
+                             self.prior_weight_sigma)
 
         bias = None
         if self.bias:
             sigma_bias = torch.log1p(torch.exp(self.rho_bias))
             eps_bias = self.eps_bias.data.normal_()
             bias = (sigma_bias * eps_bias)
-            kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu,
-                                  self.prior_bias_sigma)
+            if return_kl:
+                kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu,
+                                      self.prior_bias_sigma)
 
         # perturbed feedforward
         perturbed_outputs = F.conv_transpose1d(
@@ -767,6 +786,9 @@ def init_parameters(self):
 
     def forward(self, x, return_kl=True):
 
+        if self.dnn_to_bnn_flag:
+            return_kl = False
+
         # linear outputs
         outputs = F.conv_transpose2d(x,
                                      bias=self.mu_bias,
@@ -786,16 +808,18 @@ def forward(self, x, return_kl=True):
 
         delta_kernel = (sigma_weight * eps_kernel)
 
-        kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu,
-                         self.prior_weight_sigma)
+        if return_kl:
+            kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu,
+                             self.prior_weight_sigma)
 
         bias = None
         if self.bias:
             sigma_bias = torch.log1p(torch.exp(self.rho_bias))
             eps_bias = self.eps_bias.data.normal_()
             bias = (sigma_bias * eps_bias)
-            kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu,
-                                  self.prior_bias_sigma)
+            if return_kl:
+                kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu,
+                                      self.prior_bias_sigma)
 
         # perturbed feedforward
         perturbed_outputs = F.conv_transpose2d(
@@ -922,6 +946,9 @@ def init_parameters(self):
 
     def forward(self, x, return_kl=True):
 
+        if self.dnn_to_bnn_flag:
+            return_kl = False
+
         # linear outputs
         outputs = F.conv_transpose3d(x,
                                      weight=self.mu_kernel,
@@ -941,8 +968,9 @@ def forward(self, x, return_kl=True):
 
         delta_kernel = (sigma_weight * eps_kernel)
 
-        kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu,
-                         self.prior_weight_sigma)
+        if return_kl:
+            kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu,
+                             self.prior_weight_sigma)
 
         bias = None
         if self.bias:
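The same return_kl gating is applied uniformly across the Conv1d/2d/3d and transposed-convolution Flipout layers, so each now supports two calling conventions. A sketch, with the class name assumed from this repo's layer naming:

    import torch
    from bayesian_torch.layers import Conv2dFlipout  # class name assumed

    conv = Conv2dFlipout(in_channels=3, out_channels=8, kernel_size=3)
    x = torch.randn(1, 3, 16, 16)

    out, kl = conv(x)               # default: KL computed and returned
    out = conv(x, return_kl=False)  # KL computation skipped entirely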

bayesian_torch/layers/flipout_layers/linear_flipout.py

Lines changed: 16 additions & 8 deletions
@@ -90,8 +90,6 @@ def __init__(self,
             torch.Tensor(out_features, in_features),
             persistent=False)
 
-        self.kl = 0
-
         if bias:
             self.mu_bias = nn.Parameter(torch.Tensor(out_features))
             self.rho_bias = nn.Parameter(torch.Tensor(out_features))
@@ -125,21 +123,33 @@ def init_parameters(self):
             self.mu_bias.data.normal_(mean=self.posterior_mu_init, std=0.1)
             self.rho_bias.data.normal_(mean=self.posterior_rho_init, std=0.1)
 
+    def kl_loss(self):
+        sigma_weight = torch.log1p(torch.exp(self.rho_weight))
+        kl = self.kl_div(self.mu_weight, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma)
+        if self.mu_bias is not None:
+            sigma_bias = torch.log1p(torch.exp(self.rho_bias))
+            kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma)
+        return kl
+
     def forward(self, x, return_kl=True):
+        if self.dnn_to_bnn_flag:
+            return_kl = False
         # sampling delta_W
         sigma_weight = torch.log1p(torch.exp(self.rho_weight))
         delta_weight = (sigma_weight * self.eps_weight.data.normal_())
 
         # get kl divergence
-        kl = self.kl_div(self.mu_weight, sigma_weight, self.prior_weight_mu,
-                         self.prior_weight_sigma)
+        if return_kl:
+            kl = self.kl_div(self.mu_weight, sigma_weight, self.prior_weight_mu,
+                             self.prior_weight_sigma)
 
         bias = None
         if self.mu_bias is not None:
             sigma_bias = torch.log1p(torch.exp(self.rho_bias))
             bias = (sigma_bias * self.eps_bias.data.normal_())
-            kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu,
-                                  self.prior_bias_sigma)
+            if return_kl:
+                kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu,
+                                      self.prior_bias_sigma)
 
         # linear outputs
         outputs = F.linear(x, self.mu_weight, self.mu_bias)
@@ -150,8 +160,6 @@ def forward(self, x, return_kl=True):
         perturbed_outputs = F.linear(x * sign_input, delta_weight,
                                      bias) * sign_output
 
-        self.kl = kl
-
         # returning outputs + perturbations
         if return_kl:
             return outputs + perturbed_outputs, kl
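With kl_loss() as a standalone method, the KL term no longer has to ride along with the forward pass; it can be computed on demand after a plain forward call. A brief sketch under the same naming assumptions:

    import torch
    from bayesian_torch.layers import LinearFlipout  # class name assumed

    fc = LinearFlipout(in_features=8, out_features=4)
    out = fc(torch.randn(2, 8), return_kl=False)  # no KL from forward()
    kl = fc.kl_loss()                             # computed on demand instead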

bayesian_torch/layers/flipout_layers/rnn_flipout.py

Lines changed: 8 additions & 0 deletions
@@ -94,8 +94,16 @@ def __init__(self,
             out_features=out_features * 4,
             bias=bias)
 
+    def kl_loss(self):
+        kl_i = self.ih.kl_loss()
+        kl_h = self.hh.kl_loss()
+        return kl_i + kl_h
+
     def forward(self, X, hidden_states=None, return_kl=True):
 
+        if self.dnn_to_bnn_flag:
+            return_kl = False
+
         batch_size, seq_size, _ = X.size()
 
         hidden_seq = []
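Since the Bayesian LSTM cell is composed of two Flipout linear projections (input-to-hidden ih and hidden-to-hidden hh), its kl_loss() simply sums theirs. A sketch, with the class name and constructor arguments assumed:

    from bayesian_torch.layers import LSTMFlipout  # class name assumed

    lstm = LSTMFlipout(in_features=8, out_features=16)
    kl = lstm.kl_loss()  # equals lstm.ih.kl_loss() + lstm.hh.kl_loss()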
