
Commit f58043d

Merge pull request #20 from neu-spiral/feature/large_runs
Feature/large runs
2 parents 9174bd7 + 59693f9

5 files changed: +694 -518 lines


create_mean_images.sh

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
#!/bin/sh

for file in $1/*.nii  ## path to relevant dataset group
do
    fslmaths "$file" -Tmean -bin "${file}_mean"
done

fslmerge -t $1/allmeanmasks4d $1/*.nii.gz
fslmaths $1/allmeanmasks4d -Tmean $1/propDatavox3d
fslmaths $1/propDatavox3d -thr 1 $1/wholebrain
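Usage (hypothetical path): sh create_mean_images.sh /path/to/group_dir, where the argument is the directory holding the group's .nii files; the script writes the per-file mean masks plus allmeanmasks4d, propDatavox3d, and wholebrain back into that directory.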

creating_mask.txt

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
What you want to do is first get a mean (across time) image for each 4D file and then binarize it*.

To do this, run fslmaths on each 4D file:

fslmaths 4D_inputVolume1 -Tmean -bin 3d_meanmask1
fslmaths 4D_inputVolume2 -Tmean -bin 3d_meanmask2
...
fslmaths 4D_inputVolumeN -Tmean -bin 3d_meanmaskN

Then we want the proportion of subjects who have data at each voxel. We get this by merging all the 3D masks into a single 4D file and taking the mean across the 4th dimension:

fslmerge -t allmeanmasks4d 3d_meanmask1 3d_meanmask2 ... 3d_meanmaskN

fslmaths allmeanmasks4d -Tmean propDatavox3d

Looking at this file gives a sense of how across-subject alignment went and where data drop-out is consistent or spotty.

Lastly, make this a binary mask that is 1 where ALL subjects have data and 0 elsewhere (saved as wholebrain.nii.gz):

fslmaths propDatavox3d -thr 1 wholebrain


*Note that if the data are already z-scored (they are not for greeneyes), this won't work: the mean will be ~0 at each voxel, so the binarize operation (turn non-zeros into 1) will behave badly. In that case you would probably have to binarize each volume first, take the mean, and then binarize again.
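As a hedged illustration only (not part of this commit), one way to carry out that z-scored alternative in Python with nibabel and numpy, using placeholder filenames:

import nibabel as nib
import numpy as np

# One reading of the z-scored alternative: binarize each timepoint,
# average over time, then binarize again so the mask is 1 wherever
# this subject ever has data.
img = nib.load('4D_inputVolume1.nii')                 # placeholder filename
data = img.get_fdata()                                # shape (x, y, z, t)
nonzero = (data != 0).astype(np.float32)              # turn non-zeros into 1
mean_mask = (nonzero.mean(axis=-1) > 0).astype(np.float32)
nib.save(nib.Nifti1Image(mean_mask, img.affine), '3d_meanmask1.nii.gz')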

htfa_torch/htfa.py

Lines changed: 52 additions & 28 deletions
@@ -29,10 +29,16 @@
 
 class HierarchicalTopographicFactorAnalysis:
     """Overall container for a run of TFA"""
-    def __init__(self, data_files, num_factors=tfa_models.NUM_FACTORS):
+    def __init__(self, data_files, num_factors=tfa_models.NUM_FACTORS,
+                 mask=None):
         self.num_factors = num_factors
         self.num_subjects = len(data_files)
-        datasets = [utils.load_dataset(data_file) for data_file in data_files]
+        if mask is None:
+            raise ValueError('please provide a mask')
+        else:
+            self.mask = mask
+        datasets = [utils.load_dataset(data_file, mask=mask)
+                    for data_file in data_files]
         self.voxel_activations = [dataset[0] for dataset in datasets]
         self._images = [dataset[1] for dataset in datasets]
         self.voxel_locations = [dataset[2] for dataset in datasets]
@@ -52,25 +58,30 @@ def __init__(self, data_files, num_factors=tfa_models.NUM_FACTORS):
 
     def train(self, num_steps=10, learning_rate=tfa.LEARNING_RATE,
               log_level=logging.WARNING, num_particles=tfa_models.NUM_PARTICLES,
-              use_cuda=True):
+              batch_size=64, use_cuda=True):
         """Optimize the variational guide to reflect the data for `num_steps`"""
         logging.basicConfig(format='%(asctime)s %(message)s',
                             datefmt='%m/%d/%Y %H:%M:%S',
                             level=log_level)
-
-        activations = [{'Y': Variable(self.voxel_activations[s])}
-                       for s in range(self.num_subjects)]
+        activations = torch.Tensor(max(self.num_times), max(self.num_voxels),
+                                   len(self.voxel_activations))
+        for s in range(self.num_subjects):
+            activations[:, :, s] = self.voxel_activations[s]
+        activations_loader = torch.utils.data.DataLoader(
+            torch.utils.data.TensorDataset(
+                activations,
+                torch.zeros(activations.shape[0])
+            ),
+            batch_size=batch_size
+        )
         if tfa.CUDA and use_cuda:
             enc = torch.nn.DataParallel(self.enc)
             dec = torch.nn.DataParallel(self.dec)
             enc.cuda()
-            dec.cuda()
-            for acts in activations:
-                acts['Y'] = acts['Y'].cuda()
+            dec.cuda(0)
         else:
             enc = self.enc
             dec = self.dec
-
         optimizer = torch.optim.Adam(list(self.enc.parameters()),
                                      lr=learning_rate)
         enc.train()
@@ -81,24 +92,37 @@ def train(self, num_steps=10, learning_rate=tfa.LEARNING_RATE,
 
         for epoch in range(num_steps):
             start = time.time()
-
-            optimizer.zero_grad()
-            q = probtorch.Trace()
-            enc(q, num_particles=num_particles)
-            p = probtorch.Trace()
-            dec(p, guide=q, observations=activations)
-
-            free_energies[epoch] = tfa.free_energy(q, p, num_particles=num_particles)
-            lls[epoch] = tfa.log_likelihood(q, p, num_particles=num_particles)
-
-            free_energies[epoch].backward()
-            optimizer.step()
-
-            if tfa.CUDA and use_cuda:
-                free_energies[epoch] = free_energies[epoch].cpu()
-                lls[epoch] = lls[epoch].cpu()
-            free_energies[epoch] = free_energies[epoch].data.numpy().sum(0)
-            lls[epoch] = lls[epoch].data.numpy().sum(0)
+            epoch_free_energies = list(range(len(activations_loader)))
+            epoch_lls = list(range(len(activations_loader)))
+
+            for (batch, (data, _)) in enumerate(activations_loader):
+                activations = [{'Y': Variable(data[:, :, s])}
+                               for s in range(self.num_subjects)]
+                trs = (batch * batch_size, None)
+                trs = (trs[0], trs[0] + activations[0]['Y'].shape[0])
+
+                optimizer.zero_grad()
+                q = probtorch.Trace()
+                enc(q, times=trs, num_particles=num_particles)
+                p = probtorch.Trace()
+                dec(p, times=trs, guide=q, observations=activations)
+
+                epoch_free_energies[batch] =\
+                    tfa.free_energy(q, p, num_particles=num_particles)
+                epoch_lls[batch] =\
+                    tfa.log_likelihood(q, p, num_particles=num_particles)
+                epoch_free_energies[batch].backward()
+                optimizer.step()
+                if tfa.CUDA and use_cuda:
+                    epoch_free_energies[batch] = epoch_free_energies[batch].cpu().data.numpy()
+                    epoch_lls[batch] = epoch_lls[batch].cpu().data.numpy()
+
+            free_energies[epoch] = np.array(epoch_free_energies).sum(0)
+            free_energies[epoch] = free_energies[epoch].sum(0)
+            lls[epoch] = np.array(epoch_lls).sum(0)
+            lls[epoch] = lls[epoch].sum(0)
 
             end = time.time()
             msg = tfa.EPOCH_MSG % (epoch + 1, (end - start) * 1000, free_energies[epoch])
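For orientation (not part of the diff), a hypothetical usage sketch of the updated interface, assuming the package imports as htfa_torch and using made-up paths and hyperparameters: the constructor now requires a mask, and train() accepts a batch_size controlling how many time points each mini-batch drawn from the internal DataLoader covers.

from htfa_torch.htfa import HierarchicalTopographicFactorAnalysis

# Hypothetical example; file names and settings are placeholders.
htfa = HierarchicalTopographicFactorAnalysis(
    ['sub01.nii', 'sub02.nii'],   # per-subject 4D NIfTI files
    num_factors=50,
    mask='wholebrain.nii.gz',     # e.g. the mask produced by create_mean_images.sh
)
htfa.train(num_steps=100, batch_size=64, use_cuda=True)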

htfa_torch/utils.py

Lines changed: 2 additions & 2 deletions
@@ -141,10 +141,10 @@ def cmu2nii(activations, locations, template):
 
     return nib.Nifti1Image(data, affine=sform)
 
-def load_dataset(data_file):
+def load_dataset(data_file, mask=None):
     name, ext = os.path.splitext(data_file)
     if ext == '.nii':
-        dataset, image = nii2cmu(data_file)
+        dataset, image = nii2cmu(data_file, mask_file=mask)
         template = data_file
     else:
         dataset = sio.loadmat(data_file)
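And a hedged sketch of calling the updated loader directly (paths are placeholders; the tuple indices mirror how htfa.py consumes the return value):

from htfa_torch import utils

# Hypothetical call against the new signature; `mask` is forwarded to nii2cmu as mask_file.
dataset = utils.load_dataset('sub01.nii', mask='wholebrain.nii.gz')
activations, image, locations = dataset[0], dataset[1], dataset[2]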

notebooks/example_htfa.ipynb

Lines changed: 606 additions & 488 deletions
Large diffs are not rendered by default.
