ViT not working for multi-class classification #4525

fengling0410 · 2022-06-17T03:18:35Z

fengling0410
Jun 17, 2022

Hi all, I'm doing self-supervised learning for a CT scan ViT encoder. The pre-training task is rotation where I rotate the image patches according to 10 directions (so I have 10 classes in total) and let the ViT classification model predict the class. However, I find that the ViT classification model doesn't work. The model constantly gives the same prediction for all the inputs. But when I replace the ViT with DenseNet121, it works. I'm pretty confused about this and want to ask the community for advice.

Here is my code:

class RotationDataset(dataset):
    """Self-supervised rotation-prediction dataset built from CT volumes.

    Each item loads one volume named in ``train_data.json`` (training) or
    ``val_data.json`` (validation), preprocesses it, draws several random
    crops and rotates every crop by a randomly chosen rotation.  The index
    of the applied rotation (0-9) is the classification target.

    Args:
        file_dir: Directory containing ``train_data.json`` / ``val_data.json``.
            Each file holds a list of records with an ``"image"`` path entry.
        train: Truthy selects the training list (first 100 entries are kept),
            falsy selects the validation list (first 50 entries are kept).
        patch_size: Edge length of the cubic crops.
    """

    # Rotation classes, indexed by label.  Each entry is the ``(k, axes)``
    # pair passed to ``np.rot90``; ``None`` means "leave the crop unrotated".
    # Plane names follow the original author's convention for a channel-first
    # array (axis 0 is the channel, axes 1-3 are spatial).
    _ROTATIONS = (
        None,          # 0: identity
        (1, (1, 2)),   # 1:  90 deg, xy plane
        (2, (1, 2)),   # 2: 180 deg, xy plane
        (3, (1, 2)),   # 3: 270 deg, xy plane
        (1, (2, 3)),   # 4:  90 deg, yz plane
        (2, (2, 3)),   # 5: 180 deg, yz plane
        (3, (2, 3)),   # 6: 270 deg, yz plane
        (1, (1, 3)),   # 7:  90 deg, xz plane
        (2, (1, 3)),   # 8: 180 deg, xz plane
        (3, (1, 3)),   # 9: 270 deg, xz plane
    )

    def __init__(self, file_dir, train=True, patch_size=96):
        self.patch_size = patch_size
        # Pick the file list and how many of its entries to keep.
        if train:
            list_name, limit = 'train_data.json', 100
        else:
            list_name, limit = 'val_data.json', 50
        with open(os.path.join(file_dir, list_name)) as f:
            records = json.load(f)
        self.img_files = [x["image"] for x in records][:limit]

    def __getitem__(self, index):
        """Return ``(rotated_img, label)`` for the volume at ``index``.

        ``rotated_img`` is a tensor stacking the rotated crops of this volume
        and ``label`` is an int64 tensor holding one rotation index per crop.
        """
        size = (self.patch_size, self.patch_size, self.patch_size)

        ct_path = self.img_files[index]
        # NOTE(review): the ``[0]`` indexing assumes these MONAI transforms
        # return an (array, metadata/affine, ...) tuple -- confirm for the
        # installed MONAI version.
        ct_array = LoadImage()(ct_path)[0]
        ct_array = AddChannel()(ct_array)

        ct_array = ct_array.astype(np.float32)
        ct_array = Orientation(axcodes="RAS")(ct_array)[0]
        ct_array = Spacing(pixdim=(2, 2, 2), mode="nearest")(ct_array)[0]
        # Pad so that every volume is at least one patch wide in each axis.
        ct_array = SpatialPad(spatial_size=size)(ct_array)
        # Map the intensity window [-57, 164] onto [0, 1], clipping outside it.
        ct_array = ScaleIntensityRange(a_min=-57, a_max=164, b_min=0.0, b_max=1.0, clip=True)(ct_array)
        ct_array = CropForeground()(ct_array)

        # The scaled image doubles as its own sampling "label" for the crop
        # centres.  Presumably this yields a list of ``num_samples`` (4)
        # channel-first crops of ``size`` -- TODO confirm.
        crops = RandCropByPosNegLabel(
            label=ct_array,
            spatial_size=size,
            pos=1,
            neg=1,
            num_samples=4,
            image_threshold=0,
        )(ct_array)

        rotated, labels = [], []
        for crop in crops:
            img_i, label_i = self._rotate(crop)
            # ``np.rot90`` returns a view; copy so each sample owns its data.
            rotated.append(img_i.copy())
            labels.append(label_i)

        rotated_img = torch.from_numpy(np.array(rotated))
        # Class-index targets are pinned to int64: NumPy's default integer
        # dtype is platform-dependent, PyTorch's cross-entropy expects long.
        label = torch.from_numpy(np.array(labels, dtype=np.int64))
        return rotated_img, label

    def _rotate(self, img):
        """Apply one uniformly drawn rotation class to ``img``.

        Returns:
            ``(img, rot)``: the (possibly rotated) array and the index of the
            rotation that was applied, as listed in ``_ROTATIONS``.
        """
        rot = np.random.randint(len(self._ROTATIONS))
        spec = self._ROTATIONS[rot]
        if spec is not None:
            k, axes = spec
            img = np.rot90(img, k=k, axes=axes)
        return img, rot

    def __len__(self):
        """Number of volumes (not crops) in the dataset."""
        return len(self.img_files)

# Datasets: rotation-prediction samples from the train / validation lists.
train_data = RotationDataset("simclr_tcia", train=True, patch_size=96)
val_data = RotationDataset("simclr_tcia", train=False, patch_size=96)

# Loaders: one volume per batch; each dataset item already stacks several
# crops, so the training/validation code squeezes the loader's leading axis.
train_data_loader = DataLoader(
    train_data,
    batch_size=1,
    shuffle=True,
    num_workers=8,
)
val_data_loader = DataLoader(
    val_data,
    batch_size=1,
    shuffle=True,
    num_workers=8,
)

# 3D ViT classifier over single-channel 96^3 crops, 10 rotation classes.
# NOTE(review): depending on the MONAI version, the ViT classification head
# may end in a Tanh activation by default, which would bound the logits fed
# to cross-entropy -- confirm against the installed version.
model = ViT(
    in_channels=1,
    img_size=(96, 96, 96),
    patch_size=(16, 16, 16),
    hidden_size=768,
    mlp_dim=3072,
    pos_embed='conv',
    spatial_dims=3,
    classification=True,
    num_classes=10,
).to(device)


def train(train_loader, model, loss_function, optimizer, curr_epoch):
    """Run one training epoch and return the mean loss per loader batch.

    Args:
        train_loader: Iterable of ``(inputs, labels)`` pairs; must support
            ``len()``.
        model: Network to train; element ``[0]`` of its output is used as the
            class scores.
        loss_function: Callable ``loss_function(outputs, labels)`` returning a
            scalar tensor.
        optimizer: Optimizer stepping ``model``'s parameters.
        curr_epoch: Epoch number, used only for the progress-bar text.

    Returns:
        Sum of the per-batch losses divided by ``len(train_loader)``.
    """
    epoch_avg_loss = 0
    model.train()
    batches = tqdm(train_loader, total=len(train_loader))
    # Show a real description until the first loss is available (the previous
    # text was an unformatted template with literal ``{}`` placeholders).
    batches.set_description('Train: [epoch {}]'.format(curr_epoch))

    for inputs, labels in batches:
        optimizer.zero_grad()
        # Drop the loader's leading axis (if it has size 1) so that the
        # stacked samples inside one item act as the batch dimension.
        inputs = torch.squeeze(inputs, dim=0).to(device)
        labels = torch.squeeze(labels, dim=0).to(device)

        # The model output is indexed with [0] to get the class scores; a
        # model that returns a plain tensor needs ``model(inputs)`` instead.
        outputs = model(inputs)[0]
        loss = loss_function(outputs, labels)

        # backward propagation
        loss.backward()
        optimizer.step()

        epoch_avg_loss += loss.item()
        batches.set_description('Train: [epoch {}], [cross_entropy_loss {:0.6f}]'.format(curr_epoch, loss.item()))
    return epoch_avg_loss / len(train_loader)


def validate(val_loader, model, loss_function, curr_epoch):
    """Evaluate ``model`` on ``val_loader`` and return the overall accuracy.

    Args:
        val_loader: Iterable of ``(inputs, labels)`` pairs; must support
            ``len()``.
        model: Network to evaluate; element ``[0]`` of its output is used as
            the class scores.
        loss_function: Unused; kept so existing callers keep working.
        curr_epoch: Epoch number, used only for the progress-bar text.

    Returns:
        Fraction of samples whose arg-max prediction equals the label.
    """
    model.eval()
    total_correct_count = 0
    total_count = 0

    batches = tqdm(val_loader, total=len(val_loader))
    # Show a real description until the first accuracy is available (the
    # previous text was an unformatted template with literal placeholders).
    batches.set_description('Validation: [epoch {}]'.format(curr_epoch))

    for inputs, labels in batches:
        inputs = inputs.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            # Drop the loader's leading axis (if it has size 1) so that the
            # stacked samples inside one item act as the batch dimension.
            inputs = torch.squeeze(inputs, dim=0)
            labels = torch.squeeze(labels, dim=0)
            # The model output is indexed with [0] to get the class scores; a
            # model that returns a plain tensor needs ``model(inputs)``.
            outputs = model(inputs)[0]
            pred = torch.argmax(outputs, dim=1)

            # update the running accuracy
            correct_count = torch.eq(pred, labels)
            total_correct_count += correct_count.sum().item()
            total_count += len(pred)
            current_acc = total_correct_count / total_count

            batches.set_description('Validation: [epoch {}], [accuracy {:0.6f}]'.format(curr_epoch, current_acc))

    return total_correct_count / total_count  # overall accuracy, not a loss


# Per-epoch history of the mean training loss and the validation accuracy.
train_loss = []
val_acc = []

# NOTE: ``max_epoch``, ``loss_function``, ``optimizer``, ``model_name`` and
# the initial ``highest_accuracy`` are expected to be defined before this
# point; they are not set in this snippet.
for epoch in range(max_epoch):

    train_l = train(train_data_loader, model, loss_function, optimizer, epoch)
    print('Train: [epoch {}], [Avg loss {:0.2f}]'.format(epoch, train_l))

    # validate() returns only the accuracy, so no validation loss is reported
    # (the previous message formatted an undefined ``val_l``).
    val_a = validate(val_data_loader, model, loss_function, epoch)
    print('Val: [epoch {}], [Avg accuracy {:0.2f}]'.format(epoch, val_a))

    train_loss.append(train_l)
    val_acc.append(val_a)

    # Checkpoint only when the validation accuracy improves on the best so
    # far (the previous ``<`` saved the model whenever accuracy got worse).
    if val_a > highest_accuracy:
        highest_accuracy = val_a
        torch.save(model.state_dict(), model_name)
        print("Save Model!")

After I have replaced ViT model with model = monai.networks.nets.DenseNet121(spatial_dims=3, in_channels=1, out_channels=4).to(device), the code works well. I would appreciate any advice from the community. Thank you in advance!

finalelement · 2022-06-17T14:56:36Z

finalelement
Jun 17, 2022
Maintainer

Thanks for starting off the discussion, @fengling0410. I noticed a few things here:

1.) Why is it that you are only using out_channels=4 with DenseNet121 as compared to num_classes=10 for ViT, is it because you tested 2D rotation with DenseNet as a classification task?

2.) If the answer to above question is "yes", I would suggest to try the same with ViT first, before claiming it does not work. :)

3.) I would also encourage you to share the training and validation curves for the DenseNet and then the same for ViT, to be able to help you out better.

As an additional point, I would also ask whether 3D rotation set up this way makes for a well-posed problem, because for classification one usually goes for a 4-class rotation by fixing an axis. If you have a paper/reference for 3D rotation classification, please share it; it would help in understanding the use case and also what you have implemented.

1 reply

fengling0410 Jun 17, 2022
Author

Thank you so much for your reply @finalelement.

1.) It's my bad. I was testing the classification based only on the rotation about the x-axis (so I had four labels), and that's why I put out_channels=4 for DenseNet121. DenseNet121 works for both cases (4 classes or 10 classes), but ViT fails for both.
3.) Sure, here are the training loss curve and validation accuracy curve for DenseNet121 and ViT. Here I tried with 10 classes. Both models are trained with exactly the same settings (learning rate, epochs, batch size, ...).
This is the plot of DenseNet121:

This is the plot of ViT:

I was inspired by the method used in the paper "3D Self-Supervised Methods for Medical Imaging" by Aiham Taleb and others. In that paper they did experiment on 10-class rotation and achieved a comparatively good result, and I would like to apply their method to my work.

Thank you again for your reply, do you have any thoughts on this? I would appreciate any suggestion :)

finalelement · 2022-06-20T23:02:13Z

finalelement
Jun 20, 2022
Maintainer

@fengling0410 Thanks for sharing the reference and also the plots. It does seem like the training loss is going down; however, the validation plot of ViT does not reflect the same. It seems like a high learning rate is being used.

Can you try lowering the learning rate for the ViT model, because the training loss is decreasing (it's just not smooth enough), and try it only with 4 classes for now?

3 replies

fengling0410 Jun 22, 2022
Author

@finalelement Thank you so much for your suggestions. I've tried learning rates of 1e-5, 1e-4 and 1e-3 with 10 classes, but all of them failed. Here are the training plots:
learning rate = 1e-5

learning rate = 1e-4

learning rate = 1e-3

It seems like the training loss ends up around 2.3 (approximately ln 10, i.e. the cross-entropy of a uniform guess over 10 classes), at which point the classification predictions are constant (0 or 1 or 2 ... for all the inputs). May I ask what issue could possibly lead to this kind of result? I would appreciate any insights about this :)

finalelement Jul 1, 2022
Maintainer

@fengling0410 I appreciate your persistence, can you check out this post, might be of use: #4129 (comment)

fengling0410 Jul 13, 2022
Author

Thanks @finalelement. I guess that's the reason. Thank you again for all your help :)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

ViT not working for multi-class classification #4525

Uh oh!

{{title}}

Uh oh!

Replies: 2 comments 4 replies

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{editor}}'s edit

{{editor}}'s edit

Uh oh!

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{editor}}'s edit

{{editor}}'s edit

Uh oh!

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Select a reply

Uh oh!

ViT not working for multi-class classification #4525

Uh oh!

fengling0410 Jun 17, 2022

Replies: 2 comments · 4 replies

Uh oh!

Uh oh!

finalelement Jun 17, 2022 Maintainer

Uh oh!

Uh oh!

fengling0410 Jun 17, 2022 Author

Uh oh!

finalelement Jun 20, 2022 Maintainer

Uh oh!

fengling0410 Jun 22, 2022 Author

Uh oh!

finalelement Jul 1, 2022 Maintainer

Uh oh!

fengling0410 Jul 13, 2022 Author

fengling0410
Jun 17, 2022

Replies: 2 comments 4 replies

finalelement
Jun 17, 2022
Maintainer

fengling0410 Jun 17, 2022
Author

finalelement
Jun 20, 2022
Maintainer

fengling0410 Jun 22, 2022
Author

finalelement Jul 1, 2022
Maintainer

fengling0410 Jul 13, 2022
Author