
Commit 74776f2

Merge pull request #46 from MichiganCOG/dev
Dev
2 parents 326c42c + a76fe1e

34 files changed: 2,636 additions, 159 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -13,3 +13,4 @@ runs/*
 models/HGC3D
 *.json
 pbs/*
+*.pt

README.md

Lines changed: 31 additions & 1 deletion
@@ -1,4 +1,4 @@
-# Video Platform for Recognition and Detection in Pytorch
+# [Video Platform for Recognition and Detection in Pytorch](https://arxiv.org/abs/1910.02793)
 
 A platform for quick and easy development of deep learning networks for recognition and detection in videos. Includes popular models like C3D and SSD.
 
@@ -9,13 +9,39 @@ Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki)
 ### Recognition
 | Model Architecture | Dataset | ViP Accuracy (%) |
 |:--------------------:|:------------------:|:---------------------:|
+| I3D | HMDB51 (Split 1) | 72.75 |
 | C3D | HMDB51 (Split 1) | 50.14 ± 0.777 |
 | C3D | UCF101 (Split 1) | 80.40 ± 0.399 |
 
 ### Object Detection
 | Model Architecture | Dataset | ViP Accuracy (%) |
 |:--------------------:|:------------------:|:---------------------:|
 | SSD300 | VOC2007 | 76.58 |
+
+### Video Object Grounding
+| Model Architecture | Dataset | ViP Accuracy (%) |
+|:--------------------:|:------------------:|:---------------------:|
+| DVSA (+fw, obj) | YC2-BB (Validation) | 30.09 |
+
+**fw**: framewise weighting, **obj**: object interaction
+
+
+## Citation
+
+Please cite ViP when releasing any work that used this platform: https://arxiv.org/abs/1910.02793
+
+```
+@article{ganesh2019vip,
+  title={ViP: Video Platform for PyTorch},
+  author={Ganesh, Madan Ravi and Hofesmann, Eric and Louis, Nathan and Corso, Jason},
+  journal={arXiv preprint arXiv:1910.02793},
+  year={2019}
+}
+
+```
+
+
+
 ## Table of Contents
 
 * [Datasets](#configured-datasets)
@@ -38,12 +64,16 @@ Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki)
 |[ImageNetVID](http://bvisionweb1.cs.unc.edu/ilsvrc2015/download-videos-3j16.php) | Video Object Detection |
 |[MSCOCO 2014](http://cocodataset.org/#download) | Object Detection, Keypoints|
 |[VOC2007](http://host.robots.ox.ac.uk/pascal/VOC/voc2007/) | Object Detection, Classification|
+|[YC2-BB](http://youcook2.eecs.umich.edu/download)| Video Object Grounding|
+|[DHF1K](https://github.com/wenguanwang/DHF1K) | Video Saliency Prediction|
 
 ## Models
 | Model | Task(s) |
 |:------------------------------------------------:|:--------------------:|
 |[C3D](https://github.com/jfzhang95/pytorch-video-recognition/blob/master/network/C3D_model.py) | Activity Recognition |
+|[I3D](https://github.com/piergiaj/pytorch-i3d) | Activity Recognition |
 |[SSD300](https://github.com/amdegroot/ssd.pytorch) | Object Detection |
+|[DVSA (+fw, obj)](https://github.com/MichiganCOG/Video-Grounding-from-Text)| Video Object Grounding|
 
 ## Requirements
 

config_default_example.yaml

Lines changed: 4 additions & 4 deletions
@@ -1,25 +1,24 @@
 # Preprocessing
 clip_length: 16 # Number of frames within a clip
 clip_offset: 0 # Frame offset between beginning of video and clip (1st clip only)
-clip_stride: 0 # Frame offset between successive frames
+clip_stride: 1 # Frame offset between successive clips, must be >= 1
 crop_shape: [112,112] # (Height, Width) of frame
 crop_type: Random # Type of cropping operation (Random, Central and None)
 final_shape: [112,112] # (Height, Width) of input to be given to CNN
 num_clips: -1 # Number clips to be generated from a video (<0: uniform sampling, 0: Divide entire video into clips, >0: Defines number of clips)
 random_offset: 0 # Boolean switch to generate a clip length sized clip from a video
 resize_shape: [128,171] # (Height, Width) to resize original data
-sample_duration: 16 # Temporal size of video to be provided as input to the model
-sample_size: 112 # Height of frame to be provided as input to the model
 subtract_mean: '' # Subtract mean (R,G,B) from all frames during preprocessing
 
 # Experiment Setup
 acc_metric: Accuracy # Accuracy metric
-batch_size: 3 # Numbers of videos in a mini-batch
+batch_size: 15 # Numbers of videos in a mini-batch
 dataset: HMDB51 # Name of dataset
 debug: 0 # If True, do not plot, save, or create data files
 epoch: 30 # Total number of epochs
 exp: exp # Experiment name
 gamma: 0.1 # Multiplier with which to change learning rate
+grad_max_norm: 0 # Norm for gradient clipping
 json_path: /z/dat/HMDB51/ # Path to the json file for the given dataset
 labels: 51 # Number of total classes in the dataset
 load_type: train # Environment selection, to include only training/training and validation/testing dataset
@@ -37,3 +36,4 @@ rerun: 1 # Number of trials to repeat an experim
 save_dir: './results' # Path to results directory
 seed: 999 # Seed for reproducibility
 weight_decay: 0.0005 # Weight decay
+resume: 0 # Flag to resume training or switch to alternate objective after loading
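As a quick illustration of how the updated clip_stride interacts with clip_length and clip_offset, here is a minimal sketch. It is not the repository's clip-extraction code; it assumes clip_stride is the start-to-start frame offset between successive clips (the in-repo semantics may differ slightly), but it shows why a stride of 0 would stall extraction and the value must now be >= 1.

```
# Illustrative sketch only -- not ViP's actual clip-extraction code.
# Assumes clip_stride is the start-to-start frame offset between clips.
def extract_clip_starts(num_frames, clip_length=16, clip_offset=0, clip_stride=1):
    starts = []
    start = clip_offset
    while start + clip_length <= num_frames:
        starts.append(start)
        start += clip_stride  # with a stride of 0 this loop would never advance
    return starts

# Example: a 40-frame video, 16-frame clips, stride of 8 between clip starts
print(extract_clip_starts(40, clip_length=16, clip_stride=8))  # [0, 8, 16, 24]
```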

datasets/DHF1K.py

Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
import torch
try:
    from .abstract_datasets import DetectionDataset
except:
    from abstract_datasets import DetectionDataset
import cv2
import os
import numpy as np
import json
try:
    import datasets.preprocessing_transforms as pt
except:
    import preprocessing_transforms as pt

class DHF1K(DetectionDataset):
    def __init__(self, *args, **kwargs):
        super(DHF1K, self).__init__(*args, **kwargs)

        # Get model object in case preprocessing other than default is used
        self.model_object = kwargs['model_obj']
        self.load_type = kwargs['load_type']

        print(self.load_type)
        if self.load_type=='train':
            self.transforms = kwargs['model_obj'].train_transforms

        else:
            self.transforms = kwargs['model_obj'].test_transforms


    def __getitem__(self, idx):
        vid_info = self.samples[idx]

        base_path = vid_info['base_path']
        vid_size = vid_info['frame_size']

        input_data = []
        map_data = []
        bin_data = []

        for frame_ind in range(len(vid_info['frames'])):
            frame = vid_info['frames'][frame_ind]
            frame_path = frame['img_path']
            map_path = frame['map_path']
            bin_path = frame['bin_path']

            # Load frame, convert to RGB from BGR and normalize from 0 to 1
            input_data.append(cv2.imread(os.path.join(base_path, frame_path))[...,::-1]/255.)

            # Load frame, normalize from 0 to 1
            # All frame channels have repeated values
            map_data.append(cv2.imread(map_path)/255.)
            bin_data.append(cv2.imread(bin_path)/255.)


        vid_data = self.transforms(input_data)

        # Annotations must be resized in the loss/metric
        map_data = torch.Tensor(map_data)
        bin_data = torch.Tensor(bin_data)

        # Permute the PIL dimensions (Frame, Height, Width, Chan) to pytorch (Chan, frame, height, width)
        vid_data = vid_data.permute(3, 0, 1, 2)
        map_data = map_data.permute(3, 0, 1, 2)
        bin_data = bin_data.permute(3, 0, 1, 2)
        # All channels are repeated so remove the unnecessary channels
        map_data = map_data[0].unsqueeze(0)
        bin_data = bin_data[0].unsqueeze(0)

        ret_dict = dict()
        ret_dict['data'] = vid_data

        annot_dict = dict()
        annot_dict['map'] = map_data
        annot_dict['bin'] = bin_data
        annot_dict['input_shape'] = vid_data.size()
        annot_dict['name'] = base_path
        ret_dict['annots'] = annot_dict

        return ret_dict


if __name__=='__main__':

    class tts():
        def __call__(self, x):
            return pt.ToTensorClip()(x)
    class debug_model():
        def __init__(self):
            self.train_transforms = tts()


    json_path = '/path/to/DHF1K' #### Change this when testing ####

    dataset = DHF1K(model_obj=debug_model(), json_path=json_path, load_type='train', clip_length=16, clip_offset=0, clip_stride=1, num_clips=0, random_offset=0, resize_shape=0, crop_shape=0, crop_type='Center', final_shape=0, batch_size=1)
    train_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=1, shuffle=False)

    import matplotlib.pyplot as plt
    for x in enumerate(train_loader):
        dat = x[1]['data'][0,:,0].permute(1,2,0).numpy()
        bin = x[1]['annots']['bin'][0,:,0].permute(1,2,0).numpy().repeat(3,axis=2)
        map = x[1]['annots']['map'][0,:,0].permute(1,2,0).numpy().repeat(3, axis=2)
        img = np.concatenate([dat,bin,map], axis=0)
        plt.imshow(img)
        plt.show()
        import pdb; pdb.set_trace()
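For readers unfamiliar with the layout conversion performed in __getitem__ above, here is a minimal, self-contained sketch (assumed shapes, not repository code) of the (Frame, Height, Width, Channel) to (Channel, Frame, Height, Width) permute and the single-channel squeeze applied to the saliency maps:

```
import torch

# Illustrative only: assumed clip shape (16 frames of 112x112 RGB).
T, H, W, C = 16, 112, 112, 3
clip = torch.rand(T, H, W, C)      # frames stacked as (Frame, Height, Width, Channel)
clip = clip.permute(3, 0, 1, 2)    # -> (Channel, Frame, Height, Width), the layout PyTorch 3D CNNs expect
print(clip.shape)                  # torch.Size([3, 16, 112, 112])

# Saliency/fixation maps are grayscale stored as 3 identical channels, so keep only one.
sal = torch.rand(C, T, H, W)
sal = sal[0].unsqueeze(0)          # -> (1, Frame, Height, Width)
print(sal.shape)                   # torch.Size([1, 16, 112, 112])
```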

datasets/HMDB51.py

Lines changed: 3 additions & 2 deletions
@@ -40,8 +40,9 @@ def __getitem__(self, idx):
         base_path = vid_info['base_path']
 
         input_data = []
-        vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1
-        labels = np.zeros((self.clip_length))-1
+        vid_length = len(vid_info['frames'])
+        vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1
+        labels = np.zeros((vid_length))-1
         input_data = []
 
         for frame_ind in range(len(vid_info['frames'])):
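The same change is applied in ImageNetVID.py, MSCOCO.py, and the new KTH.py below: per-frame buffers are now sized to the number of frames actually present in the sample rather than to clip_length, with -1 acting as a fill value for entries that never get annotated. A minimal sketch under assumed values (not repository code):

```
import numpy as np

# Illustrative only: assumed values, not repository code.
final_shape = [112, 112]
num_frames_in_video = 40      # e.g. a sample longer than one 16-frame clip

# Buffers sized to the real video length so every frame fits before clips are sampled.
vid_length = num_frames_in_video
vid_data = np.zeros((vid_length, final_shape[0], final_shape[1], 3)) - 1   # -1 marks unfilled entries
labels = np.zeros((vid_length)) - 1

print(vid_data.shape, labels[:3])   # (40, 112, 112, 3) [-1. -1. -1.]
```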

datasets/ImageNetVID.py

Lines changed: 6 additions & 4 deletions
@@ -42,10 +42,12 @@ def __getitem__(self, idx):
         vid_size = vid_info['frame_size']
 
         input_data = []
-        vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1
-        bbox_data = np.zeros((self.clip_length, self.max_objects, 4))-1
-        labels = np.zeros((self.clip_length, self.max_objects))-1
-        occlusions = np.zeros((self.clip_length, self.max_objects))-1
+
+        vid_length = len(vid_info['frames'])
+        vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1
+        bbox_data = np.zeros((vid_length, self.max_objects, 4))-1
+        labels = np.zeros((vid_length, self.max_objects))-1
+        occlusions = np.zeros((vid_length, self.max_objects))-1
 
 
 

datasets/KTH.py

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
import torch
from .abstract_datasets import RecognitionDataset
from PIL import Image
import cv2
import os
import numpy as np
from torchvision import transforms

class KTH(RecognitionDataset):
    def __init__(self, *args, **kwargs):
        """
        Initialize KTH class
        Args:
            load_type    (String): Select training or testing set
            resize_shape (Int):    [Int, Int] Array indicating desired height and width to resize input
            crop_shape   (Int):    [Int, Int] Array indicating desired height and width to crop input
            final_shape  (Int):    [Int, Int] Array indicating desired height and width of input to deep network
            preprocess   (String): Keyword to select different preprocessing types

        Return:
            None
        """
        super(KTH, self).__init__(*args, **kwargs)

        self.load_type = kwargs['load_type']
        self.resize_shape = kwargs['resize_shape']
        self.crop_shape = kwargs['crop_shape']
        self.final_shape = kwargs['final_shape']
        self.preprocess = kwargs['preprocess']

        if self.load_type=='train':
            self.transforms = kwargs['model_obj'].train_transforms

        else:
            self.transforms = kwargs['model_obj'].test_transforms


    def __getitem__(self, idx):
        vid_info = self.samples[idx]
        base_path = vid_info['base_path']

        input_data = []

        vid_length = len(vid_info['frames'])
        vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1
        labels = np.zeros((vid_length))-1
        input_data = []

        for frame_ind in range(len(vid_info['frames'])):
            frame_path = os.path.join(base_path, vid_info['frames'][frame_ind]['img_path'])

            for frame_labels in vid_info['frames'][frame_ind]['actions']:
                labels[frame_ind] = frame_labels['action_class']

            # Load frame image data and preprocess image accordingly
            input_data.append(cv2.imread(frame_path)[...,::-1]/1.)


        # Preprocess data
        vid_data = self.transforms(input_data)
        labels = torch.from_numpy(labels).float()

        # Permute the PIL dimensions (Frame, Height, Width, Chan) to pytorch (Chan, frame, height, width)
        vid_data = vid_data.permute(3, 0, 1, 2)

        ret_dict = dict()
        ret_dict['data'] = vid_data

        annot_dict = dict()
        annot_dict['labels'] = labels

        ret_dict['annots'] = annot_dict

        return ret_dict


#dataset = HMDB51(json_path='/z/dat/HMDB51', dataset_type='train', clip_length=100, num_clips=0)
#dat = dataset.__getitem__(0)
#import pdb; pdb.set_trace()
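To make the per-frame labelling loop in KTH.__getitem__ concrete, here is a small, self-contained sketch using a made-up vid_info entry (the real annotation JSON may contain more fields); frames with an empty 'actions' list keep the -1 fill value:

```
import numpy as np

# Hypothetical annotation entry, shaped like the dicts KTH.__getitem__ reads.
vid_info = {'frames': [
    {'img_path': '0001.png', 'actions': [{'action_class': 2}]},
    {'img_path': '0002.png', 'actions': [{'action_class': 2}]},
    {'img_path': '0003.png', 'actions': []},   # unannotated frame stays -1
]}

labels = np.zeros(len(vid_info['frames'])) - 1
for frame_ind, frame in enumerate(vid_info['frames']):
    for frame_labels in frame['actions']:
        labels[frame_ind] = frame_labels['action_class']

print(labels)   # [ 2.  2. -1.]
```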

datasets/MSCOCO.py

Lines changed: 7 additions & 6 deletions
@@ -1,6 +1,6 @@
 import torch
 from .abstract_datasets import DetectionDataset
-from PIL import Image
+import cv2
 import os
 import numpy as np
 import datasets.preprocessing_transforms as pt
@@ -34,10 +34,11 @@ def __getitem__(self, idx):
         vid_size = vid_info['frame_size']
 
         input_data = []
-        vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1
-        bbox_data = np.zeros((self.clip_length, self.max_objects, 4))-1
-        labels = np.zeros((self.clip_length, self.max_objects))-1
-        iscrowds = np.zeros((self.clip_length, self.max_objects))-1
+        vid_length = len(vid_info['frames'])
+        vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1
+        bbox_data = np.zeros((vid_length, self.max_objects, 4))-1
+        labels = np.zeros((vid_length, self.max_objects))-1
+        iscrowds = np.zeros((vid_length, self.max_objects))-1
 
 
 
@@ -62,7 +63,7 @@ def __getitem__(self, idx):
             iscrowds[frame_ind, trackid] = iscrowd
 
 
-            input_data.append(Image.open(os.path.join(base_path, frame_path)))
+            input_data.append(cv2.imread(os.path.join(base_path, frame_path))[...,::-1])
 
         vid_data, bbox_data = self.transforms(input_data, bbox_data)
 
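The switch from PIL to OpenCV above is the reason for the [..., ::-1] slice: cv2.imread returns images in BGR channel order, and reversing the last axis restores RGB. A minimal sketch (illustrative only, with a random stand-in image rather than a real file):

```
import cv2
import numpy as np

# Stand-in for cv2.imread(path): a random uint8 image in BGR order.
img_bgr = (np.random.rand(8, 8, 3) * 255).astype(np.uint8)

rgb_slice = img_bgr[..., ::-1]                          # reverse the channel axis
rgb_cvt = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)      # explicit conversion

assert np.array_equal(rgb_slice, rgb_cvt)               # same result either way
```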
