Fixed video labelling after subset call for HMDB51 dataset (hmdb51.py) (EDIT: UCF101 as well) (#1240)

JMistele · RyanCao7 · Etang21 · ezyang · commit 40c8330d1c49 · 2019-10-09T19:53:56.000-07:00
* Fixed video labelling after subset for HMDB51 dataset

* Fixed video labelling after subset for HMDB51 dataset

Co-authored-by: Eric Tang <etang21@stanford.edu>
Co-authored-by: Ryan Cao <ryancao@stanford.edu>

* UCF101 labelling fixes

- Analogous fix to HMDB51 to maintain correct labels after the train-test split
- Additional change to the `_select_fold` method in `ucf101.py` to correctly reflect the annotation format

Co-authored-by: Ryan Cao <ryancao@stanford.edu>
Co-authored-by: Eric Tang <etang21@stanford.edu>
diff --git a/torchvision/datasets/hmdb51.py b/torchvision/datasets/hmdb51.py
@@ -65,8 +65,8 @@ def __init__(self, root, annotation_path, frames_per_clip, step_between_clips=1,
         self.classes = classes
         video_list = [x[0] for x in self.samples]
         video_clips = VideoClips(video_list, frames_per_clip, step_between_clips)
-        indices = self._select_fold(video_list, annotation_path, fold, train)
-        self.video_clips = video_clips.subset(indices)
+        self.indices = self._select_fold(video_list, annotation_path, fold, train)
+        self.video_clips = video_clips.subset(self.indices)
         self.transform = transform
 
     def _select_fold(self, video_list, annotation_path, fold, train):
@@ -89,7 +89,7 @@ def __len__(self):
 
     def __getitem__(self, idx):
         video, audio, info, video_idx = self.video_clips.get_clip(idx)
-        label = self.samples[video_idx][1]
+        label = self.samples[self.indices[video_idx]][1]
 
         if self.transform is not None:
             video = self.transform(video)
diff --git a/torchvision/datasets/ucf101.py b/torchvision/datasets/ucf101.py
@@ -58,8 +58,8 @@ def __init__(self, root, annotation_path, frames_per_clip, step_between_clips=1,
         self.classes = classes
         video_list = [x[0] for x in self.samples]
         video_clips = VideoClips(video_list, frames_per_clip, step_between_clips)
-        indices = self._select_fold(video_list, annotation_path, fold, train)
-        self.video_clips = video_clips.subset(indices)
+        self.indices = self._select_fold(video_list, annotation_path, fold, train)
+        self.video_clips = video_clips.subset(self.indices)
         self.transform = transform
 
     def _select_fold(self, video_list, annotation_path, fold, train):
@@ -81,7 +81,7 @@ def __len__(self):
 
     def __getitem__(self, idx):
         video, audio, info, video_idx = self.video_clips.get_clip(idx)
-        label = self.samples[video_idx][1]
+        label = self.samples[self.indices[video_idx]][1]
 
         if self.transform is not None:
             video = self.transform(video)