# MiCT-RANet-ASL-FingerSpelling/chicago_fs_wild.py at main · fmahoudeau/MiCT-RANet-ASL-FingerSpelling · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# ==============================================================================
# Based on the work by B. Shi et al.:
# Fingerspelling recognition in the wild with iterative visual attention
# ==============================================================================

import os
import json
import torch
import cv2 as cv
import numpy as np
from torch.utils.data import Dataset


class ChicagoFSWild(Dataset):
    """
    Chicago Fingerspelling in the Wild Data Sets.
    For more information and downloads go to:
    https://ttic.uchicago.edu/~klivescu/ChicagoFSWild.htm#overview

    Each item is one video: every frame is padded, cropped to a square box
    read from the `lambda_x` JSON file, resized to `img_size` x `img_size`
    and returned (through `transform`) together with a gray-scale copy and
    the integer-encoded label.
    """

    # Keys of the per-sample boxes that may be selected with `scale_x`.
    _SCALES = ('1', '2', '3', '4')

    def __init__(self, split, img_dir, fcsv, vocab_map, transform,
                 img_size=224, map_size=14, lambda_x=None, scale_x=None):
        """
        :param split: the dataset split must be 'test'.
        :param img_dir: the dataset RGB root directory.
        :param fcsv: the CSV filename of the dataset split; one
            `<image sub-directory>,<label>,<number of frames>` row per sample.
        :param vocab_map: maps label's chars to integers.
        :param transform: callable applied to the sample dictionary built by
            `__getitem__`; its result is what `__getitem__` returns.
        :param img_size: model image input size.
        :param map_size: prior map size, which is equal to the CNN features map
            size in output (only stored on the instance by this class).
        :param lambda_x: the JSON filename holding, for each image
            sub-directory, its padding (key 'pad' with 'l', 'u', 'r', 'd') and
            one square box (keys 'cx', 'cy', 'r') per scale.
        :param scale_x: the zooming factor to apply to the samples; one of the
            strings '1', '2', '3' or '4'.
        :raises AssertionError: if `split` or `scale_x` is not an accepted value.
        """
        # Validate the cheap arguments first so that a bad value is reported
        # before any file is opened. The 1-tuples keep the %-formatting safe
        # whatever the type of the offending value is.
        assert split == 'test', 'Unknown split: %s' % (split,)
        assert scale_x in self._SCALES, \
            'Invalid value for `scale_x` parameter: %r' % (scale_x,)

        self.split = split
        self.img_dir = img_dir
        self.fcsv = fcsv
        self.vocab_map = vocab_map
        self.transform = transform
        self.img_size = img_size
        self.map_size = map_size
        with open(lambda_x, 'r') as f:
            self.lambda_x = json.load(f)
        self.scale_x = scale_x

        self._parse()

    def _parse(self):
        """
        Reads the split CSV file into the `imdirs`, `labels` and `n_frames`
        lists (one entry per sample) and prints the number of samples.
        """
        with open(self.fcsv, 'r') as fo:
            rows = [ln.strip() for ln in fo]
        # Blank lines (e.g. an empty last line) carry no sample and would
        # break the 3-way unpacking below.
        rows = [row for row in rows if row]
        print('%d %s samples' % (len(rows), self.split))

        self.imdirs, self.labels, self.n_frames = [], [], []
        for row in rows:
            imdir, label, nframes = row.split(',')
            self.imdirs.append(imdir)
            self.labels.append(label)
            self.n_frames.append(int(nframes))

    def __len__(self):
        """Returns the number of samples listed in the split CSV file."""
        return len(self.imdirs)

    def __getitem__(self, idx):
        """
        Loads a sample video at the scale specified by the `scale_x` instance attribute.

        :param idx: index of the sample in the split CSV file.
        :return: the output of `transform` applied to a dictionary with keys
            'imgs' (stacked RGB patches), 'gray' (stacked gray-scale patches
            with a trailing singleton axis) and 'label' (list of integers).
        :raises IOError: if a frame of the video cannot be read.
        """
        subdir = self.imdirs[idx]
        label = [self.vocab_map[char] for char in self.labels[idx]]

        entry = self.lambda_x[subdir]
        pad = entry['pad']
        l_pad, u_pad, r_pad, d_pad = pad['l'], pad['u'], pad['r'], pad['d']

        # boxes are stored in polar-like coordinates (center and half-side)
        x0, y0, x1, y1 = self.to_cartesian_coord(entry[self.scale_x])

        # The box is expressed in the un-padded frame: shift it once by the
        # left/top padding to address the padded frame.
        rows = slice(y0 + u_pad, y1 + u_pad)
        cols = slice(x0 + l_pad, x1 + l_pad)
        out_size = (self.img_size, self.img_size)

        imgs, grays = [], []
        # Frames are named 0001.jpg, 0002.jpg, ... inside the sample directory.
        for frame_no in range(1, self.n_frames[idx] + 1):
            path = os.path.join(self.img_dir, subdir, str(frame_no).zfill(4) + '.jpg')
            bgr = cv.imread(path)
            if bgr is None:
                # cv.imread signals a missing or undecodable file with None
                # instead of raising; fail here with the offending path.
                raise IOError('Cannot read image: %s' % path)
            rgb = cv.cvtColor(bgr, cv.COLOR_BGR2RGB)
            expand_rgb = cv.copyMakeBorder(rgb, u_pad, d_pad, l_pad, r_pad,
                                            cv.BORDER_CONSTANT, value=(0, 0, 0))
            patch_rgb = cv.resize(expand_rgb[rows, cols], out_size)
            imgs.append(patch_rgb)
            grays.append(cv.cvtColor(patch_rgb, cv.COLOR_RGB2GRAY))

        sample = {
            'imgs': np.stack(imgs),
            'gray': np.stack(grays)[..., np.newaxis],
            'label': label,
        }

        return self.transform(sample)

    def to_cartesian_coord(self, polar_coord):
        """
        Convert a squared-box from polar-like coordinates to cartesian coordinates.

        :param polar_coord: mapping with the box center 'cx', 'cy' and its
            half-side 'r'.
        :return: the list [x0, y0, x1, y1] of the box corners.
        """
        cx, cy, r = polar_coord['cx'], polar_coord['cy'], polar_coord['r']
        return [cx - r, cy - r, cx + r, cy + r]