
Commit 7c95f97

bjuncek authored and fmassa committed

Add VideoModelZoo models (#1130)

* [0.4_video] models - initial commit
* addressing fmassa's inline comments
* pep8 and flake8
* simplify "hacks"
* sorting out latest comments
* nitpick
* Updated tests and constructors
* Added docstrings - ready to merge

1 parent 4886ccc · commit 7c95f97

File tree

9 files changed: +498 −0 lines changed

test/test_models.py

Lines changed: 21 additions & 0 deletions
@@ -20,6 +20,11 @@ def get_available_detection_models():
     return [k for k, v in models.detection.__dict__.items() if callable(v) and k[0].lower() == k[0] and k[0] != "_"]


+def get_available_video_models():
+    # TODO add a registration mechanism to torchvision.models
+    return [k for k, v in models.video.__dict__.items() if callable(v) and k[0].lower() == k[0] and k[0] != "_"]
+
+
 class Tester(unittest.TestCase):
     def _test_classification_model(self, name, input_shape):
         # passing num_class equal to a number other than 1000 helps in making the test

@@ -53,6 +58,16 @@ def _test_detection_model(self, name):
         self.assertTrue("scores" in out[0])
         self.assertTrue("labels" in out[0])

+    def _test_video_model(self, name):
+        # the default input shape is
+        # bs * num_channels * clip_len * h * w
+        input_shape = (1, 3, 8, 112, 112)
+        # test both BasicBlock and Bottleneck
+        model = models.video.__dict__[name](num_classes=50)
+        x = torch.rand(input_shape)
+        out = model(x)
+        self.assertEqual(out.shape[-1], 50)
+
     def _make_sliced_model(self, model, stop_layer):
         layers = OrderedDict()
         for name, layer in model.named_children():

@@ -130,6 +145,12 @@ def do_test(self, model_name=model_name):

     setattr(Tester, "test_" + model_name, do_test)

+for model_name in get_available_video_models():
+
+    def do_test(self, model_name=model_name):
+        self._test_video_model(model_name)
+
+    setattr(Tester, "test_" + model_name, do_test)

 if __name__ == '__main__':
     unittest.main()
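For readers who want to try the new models outside the test harness, here is a minimal sketch of what each generated test exercises. It assumes a torchvision build that includes the models.video package added in this commit; the three constructor names are the ones exported by the new files below.

import torch
from torchvision import models

# Each generated test calls the constructor with num_classes=50 and feeds a
# clip of shape bs x num_channels x clip_len x h x w, as in _test_video_model.
for name in ("r3d_18", "r2plus1d_18", "mc3_18"):
    model = models.video.__dict__[name](num_classes=50)
    out = model(torch.rand(1, 3, 8, 112, 112))
    assert out.shape[-1] == 50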

torchvision/models/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -10,3 +10,4 @@
 from .shufflenetv2 import *
 from . import segmentation
 from . import detection
+from . import video
torchvision/models/video/__init__.py

Lines changed: 3 additions & 0 deletions

from .r3d import *
from .r2plus1d import *
from .mixed_conv import *

torchvision/models/video/_utils.py

Lines changed: 72 additions & 0 deletions
import torch.nn as nn


__all__ = ["Conv3DSimple", "Conv2Plus1D", "Conv3DNoTemporal"]


class Conv3DSimple(nn.Conv3d):
    def __init__(self,
                 in_planes,
                 out_planes,
                 midplanes=None,
                 stride=1,
                 padding=1):

        super(Conv3DSimple, self).__init__(
            in_channels=in_planes,
            out_channels=out_planes,
            kernel_size=(3, 3, 3),
            stride=stride,
            padding=padding,
            bias=False)

    @staticmethod
    def get_downsample_stride(stride):
        return (stride, stride, stride)


class Conv2Plus1D(nn.Sequential):

    def __init__(self,
                 in_planes,
                 out_planes,
                 midplanes,
                 stride=1,
                 padding=1):
        conv1 = [
            nn.Conv3d(in_planes, midplanes, kernel_size=(1, 3, 3),
                      stride=(1, stride, stride), padding=(0, padding, padding),
                      bias=False),
            nn.BatchNorm3d(midplanes),
            nn.ReLU(inplace=True),
            nn.Conv3d(midplanes, out_planes, kernel_size=(3, 1, 1),
                      stride=(stride, 1, 1), padding=(padding, 0, 0),
                      bias=False)
        ]
        super(Conv2Plus1D, self).__init__(*conv1)

    @staticmethod
    def get_downsample_stride(stride):
        return (stride, stride, stride)


class Conv3DNoTemporal(nn.Conv3d):

    def __init__(self,
                 in_planes,
                 out_planes,
                 midplanes=None,
                 stride=1,
                 padding=1):

        super(Conv3DNoTemporal, self).__init__(
            in_channels=in_planes,
            out_channels=out_planes,
            kernel_size=(1, 3, 3),
            stride=(1, stride, stride),
            padding=(0, padding, padding),
            bias=False)

    @staticmethod
    def get_downsample_stride(stride):
        return (1, stride, stride)
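As a quick illustration of how these three factory classes relate, the sketch below pushes the same tensor through each of them. The midplanes value is an arbitrary choice for the example, not something prescribed by this file.

import torch
from torchvision.models.video._utils import Conv3DSimple, Conv2Plus1D, Conv3DNoTemporal

x = torch.rand(1, 64, 8, 56, 56)  # N x C x T x H x W

full_3d = Conv3DSimple(64, 128)                  # single 3x3x3 convolution
factored = Conv2Plus1D(64, 128, midplanes=144)   # 1x3x3 spatial conv followed by 3x1x1 temporal conv
spatial_only = Conv3DNoTemporal(64, 128)         # 1x3x3 convolution, no temporal mixing

# With the default stride and padding, all three preserve the T x H x W extent.
print(full_3d(x).shape, factored(x).shape, spatial_only(x).shape)

# get_downsample_stride tells the caller how a residual shortcut should stride;
# the "no temporal" variant only strides spatially.
print(Conv3DSimple.get_downsample_stride(2), Conv3DNoTemporal.get_downsample_stride(2))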
torchvision/models/video/mixed_conv.py

Lines changed: 78 additions & 0 deletions

import torch.nn as nn

from ._utils import Conv3DSimple, Conv3DNoTemporal
from .video_stems import get_default_stem
from .video_trunk import VideoTrunkBuilder, BasicBlock, Bottleneck


__all__ = ["mc3_18"]


def _mcX(model_depth, X=3, use_pool1=False, **kwargs):
    """Generate a mixed convolution network as in
    https://arxiv.org/abs/1711.11248

    Args:
        model_depth (int): trunk depth - supports most resnet depths
        X (int): up to which layer the convolutions are 3D
        use_pool1 (bool, optional): Add pooling layer to the stem. Defaults to False.

    Returns:
        nn.Module: mcX video trunk
    """
    assert X > 1 and X <= 5
    conv_makers = [Conv3DSimple] * (X - 2)
    while len(conv_makers) < 5:
        conv_makers.append(Conv3DNoTemporal)

    if model_depth < 50:
        block = BasicBlock
    else:
        block = Bottleneck

    model = VideoTrunkBuilder(block=block, conv_makers=conv_makers, model_depth=model_depth,
                              stem=get_default_stem(use_pool1=use_pool1), **kwargs)

    return model


def _rmcX(model_depth, X=3, use_pool1=False, **kwargs):
    """Generate a reverse mixed convolution network as in
    https://arxiv.org/abs/1711.11248

    Args:
        model_depth (int): trunk depth - supports most resnet depths
        X (int): up to which layer the convolutions are 2D
        use_pool1 (bool, optional): Add pooling layer to the stem. Defaults to False.

    Returns:
        nn.Module: rmcX video trunk
    """
    assert X > 1 and X <= 5

    conv_makers = [Conv3DNoTemporal] * (X - 2)
    while len(conv_makers) < 5:
        conv_makers.append(Conv3DSimple)

    if model_depth < 50:
        block = BasicBlock
    else:
        block = Bottleneck

    model = VideoTrunkBuilder(block=block, conv_makers=conv_makers, model_depth=model_depth,
                              stem=get_default_stem(use_pool1=use_pool1), **kwargs)

    return model


def mc3_18(use_pool1=False, **kwargs):
    """Constructor for an 18-layer Mixed Convolution network as in
    https://arxiv.org/abs/1711.11248

    Args:
        use_pool1 (bool, optional): Include pooling in the resnet stem. Defaults to False.

    Returns:
        nn.Module: MC3 network definition
    """
    return _mcX(18, 3, use_pool1, **kwargs)
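The only part of _mcX that is specific to "mixed" convolutions is the conv_makers selection. The sketch below reproduces that selection for X = 3 (the mc3_18 case) to show which layers end up with full 3D kernels.

from torchvision.models.video._utils import Conv3DSimple, Conv3DNoTemporal

X = 3  # as used by mc3_18
conv_makers = [Conv3DSimple] * (X - 2)      # full 3D convolutions for the early layers
while len(conv_makers) < 5:
    conv_makers.append(Conv3DNoTemporal)    # the remaining layers drop the temporal kernel

print([maker.__name__ for maker in conv_makers])
# ['Conv3DSimple', 'Conv3DNoTemporal', 'Conv3DNoTemporal', 'Conv3DNoTemporal', 'Conv3DNoTemporal']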
torchvision/models/video/r2plus1d.py

Lines changed: 43 additions & 0 deletions

import torch.nn as nn

from ._utils import Conv2Plus1D
from .video_stems import get_r2plus1d_stem
from .video_trunk import VideoTrunkBuilder, BasicBlock, Bottleneck


__all__ = ["r2plus1d_18"]


def _r2plus1d(model_depth, use_pool1=False, **kwargs):
    """Constructor for an R(2+1)D network as described in
    https://arxiv.org/abs/1711.11248

    Args:
        model_depth (int): Depth of the model - standard resnet depths apply
        use_pool1 (bool, optional): Should we use the pooling layer? Defaults to False

    Returns:
        nn.Module: An R(2+1)D video backbone
    """
    convs = [Conv2Plus1D] * 4
    if model_depth < 50:
        block = BasicBlock
    else:
        block = Bottleneck

    model = VideoTrunkBuilder(
        block=block, conv_makers=convs, model_depth=model_depth,
        stem=get_r2plus1d_stem(use_pool1), **kwargs)
    return model


def r2plus1d_18(use_pool1=False, **kwargs):
    """Constructor for the 18-layer R(2+1)D network as in
    https://arxiv.org/abs/1711.11248

    Args:
        use_pool1 (bool, optional): Include pooling in the resnet stem. Defaults to False.

    Returns:
        nn.Module: R(2+1)D-18 network
    """
    return _r2plus1d(18, use_pool1, **kwargs)
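A minimal comparison sketch between the factorized and the mixed variant; both constructors forward num_classes through **kwargs to the trunk builder, exactly as the new unit test does. The actual parameter counts depend on the trunk builder (video_trunk.py, not shown in this excerpt), so they are only printed.

from torchvision.models.video import mc3_18, r2plus1d_18

for ctor in (mc3_18, r2plus1d_18):
    model = ctor(num_classes=50)
    n_params = sum(p.numel() for p in model.parameters())
    print(ctor.__name__, n_params)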

torchvision/models/video/r3d.py

Lines changed: 43 additions & 0 deletions
import torch.nn as nn

from ._utils import Conv3DSimple
from .video_stems import get_default_stem
from .video_trunk import VideoTrunkBuilder, BasicBlock, Bottleneck

__all__ = ["r3d_18"]


def _r3d(model_depth, use_pool1=False, **kwargs):
    """Constructor of an R3D network as in
    https://arxiv.org/abs/1711.11248

    Args:
        model_depth (int): resnet trunk depth
        use_pool1 (bool, optional): Add pooling layer to the stem. Defaults to False

    Returns:
        nn.Module: R3D network trunk
    """

    conv_makers = [Conv3DSimple] * 4
    if model_depth < 50:
        block = BasicBlock
    else:
        block = Bottleneck

    model = VideoTrunkBuilder(block=block, conv_makers=conv_makers, model_depth=model_depth,
                              stem=get_default_stem(use_pool1=use_pool1), **kwargs)
    return model


def r3d_18(use_pool1=False, **kwargs):
    """Construct an 18-layer ResNet3D model as in
    https://arxiv.org/abs/1711.11248

    Args:
        use_pool1 (bool, optional): Include pooling in the resnet stem. Defaults to False.

    Returns:
        nn.Module: R3D-18 network
    """
    return _r3d(18, use_pool1, **kwargs)
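With all three model files in place, the filter used by get_available_video_models() in the updated tests should discover exactly the constructors exported above. A sketch, assuming no other public lowercase callables end up in the models.video namespace:

from torchvision import models

available = [k for k, v in models.video.__dict__.items()
             if callable(v) and k[0].lower() == k[0] and k[0] != "_"]
print(sorted(available))  # expected: ['mc3_18', 'r2plus1d_18', 'r3d_18']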
torchvision/models/video/video_stems.py

Lines changed: 48 additions & 0 deletions

import torch.nn as nn


def get_default_stem(use_pool1=False):
    """The default conv-batchnorm-relu(-maxpool) stem

    Args:
        use_pool1 (bool, optional): Should the stem include the default maxpool? Defaults to False.

    Returns:
        nn.Sequential: Conv1 stem of resnet based models.
    """

    m = [
        nn.Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2),
                  padding=(1, 3, 3), bias=False),
        nn.BatchNorm3d(64),
        nn.ReLU(inplace=True)]
    if use_pool1:
        m.append(nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1))
    return nn.Sequential(*m)


def get_r2plus1d_stem(use_pool1=False):
    """The R(2+1)D stem differs from the default one in that it uses a separated 3D convolution

    Args:
        use_pool1 (bool, optional): Should the stem contain the pool1 layer? Defaults to False.

    Returns:
        nn.Sequential: the stem of the conv-separated network.
    """

    m = [
        nn.Conv3d(3, 45, kernel_size=(1, 7, 7),
                  stride=(1, 2, 2), padding=(0, 3, 3),
                  bias=False),
        nn.BatchNorm3d(45),
        nn.ReLU(inplace=True),
        nn.Conv3d(45, 64, kernel_size=(3, 1, 1),
                  stride=(1, 1, 1), padding=(1, 0, 0),
                  bias=False),
        nn.BatchNorm3d(64),
        nn.ReLU(inplace=True)]

    if use_pool1:
        m.append(nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1))
    return nn.Sequential(*m)
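To see the difference concretely, a small sketch comparing the two stems on a test-sized clip; the module path follows the file layout in this commit.

import torch
from torchvision.models.video.video_stems import get_default_stem, get_r2plus1d_stem

clip = torch.rand(1, 3, 8, 112, 112)

print(get_default_stem()(clip).shape)                # torch.Size([1, 64, 8, 56, 56])
print(get_r2plus1d_stem()(clip).shape)               # same output size, reached via the factorized (2+1)D conv
print(get_default_stem(use_pool1=True)(clip).shape)  # extra max-pool: torch.Size([1, 64, 4, 28, 28])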
