Skip to content

Commit 18577d0

Browse files
authored
Add entry in the documentation for video models (#1207)
* Add docs for video models * Fix docstrings for resnet and vgg
1 parent 1afef1d commit 18577d0

File tree

3 files changed

+63
-15
lines changed

3 files changed

+63
-15
lines changed

docs/source/models.rst

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@ torchvision.models
44

55
The models subpackage contains definitions of models for addressing
66
different tasks, including: image classification, pixelwise semantic
7-
segmentation, object detection, instance segmentation and person
8-
keypoint detection.
7+
segmentation, object detection, instance segmentation, person
8+
keypoint detection and video classification.
99

1010

1111
Classification
@@ -395,3 +395,51 @@ Keypoint R-CNN
395395

396396
.. autofunction:: torchvision.models.detection.keypointrcnn_resnet50_fpn
397397

398+
399+
Video classification
400+
====================
401+
402+
We provide models for action recognition pre-trained on Kinetics-400.
403+
They have all been trained with the scripts provided in ``references/video_classification``.
404+
405+
All pre-trained models expect input images normalized in the same way,
406+
i.e. mini-batches of 3-channel RGB videos of shape (3 x T x H x W),
407+
where H and W are expected to be 112, and T is the number of video frames in a clip.
408+
The images have to be loaded into a range of [0, 1] and then normalized
409+
using ``mean = [0.43216, 0.394666, 0.37645]`` and ``std = [0.22803, 0.22145, 0.216989]``.
410+
411+
412+
.. note::
413+
The normalization parameters are different from the image classification ones, and correspond
414+
to the mean and std from Kinetics-400.
415+
416+
.. note::
417+
For now, normalization code can be found in ``references/video_classification/transforms.py``,
418+
see the ``Normalize`` function there. Note that it differs from standard normalization for
419+
images because it assumes the video is 4D.
420+
421+
Kinetics 1-crop accuracies for clip length 16 (16x112x112)
422+
423+
================================ ============= =============
424+
Network Clip acc@1 Clip acc@5
425+
================================ ============= =============
426+
ResNet 3D 18 52.75 75.45
427+
ResNet MC 18 53.90 76.29
428+
ResNet (2+1)D 57.50 78.81
429+
================================ ============= =============
430+
431+
432+
ResNet 3D
433+
----------
434+
435+
.. autofunction:: torchvision.models.video.r3d_18
436+
437+
ResNet Mixed Convolution
438+
------------------------
439+
440+
.. autofunction:: torchvision.models.video.mc3_18
441+
442+
ResNet (2+1)D
443+
-------------
444+
445+
.. autofunction:: torchvision.models.video.r2plus1d_18

torchvision/models/resnet.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ def _resnet(arch, block, layers, pretrained, progress, **kwargs):
221221

222222
def resnet18(pretrained=False, progress=True, **kwargs):
223223
r"""ResNet-18 model from
224-
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>'_
224+
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
225225
226226
Args:
227227
pretrained (bool): If True, returns a model pre-trained on ImageNet
@@ -233,7 +233,7 @@ def resnet18(pretrained=False, progress=True, **kwargs):
233233

234234
def resnet34(pretrained=False, progress=True, **kwargs):
235235
r"""ResNet-34 model from
236-
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>'_
236+
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
237237
238238
Args:
239239
pretrained (bool): If True, returns a model pre-trained on ImageNet
@@ -245,7 +245,7 @@ def resnet34(pretrained=False, progress=True, **kwargs):
245245

246246
def resnet50(pretrained=False, progress=True, **kwargs):
247247
r"""ResNet-50 model from
248-
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>'_
248+
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
249249
250250
Args:
251251
pretrained (bool): If True, returns a model pre-trained on ImageNet
@@ -257,7 +257,7 @@ def resnet50(pretrained=False, progress=True, **kwargs):
257257

258258
def resnet101(pretrained=False, progress=True, **kwargs):
259259
r"""ResNet-101 model from
260-
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>'_
260+
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
261261
262262
Args:
263263
pretrained (bool): If True, returns a model pre-trained on ImageNet
@@ -269,7 +269,7 @@ def resnet101(pretrained=False, progress=True, **kwargs):
269269

270270
def resnet152(pretrained=False, progress=True, **kwargs):
271271
r"""ResNet-152 model from
272-
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>'_
272+
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
273273
274274
Args:
275275
pretrained (bool): If True, returns a model pre-trained on ImageNet

torchvision/models/vgg.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ def _vgg(arch, cfg, batch_norm, pretrained, progress, **kwargs):
9797

9898
def vgg11(pretrained=False, progress=True, **kwargs):
9999
r"""VGG 11-layer model (configuration "A") from
100-
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>'_
100+
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_
101101
102102
Args:
103103
pretrained (bool): If True, returns a model pre-trained on ImageNet
@@ -108,7 +108,7 @@ def vgg11(pretrained=False, progress=True, **kwargs):
108108

109109
def vgg11_bn(pretrained=False, progress=True, **kwargs):
110110
r"""VGG 11-layer model (configuration "A") with batch normalization
111-
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>'_
111+
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_
112112
113113
Args:
114114
pretrained (bool): If True, returns a model pre-trained on ImageNet
@@ -119,7 +119,7 @@ def vgg11_bn(pretrained=False, progress=True, **kwargs):
119119

120120
def vgg13(pretrained=False, progress=True, **kwargs):
121121
r"""VGG 13-layer model (configuration "B")
122-
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>'_
122+
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_
123123
124124
Args:
125125
pretrained (bool): If True, returns a model pre-trained on ImageNet
@@ -130,7 +130,7 @@ def vgg13(pretrained=False, progress=True, **kwargs):
130130

131131
def vgg13_bn(pretrained=False, progress=True, **kwargs):
132132
r"""VGG 13-layer model (configuration "B") with batch normalization
133-
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>'_
133+
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_
134134
135135
Args:
136136
pretrained (bool): If True, returns a model pre-trained on ImageNet
@@ -141,7 +141,7 @@ def vgg13_bn(pretrained=False, progress=True, **kwargs):
141141

142142
def vgg16(pretrained=False, progress=True, **kwargs):
143143
r"""VGG 16-layer model (configuration "D")
144-
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>'_
144+
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_
145145
146146
Args:
147147
pretrained (bool): If True, returns a model pre-trained on ImageNet
@@ -152,7 +152,7 @@ def vgg16(pretrained=False, progress=True, **kwargs):
152152

153153
def vgg16_bn(pretrained=False, progress=True, **kwargs):
154154
r"""VGG 16-layer model (configuration "D") with batch normalization
155-
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>'_
155+
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_
156156
157157
Args:
158158
pretrained (bool): If True, returns a model pre-trained on ImageNet
@@ -163,7 +163,7 @@ def vgg16_bn(pretrained=False, progress=True, **kwargs):
163163

164164
def vgg19(pretrained=False, progress=True, **kwargs):
165165
r"""VGG 19-layer model (configuration "E")
166-
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>'_
166+
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_
167167
168168
Args:
169169
pretrained (bool): If True, returns a model pre-trained on ImageNet
@@ -174,7 +174,7 @@ def vgg19(pretrained=False, progress=True, **kwargs):
174174

175175
def vgg19_bn(pretrained=False, progress=True, **kwargs):
176176
r"""VGG 19-layer model (configuration 'E') with batch normalization
177-
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>'_
177+
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_
178178
179179
Args:
180180
pretrained (bool): If True, returns a model pre-trained on ImageNet

0 commit comments

Comments
 (0)