@@ -1,5 +1,5 @@
"""
- Pretraining VGG from scratch
+ ``Pretraining`` VGG from scratch
============================


@@ -55,7 +55,7 @@
# - We train the model from scratch using only the configuration
# presented in the paper.
#
- # - we do not use future method, like BatchNormalization ,Adam , He
+ # - We do not use methods introduced later, like batch normalization, Adam, or He
# initialization.
#
# - You can apply this to ImageNet data.
@@ -68,15 +68,15 @@


######################################################################
- # Why Vgg is so popluar ?
+ # Why VGG is so popular?
# -----------------------
#


######################################################################
# VGG became a model that attracted attention because it succeeded in
# building deeper layers and dramatically shortening the training time
- # compared to alexNet , which was the sota model at the time.:
+ # compared to AlexNet, which was the SOTA model at the time:
#


@@ -91,12 +91,12 @@
# this configuration will be explained in the section below.
#

- DatasetName = 'Cifar' # Cifar ,Cifar10, Mnist , ImageNet
+ DatasetName = 'Cifar'  # CIFAR, CIFAR10, MNIST, ImageNet

## model configuration

num_classes = 100
- # CalTech 257 Cifar 100 Cifar10 10 ,Mnist 10 ImageNet 1000
+ # Caltech 257, CIFAR 100, CIFAR10 10, MNIST 10, ImageNet 1000
model_version = None ## you must configure it.

## data configuration
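
######################################################################
# A small sketch: the per-dataset class counts from the comment above can be
# kept in one lookup table so that ``num_classes`` always matches
# ``DatasetName``. The dictionary below is illustrative only and is not part
# of the original script.

NUM_CLASSES_BY_DATASET = {
    'Caltech': 257,
    'Cifar': 100,      # CIFAR-100
    'Cifar10': 10,
    'Mnist': 10,
    'ImageNet': 1000,
}
# num_classes = NUM_CLASSES_BY_DATASET[DatasetName]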
@@ -119,7 +119,7 @@

update_count = int(256 / batch_size)
accum_step = int(256 / batch_size)
- eval_step = 26 * accum_step ## CalTech 5 Cifar 5 Mnist 6 , Cifar10 5 ImageNet 26
+ eval_step = 26 * accum_step  ## Caltech 5, CIFAR 5, MNIST 6, CIFAR10 5, ImageNet 26


## model configuration
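
######################################################################
# The values above implement the paper's effective batch size of 256 through
# gradient accumulation: with a physical batch size of, say, 32, gradients are
# accumulated over 256 / 32 = 8 mini-batches before a single optimizer step.
# A minimal sketch of that arithmetic (the batch size here is only an example):

batch_size = 32                        # physical mini-batch that fits in memory
update_count = int(256 / batch_size)   # 8 backward passes per optimizer step

effective_batch = batch_size * update_count
assert effective_batch == 256          # matches the batch size used in the paper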
@@ -147,9 +147,9 @@


######################################################################
- # We use ``CIFAR100`` Dataset in this tutorial. In Vgg paper , the authors
- # scales image istropically . Then , they apply
- # Normalization,RandomCrop, HorizontalFlip . So , we need to override
+ # We use the ``CIFAR100`` dataset in this tutorial. In the VGG paper, the authors
+ # scale the image isotropically. Then, they apply
+ # Normalization, ``RandomCrop``, and ``HorizontalFlip``. So, we need to override the
# CIFAR100 class to apply preprocessing.
#

@@ -168,8 +168,7 @@ def __init__(self,root,transform = None,multi=False,s_max=None,s_min=256,downloa
A.Normalize(mean=(0.5071, 0.4867, 0.4408), std=(0.2675, 0.2565, 0.2761)),
A.SmallestMaxSize(max_size=self.S),
A.RandomCrop(height=224, width=224),
- A.HorizontalFlip(),
- # A.RGBShift()
+ A.HorizontalFlip()
]

)
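
######################################################################
# The hunk above only shows the transform list. A rough sketch of how such an
# ``albumentations`` pipeline can be built and applied to a single image is
# given below; ``ToTensorV2`` and the dummy input are assumptions added for
# illustration, and the tutorial's own ``__getitem__`` may differ.

import numpy as np
import albumentations as A
from albumentations.pytorch import ToTensorV2

S = 256  # target size of the shorter image side after isotropic rescaling
train_transform = A.Compose(
    [
        A.Normalize(mean=(0.5071, 0.4867, 0.4408), std=(0.2675, 0.2565, 0.2761)),
        A.SmallestMaxSize(max_size=S),
        A.RandomCrop(height=224, width=224),
        A.HorizontalFlip(),
        ToTensorV2(),
    ]
)

# albumentations operates on HWC numpy arrays and returns a dict of results.
dummy_image = np.random.randint(0, 256, size=(32, 32, 3), dtype=np.uint8)
crop = train_transform(image=dummy_image)["image"]  # CHW float tensor, 3x224x224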
@@ -216,12 +215,12 @@ def __getitem__(self, index: int) :


######################################################################
- # | In Vgg paper, they do experiment over 6 models. model A is 11 layers,
- # model B is 13 layers, model C is 16 layers , model D is 16 laeyrs and
+ # | In the VGG paper, they experiment over 6 models. Model A is 11 layers,
+ # model B is 13 layers, model C is 16 layers, model D is 16 layers, and
# model E is 19 layers. You can train all versions of the models to
# reproduce VGG.
# | ``Config_Channels`` means output channels and ``Config_kernels`` means
- # kerenl size.
+ # kernel size.
#

import torch
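
######################################################################
# For reference, configuration A from the paper (the 11-layer model) stacks
# 3x3 convolutions with the channel progression below and a 2x2 max pooling
# after each block. The list format and ``make_features`` helper are only a
# sketch; the tutorial's ``Config_Channels`` / ``Config_kernels`` may be
# organized differently.

import torch.nn as nn

# Numbers are conv output channels, 'M' marks a 2x2 max-pooling layer.
vgg_a = [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M']

def make_features(cfg, in_channels=3):
    layers = []
    for v in cfg:
        if v == 'M':
            layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
        else:
            layers.append(nn.Conv2d(in_channels, v, kernel_size=3, padding=1))
            layers.append(nn.ReLU(inplace=True))
            in_channels = v
    return nn.Sequential(*layers)

features = make_features(vgg_a)  # feature extractor of the 11-layer model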
@@ -284,8 +283,7 @@ def __init__(self,version , num_classes):
self.num_classes = num_classes
self.linear_out = 4096
self.xavier_count = xavier_count
- self.last_xavier = last_xavier ## if >0 , initialize last 3 fully connected noraml distribution
- # conv_1_by_1_3_outchannel = num_classes
+ self.last_xavier = last_xavier  ## if >0, initialize the last 3 fully connected layers with a normal distribution
self.except_xavier = except_xavier

super().__init__()
@@ -307,8 +305,6 @@ def __init__(self,version , num_classes):
print('weight initialize end')
def forward(self, x):
x = self.feature_extractor(x)
- # x= self.avgpool(x) ## If Linear is output, use this
- # x= torch.flatten(x,start_dim = 1) ## If Linear is output, use this
x = self.output_layer(x)
x = self.avgpool(x)
x = torch.flatten(x, start_dim=1)
@@ -318,15 +314,12 @@ def forward(self,x):
@torch.no_grad()
def _init_weights(self, m):

- # print(m)
if isinstance(m, nn.Conv2d):
print('-------------')
print(m.kernel_size)
print(m.out_channels)
- # if (m.out_channels == self.num_classes or m.out_channels == self.linear_out) and self.last_xavier>0 :
if self.last_xavier > 0 and (self.except_xavier is None or self.last_xavier != self.except_xavier):
print('xavier')
- # self.last_xavier-=1
nn.init.xavier_uniform_(m.weight)
elif self.xavier_count > 0:
print('xavier')
@@ -335,10 +328,8 @@ def _init_weights(self,m):
else:
std = 0.1
print(f'normal std : {std}')
-
torch.nn.init.normal_(m.weight, std=std)
- # if (m.out_channels == self.num_classes or m.out_channels == self.linear_out) :
- # self.last_xavier+=10
+
self.last_xavier += 1
if m.bias is not None:
print('bias zero init')
@@ -361,21 +352,21 @@ def _init_weights(self,m):


######################################################################
- # When training Vgg , the authors first train model A , then initialized
+ # When training VGG, the authors first train model A, then initialize
# the weights of other models with the weights of model A. Waiting for
# model A to be trained takes a long time. The authors mention how to
- # train with xavier initialization rather than initializing with the
+ # train with Xavier initialization rather than initializing with the
# weights of model A. But, they do not mention how to initialize.
#
- # | To Reproduce Vgg , we use xavier initialization method to initialize
- # weights. We apply initialization to few first layes and last layers.
+ # | To reproduce VGG, we use the Xavier initialization method to initialize
+ # weights. We apply this initialization to the first few layers and the last layers.
# Then, we apply random initialization to the other layers.
- # | **we must fix stdandrad deviation to 0.1**. If standard deviation is
+ # | **We must fix the standard deviation to 0.1**. If the standard deviation is
# larger than 0.1, the weights get NaN values. For stability, we use 0.1
# for the standard deviation.
- # | The ``front_xavier`` means how many layers we initialize with xavier
+ # | ``front_xavier`` means how many layers we initialize with Xavier
# initialization among the first layers, and ``last_xavier`` means how
- # many layers we initializae with xavier initialization in last of
+ # many layers we initialize with Xavier initialization among the last
# layers.
#
# In my experiment, we can use ``front_xavier`` = 4 , ``last_xavier``\ =5
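
######################################################################
# A condensed sketch of the policy described above: Xavier-initialize the first
# ``front_xavier`` and the last ``last_xavier`` weight layers, and draw every
# other weight from a normal distribution with the standard deviation fixed at
# 0.1. The counting below is deliberately simpler than the tutorial's
# ``_init_weights`` and is only meant to illustrate the idea.

import torch.nn as nn

def init_vgg_weights(model, front_xavier=4, last_xavier=5, std=0.1):
    # Collect the layers that carry weights, in forward order.
    weighted = [m for m in model.modules() if isinstance(m, (nn.Conv2d, nn.Linear))]
    n = len(weighted)
    for i, m in enumerate(weighted):
        if i < front_xavier or i >= n - last_xavier:
            nn.init.xavier_uniform_(m.weight)   # first/last layers: Xavier
        else:
            nn.init.normal_(m.weight, std=std)  # middle layers: normal, std 0.1
        if m.bias is not None:
            nn.init.zeros_(m.bias)              # biases start at zero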
@@ -406,17 +397,15 @@ def accuracy(output, target, topk=(1,)):

res = []
for k in topk:
- # print(f'top {k}')
correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
- # res.append(correct_k.mul_(100.0 / batch_size))
res.append(correct_k)
return res


######################################################################
# We initiate the model, loss function, optimizer, and schedulers. In
- # vgg , they use softmax output ,Momentum Optimizer , and Scheduling based
- # on accuarcy .
+ # VGG, they use a softmax output, a momentum optimizer, and scheduling based
+ # on accuracy.
#

model = Model_vgg(model_version, num_classes)
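
######################################################################
# Following that recipe, one plausible pairing is SGD with momentum plus a
# scheduler that divides the learning rate by 10 when validation accuracy stops
# improving. The momentum, weight decay, and factor follow the paper; the use
# of ``ReduceLROnPlateau`` and the patience value are assumptions about how the
# scheduling can be expressed in PyTorch.

criterion = torch.nn.CrossEntropyLoss()   # applies log-softmax + NLL internally
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2,
                            momentum=0.9, weight_decay=5e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max',
                                                       factor=0.1, patience=2)

# After each evaluation pass, step the scheduler on the validation accuracy:
# scheduler.step(val_top1_accuracy)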
@@ -440,9 +429,7 @@ def accuracy(output, target, topk=(1,)):
[
A.Normalize(mean=(0.5071, 0.4867, 0.4408), std=(0.2675, 0.2565, 0.2761)),
A.SmallestMaxSize(max_size=val_data.S),
- A.CenterCrop(height=224, width=224),
- # A.HorizontalFlip(),
- # A.RGBShift()
+ A.CenterCrop(height=224, width=224)
]

)
@@ -492,7 +479,6 @@ def accuracy(output, target, topk=(1,)):
if i > 0 and i % update_count == 0:
print(f'Training steps : {i} parameter update loss :{total_loss}')
if grad_clip is not None:
- # print(f'Training steps : {i} parameter grad clip to {grad_clip}')
torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
optimizer.step()
optimizer.zero_grad(set_to_none=True)
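
######################################################################
# Put together, the update logic in this hunk corresponds roughly to the
# accumulation loop sketched below: scale each mini-batch loss by the number of
# accumulation steps, step the optimizer only every ``update_count`` batches,
# and clip gradients right before the step. ``train_loader`` and ``criterion``
# are assumed names, and the loss scaling is a common convention that the
# tutorial may handle differently.

model.train()
optimizer.zero_grad(set_to_none=True)
for i, (images, labels) in enumerate(train_loader):
    outputs = model(images)
    loss = criterion(outputs, labels) / update_count  # keep gradients on the 256-batch scale
    loss.backward()
    if i > 0 and i % update_count == 0:
        if grad_clip is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)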
@@ -594,8 +580,7 @@ def __init__(self,root,transform = None,multi=False,s_max=None,s_min=256,split=N
A.Normalize(),
A.SmallestMaxSize(max_size=self.S),
A.RandomCrop(height=224, width=224),
- A.HorizontalFlip(),
- # A.RGBShift()
+ A.HorizontalFlip()
]

)
@@ -644,17 +629,15 @@ def __getitem__(self, index: int) :
[
A.Normalize(),
A.SmallestMaxSize(max_size=val_data.S),
- A.CenterCrop(height=224, width=224),
- # A.HorizontalFlip(),
- # A.RGBShift()
+ A.CenterCrop(height=224, width=224)
]

)

######################################################################
# Conclusion
# ----------
- # We have seen how pretraining VGG from scratch . This Tutorial will be helpful to reproduce another Foundation Model .
+ # We have seen how to pretrain VGG from scratch. This tutorial will be helpful for reproducing other foundation models.

######################################################################
# More things to try
@@ -668,5 +651,5 @@ def __getitem__(self, index: int) :

# Further Reading
# ---------------
- # - `VGG training using python script <https://github.com/woongjoonchoi/DeepLearningPaper-Reproducing/tree/master/Vgg >`__
+ # - `VGG training using python script <https://github.com/woongjoonchoi/DeepLearningPaper-Reproducing/tree/master/VGG >`__
# - `VGG paper <https://arxiv.org/abs/1409.1556>`__