Commit 1ca581f

🚲 Added gradient_accumulation_steps to all configs, and a clearer note for the batch_size parameter.
1 parent 3e1788b

18 files changed: +48 −32 lines


examples/fastspeech/conf/fastspeech.v1.yaml
Lines changed: 3 additions & 2 deletions

@@ -46,7 +46,7 @@ fastspeech_params:
 ###########################################################
 #                   DATA LOADER SETTING                   #
 ###########################################################
-batch_size: 16              # Batch size.
+batch_size: 16              # Batch size for each GPU with asuming that gradient_accumulation_steps is 1
 remove_short_samples: true  # Whether to remove samples the length of which are less than batch_max_steps.
 allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
 mel_length_threshold: 32    # remove all targets has mel_length <= 32
@@ -60,7 +60,8 @@ optimizer_params:
     decay_steps: 150000     # < train_max_steps is recommend.
     warmup_proportion: 0.02
     weight_decay: 0.001
-
+
+gradient_accumulation_steps: 1
 var_train_expr: null        # trainable variable expr (eg. 'embeddings|encoder|decoder' )
                             # must separate by |. if var_train_expr is null then we
                             # training all variable
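Every remaining config below receives the same two edits, so it is worth spelling out what the new knob does: the trainer runs gradient_accumulation_steps micro-batches of batch_size samples each, sums their gradients, and only then applies a single optimizer update, so the effective batch per GPU is batch_size × gradient_accumulation_steps. The following is a minimal sketch of that technique, assuming TensorFlow 2.x; it is not the repository's Trainer code, and model, optimizer, loss_fn, and dataset_iter are hypothetical placeholders.

    import tensorflow as tf

    def accumulated_train_step(model, optimizer, loss_fn, dataset_iter,
                               accum_steps=1):
        # One zero-initialized accumulator per trainable variable.
        accum_grads = [tf.zeros_like(v) for v in model.trainable_variables]
        total_loss = 0.0
        for _ in range(accum_steps):
            x, y = next(dataset_iter)  # one micro-batch of `batch_size` samples
            with tf.GradientTape() as tape:
                # Dividing by accum_steps makes the accumulated (summed)
                # gradient equal the mean gradient over the effective batch.
                loss = loss_fn(y, model(x, training=True)) / accum_steps
            grads = tape.gradient(loss, model.trainable_variables)
            accum_grads = [a if g is None else a + g
                           for a, g in zip(accum_grads, grads)]
            total_loss += loss
        # A single optimizer update per accum_steps micro-batches.
        optimizer.apply_gradients(zip(accum_grads, model.trainable_variables))
        return total_loss

With the default gradient_accumulation_steps: 1 added by this commit, the loop runs once and training behaves exactly as before.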

examples/fastspeech/conf/fastspeech.v3.yaml
Lines changed: 3 additions & 2 deletions

@@ -46,7 +46,7 @@ fastspeech_params:
 ###########################################################
 #                   DATA LOADER SETTING                   #
 ###########################################################
-batch_size: 16              # Batch size.
+batch_size: 16              # Batch size for each GPU with assuming that gradient_accumulation_steps == 1.
 remove_short_samples: true  # Whether to remove samples the length of which are less than batch_max_steps.
 allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
 mel_length_threshold: 32    # remove all targets has mel_length <= 32
@@ -60,7 +60,8 @@ optimizer_params:
     decay_steps: 150000     # < train_max_steps is recommend.
     warmup_proportion: 0.02
     weight_decay: 0.001
-
+
+gradient_accumulation_steps: 1
 var_train_expr: null        # trainable variable expr (eg. 'embeddings|encoder|decoder' )
                             # must separate by |. if var_train_expr is null then we
                             # training all variable

examples/fastspeech2/conf/fastspeech2.baker.v2.yaml
Lines changed: 3 additions & 2 deletions

@@ -48,7 +48,7 @@ fastspeech2_params:
 ###########################################################
 #                   DATA LOADER SETTING                   #
 ###########################################################
-batch_size: 16              # Batch size.
+batch_size: 16              # Batch size for each GPU with assuming that gradient_accumulation_steps == 1.
 remove_short_samples: true  # Whether to remove samples the length of which are less than batch_max_steps.
 allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
 mel_length_threshold: 32    # remove all targets has mel_length <= 32
@@ -62,7 +62,8 @@ optimizer_params:
     decay_steps: 150000     # < train_max_steps is recommend.
     warmup_proportion: 0.02
     weight_decay: 0.001
-
+
+gradient_accumulation_steps: 1
 var_train_expr: null        # trainable variable expr (eg. 'embeddings|encoder|decoder' )
                             # must separate by |. if var_train_expr is null then we
                             # training all variable

examples/fastspeech2/conf/fastspeech2.kss.v1.yaml
Lines changed: 3 additions & 2 deletions

@@ -47,7 +47,7 @@ fastspeech2_params:
 ###########################################################
 #                   DATA LOADER SETTING                   #
 ###########################################################
-batch_size: 16              # Batch size.
+batch_size: 16              # Batch size for each GPU with assuming that gradient_accumulation_steps == 1.
 remove_short_samples: true  # Whether to remove samples the length of which are less than batch_max_steps.
 allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
 mel_length_threshold: 32    # remove all targets has mel_length <= 32
@@ -61,7 +61,8 @@ optimizer_params:
     decay_steps: 150000     # < train_max_steps is recommend.
     warmup_proportion: 0.02
     weight_decay: 0.001
-
+
+gradient_accumulation_steps: 1
 var_train_expr: null        # trainable variable expr (eg. 'embeddings|encoder|decoder' )
                             # must separate by |. if var_train_expr is null then we
                             # training all variable

examples/fastspeech2/conf/fastspeech2.kss.v2.yaml
Lines changed: 3 additions & 2 deletions

@@ -48,7 +48,7 @@ fastspeech2_params:
 ###########################################################
 #                   DATA LOADER SETTING                   #
 ###########################################################
-batch_size: 16              # Batch size.
+batch_size: 16              # Batch size for each GPU with assuming that gradient_accumulation_steps == 1.
 remove_short_samples: true  # Whether to remove samples the length of which are less than batch_max_steps.
 allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
 mel_length_threshold: 32    # remove all targets has mel_length <= 32
@@ -62,7 +62,8 @@ optimizer_params:
     decay_steps: 150000     # < train_max_steps is recommend.
     warmup_proportion: 0.02
     weight_decay: 0.001
-
+
+gradient_accumulation_steps: 1
 var_train_expr: null        # trainable variable expr (eg. 'embeddings|encoder|decoder' )
                             # must separate by |. if var_train_expr is null then we
                             # training all variable

examples/fastspeech2/conf/fastspeech2.v1.yaml
Lines changed: 3 additions & 2 deletions

@@ -46,7 +46,7 @@ fastspeech2_params:
 ###########################################################
 #                   DATA LOADER SETTING                   #
 ###########################################################
-batch_size: 16              # Batch size.
+batch_size: 16              # Batch size for each GPU with assuming that gradient_accumulation_steps == 1.
 remove_short_samples: true  # Whether to remove samples the length of which are less than batch_max_steps.
 allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
 mel_length_threshold: 32    # remove all targets has mel_length <= 32
@@ -60,7 +60,8 @@ optimizer_params:
     decay_steps: 150000     # < train_max_steps is recommend.
     warmup_proportion: 0.02
     weight_decay: 0.001
-
+
+gradient_accumulation_steps: 1
 var_train_expr: null        # trainable variable expr (eg. 'embeddings|encoder|decoder' )
                             # must separate by |. if var_train_expr is null then we
                             # training all variable

examples/fastspeech2/conf/fastspeech2.v2.yaml
Lines changed: 3 additions & 2 deletions

@@ -47,7 +47,7 @@ fastspeech2_params:
 ###########################################################
 #                   DATA LOADER SETTING                   #
 ###########################################################
-batch_size: 16              # Batch size.
+batch_size: 16              # Batch size for each GPU with assuming that gradient_accumulation_steps == 1
 remove_short_samples: true  # Whether to remove samples the length of which are less than batch_max_steps.
 allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
 mel_length_threshold: 32    # remove all targets has mel_length <= 32
@@ -61,7 +61,8 @@ optimizer_params:
     decay_steps: 150000     # < train_max_steps is recommend.
     warmup_proportion: 0.02
     weight_decay: 0.001
-
+
+gradient_accumulation_steps: 1
 var_train_expr: null        # trainable variable expr (eg. 'embeddings|encoder|decoder' )
                             # must separate by |. if var_train_expr is null then we
                             # training all variable

examples/fastspeech2_libritts/conf/fastspeech2libritts.yaml
Lines changed: 3 additions & 2 deletions

@@ -46,7 +46,7 @@ fastspeech2_params:
 ###########################################################
 #                   DATA LOADER SETTING                   #
 ###########################################################
-batch_size: 32              # Batch size.
+batch_size: 32              # Batch size for each GPU with assuming that gradient_accumulation_steps == 1.
 remove_short_samples: true  # Whether to remove samples the length of which are less than batch_max_steps.
 allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
 mel_length_threshold: 48    # remove all targets has mel_length <= 32
@@ -60,7 +60,8 @@ optimizer_params:
     decay_steps: 120000     # < train_max_steps is recommend.
     warmup_proportion: 0.02
     weight_decay: 0.001
-
+
+gradient_accumulation_steps: 1
 var_train_expr: null        # trainable variable expr (eg. 'embeddings|encoder|decoder' )
                             # must separate by |. if var_train_expr is null then we
                             # training all variable

examples/melgan.stft/conf/melgan.stft.v1.yaml
Lines changed: 2 additions & 2 deletions

@@ -63,7 +63,7 @@ lambda_adv: 4.0
 ###########################################################
 #                   DATA LOADER SETTING                   #
 ###########################################################
-batch_size: 16                # Batch size.
+batch_size: 16                # Batch size for each GPU with assuming that gradient_accumulation_steps == 1.
 batch_max_steps: 8192         # Length of each audio in batch for training. Make sure dividable by hop_size.
 batch_max_steps_valid: 81920  # Length of each audio for validation. Make sure dividable by hope_size.
 remove_short_samples: true    # Whether to remove samples the length of which are less than batch_max_steps.
@@ -86,7 +86,7 @@ discriminator_optimizer_params:
     boundaries: [0]           # after resume and start training discriminator, global steps is 100k, but local discriminator step is 0
     values: [0.0001, 0.0001]  # learning rate each interval.
 
-
+gradient_accumulation_steps: 1
 ###########################################################
 #                    INTERVAL SETTING                     #
 ###########################################################

examples/melgan/conf/melgan.v1.yaml
Lines changed: 2 additions & 1 deletion

@@ -53,7 +53,7 @@ lambda_feat_match: 10.0
 ###########################################################
 #                   DATA LOADER SETTING                   #
 ###########################################################
-batch_size: 16                # Batch size.
+batch_size: 16                # Batch size for each GPU with assuming that gradient_accumulation_steps == 1.
 batch_max_steps: 8192         # Length of each audio in batch for training. Make sure dividable by hop_size.
 batch_max_steps_valid: 81920  # Length of each audio for validation. Make sure dividable by hope_size.
 remove_short_samples: true    # Whether to remove samples the length of which are less than batch_max_steps.
@@ -73,6 +73,7 @@ discriminator_optimizer_params:
     beta_1: 0.5
     beta_2: 0.9
 
+gradient_accumulation_steps: 1
 ###########################################################
 #                    INTERVAL SETTING                     #
 ###########################################################
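As a closing sanity check, the two knobs compose multiplicatively with the number of replicas. The arithmetic below is a hypothetical back-of-the-envelope example; only the batch_size default comes from the configs above, the other numbers are made up for illustration.

    batch_size = 16                  # per-GPU micro-batch (from the configs)
    gradient_accumulation_steps = 2  # raise this, not batch_size, when GPU
                                     # memory is the bottleneck
    num_gpus = 2                     # e.g. replicas under a mirrored strategy
    effective_batch = batch_size * gradient_accumulation_steps * num_gpus
    print(effective_batch)           # -> 64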
