
Commit dab3a49

wav2vec2 finetuning in PyTorch for Audio Recognition
1 parent 8dc94a3 commit dab3a49

File tree

12 files changed: +334 −125 lines


.vscode/launch.json

Lines changed: 2 additions & 1 deletion
```diff
@@ -10,7 +10,8 @@
             "request": "launch",
             "program": "${file}",
             "console": "integratedTerminal",
-            "justMyCode": false
+            "justMyCode": false,
+            "subProcess": true,
         }
     ]
}
```

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
```diff
@@ -1,3 +1,13 @@
+## [1.1.1] - 2022-09-26
+### Changed
+- Included `self._executor` as a generator in the `mltu.dataProvider.DataProvider` object, to enable modifying batch preprocessing without changing the original code
+- Introduced changes in `mltu.torch.dataProvider.py` to handle data in multiprocessing and multithreading modes, for faster preprocessing while training torch models
+- Modified the `mltu.transformers.AudioPadding` object to work with batches of raw audio data
+
+### Added
+- Created tutorial `10_wav2vec2_torch` (Audio to Text model) that shows how to train a wav2vec2 model with mltu
+
+
 ## [1.1.0] - 2022-08-28
 ### Changed
 - Changed `mltu.transformers.SpectrogramPadding` object, to pad spectrogram end with zeros instead of start
@@ -10,6 +20,7 @@
 - Created `mltu.tensorflow.transformer.callbacks` module, that contains `EncDecSplitCallback` callback, to split Transformer model into separate encoder and decoder models
 - Created `mltu.tensorflow.transformer.utils` module, that contains `MaskedLoss` loss and `MaskedAccuracy` metric, used for training Transformer models
 
+
 ## [1.0.15] - 2022-07-15
 ### Changed
 - Fixed bug in `mltu.dataProvider.DataProvider` to work with `batch_postprocessors`.
```
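
The `AudioPadding` change above is easiest to see in isolation. A minimal sketch of the batch mode, assuming the usual mltu transformer `__call__(data, label)` interface and the constructor arguments used in `train.py` below (the arrays and lengths are made up for illustration):

```python
import numpy as np
from mltu.transformers import AudioPadding

# Hypothetical batch: two variable-length raw audio clips (16 kHz, float32)
batch_audio = [
    np.random.randn(120000).astype(np.float32),
    np.random.randn(98000).astype(np.float32),
]
batch_labels = [np.zeros(256), np.zeros(256)]  # placeholder padded labels

# use_on_batch=True pads the whole batch at once (used as a batch_postprocessor)
padder = AudioPadding(max_audio_length=246000, padding_value=0, use_on_batch=True)
padded_audio, padded_labels = padder(batch_audio, batch_labels)

# Every clip now has the same length, so the batch stacks cleanly
print(np.array(padded_audio).shape)  # expected: (2, 246000)
```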

Tutorials/10_wav2vec2_torch/configs.py

Lines changed: 5 additions & 3 deletions
```diff
@@ -10,15 +10,17 @@ def __init__(self):
             "Models/10_wav2vec2_torch",
             datetime.strftime(datetime.now(), "%Y%m%d%H%M"),
         )
-        self.batch_size = 6
+        self.batch_size = 8
         self.train_epochs = 60
         self.train_workers = 20
 
-        self.init_lr = 1.0e-7
+        self.init_lr = 1.0e-8
         self.lr_after_warmup = 1e-05
         self.final_lr = 5e-06
-        self.warmup_epochs = 5
+        self.warmup_epochs = 10
         self.decay_epochs = 40
+        self.weight_decay = 0.005
+        self.mixed_precision = True
 
         self.max_audio_length = 246000
         self.max_label_length = 256
```

Tutorials/10_wav2vec2_torch/test.py

Lines changed: 4 additions & 36 deletions
```diff
@@ -1,10 +1,7 @@
-import cv2
-import typing
 import numpy as np
 
 from mltu.inferenceModel import OnnxInferenceModel
 from mltu.utils.text_utils import ctc_decoder, get_cer, get_wer
-from mltu.preprocessors import AudioReader
 
 class Wav2vec2(OnnxInferenceModel):
     def __init__(self, *args, **kwargs):
@@ -21,48 +18,19 @@ def predict(self, audio: np.ndarray):
         return text
 
 if __name__ == "__main__":
+    import librosa
     import pandas as pd
     from tqdm import tqdm
-    import onnxruntime as ort
 
-    # model_path = "Models/11_wav2vec2_torch/202309131152/model.onnx"
-    # session = ort.InferenceSession(model_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
-
-    # audio_len = 246000
-    # # Prepare input data (replace 'input' with the actual input name)
-    # input_data = {'input': np.random.randn(1, audio_len).astype(np.float32)}
-
-    # # Run inference
-    # output = session.run(None, input_data)
-
-    model = Wav2vec2(model_path="Models/11_wav2vec2_torch/202309141138/model.onnx")
+    model = Wav2vec2(model_path="Models/10_wav2vec2_torch/202309171434/model.onnx")
 
     # The list of multiple [audio_path, label] for validation
-    val_dataset = pd.read_csv("Models/11_wav2vec2_torch/202309141138/val.csv").values.tolist()
-
-    # model.vocab = [' ', "'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
-    audioReader = AudioReader(sample_rate=16000)
-
-    # dataset_path = "Datasets/LJSpeech-1.1"
-    # metadata_path = dataset_path + "/metadata.csv"
-    # wavs_path = dataset_path + "/wavs/"
-
-    # # Read metadata file and parse it
-    # metadata_df = pd.read_csv(metadata_path, sep="|", header=None, quoting=3)
-    # dataset = []
-    # # vocab = [' ', "'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
-    # for file_name, transcription, normalized_transcription in metadata_df.values.tolist():
-    #     path = f"Datasets/LJSpeech-1.1/wavs/{file_name}.wav"
-    #     new_label = "".join([l for l in normalized_transcription.lower() if l in model.vocab])
-    #     dataset.append([path, new_label])
-
+    val_dataset = pd.read_csv("Models/10_wav2vec2_torch/202309171434/val.csv").values.tolist()
 
     accum_cer, accum_wer = [], []
    pbar = tqdm(val_dataset)
    for vaw_path, label in pbar:
-        audio, label = audioReader(vaw_path, label)
+        audio, sr = librosa.load(vaw_path, sr=16000)
 
         prediction_text = model.predict(audio)
```
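The `predict` body sits outside the hunk above, so for orientation here is a hedged reconstruction of what it boils down to; the attribute names (`self.model`, `self.input_names`, `self.metadata["vocab"]`) are assumptions based on typical `OnnxInferenceModel` subclasses in mltu, not taken from this commit:

```python
# Hedged sketch of the predict() path (not shown in this diff):
# run the ONNX session on a batched float32 waveform, then CTC-decode.
def predict(self, audio: np.ndarray):
    # the exported model expects a (1, num_samples) float32 batch
    audio = np.expand_dims(audio, axis=0).astype(np.float32)

    # self.model / self.input_names are assumed OnnxInferenceModel attributes
    preds = self.model.run(None, {self.input_names[0]: audio})[0]

    # greedy CTC decode of the per-frame log-probabilities
    text = ctc_decoder(preds, self.metadata["vocab"])[0]
    return text
```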

Tutorials/10_wav2vec2_torch/train.py

Lines changed: 18 additions & 20 deletions
```diff
@@ -67,19 +67,21 @@ def download_and_unzip(url, extract_to="Datasets", chunk_size=1024*1024):
         LabelIndexer(vocab),
         LabelPadding(max_word_length=configs.max_label_length, padding_value=len(vocab)),
     ],
-    use_cache=True,
-    use_multiprocessing=False,
+    use_cache=False,
     batch_postprocessors=[
         AudioPadding(max_audio_length=configs.max_audio_length, padding_value=0, use_on_batch=True)
-    ]
+    ],
+    use_multiprocessing=True,
+    max_queue_size=10,
+    workers=64,
 )
-
 train_dataProvider, test_dataProvider = data_provider.split(split=0.9)
-train_dataProvider.augmentors = [
-    RandomAudioNoise(),
-    RandomAudioPitchShift(),
-    RandomAudioTimeStretch()
-]
+
+# train_dataProvider.augmentors = [
+#     RandomAudioNoise(),
+#     RandomAudioPitchShift(),
+#     RandomAudioTimeStretch()
+# ]
 
 vocab = sorted(vocab)
 configs.vocab = vocab
@@ -90,17 +92,11 @@ class CustomWav2Vec2Model(nn.Module):
     def __init__(self, hidden_states, dropout_rate=0.2, **kwargs):
         super(CustomWav2Vec2Model, self).__init__(**kwargs)
         pretrained_name = "facebook/wav2vec2-base-960h"
-        self.model = Wav2Vec2ForCTC.from_pretrained(pretrained_name).wav2vec2
-        # self.model.freeze_feature_encoder()
-        self.dropout = nn.Dropout(p=dropout_rate)
-        self.linear = nn.Linear(self.model.config.hidden_size, hidden_states)
+        self.model = Wav2Vec2ForCTC.from_pretrained(pretrained_name, vocab_size=hidden_states, ignore_mismatched_sizes=True)
+        self.model.freeze_feature_encoder()  # this part does not need to be fine-tuned
 
     def forward(self, inputs):
-        output = self.model(inputs, attention_mask=None).last_hidden_state
-        # Apply dropout
-        output = self.dropout(output)
-        # Apply linear layer
-        output = self.linear(output)
+        output = self.model(inputs, attention_mask=None).logits
         # Apply softmax
         output = F.log_softmax(output, -1)
         return output
@@ -118,6 +114,7 @@ def forward(self, inputs):
     decay_epochs=configs.decay_epochs,
     final_lr=configs.final_lr,
     initial_lr=configs.init_lr,
+    verbose=True,
 )
 tb_callback = TensorBoard(configs.model_path + "/logs")
 earlyStopping = EarlyStopping(monitor="val_CER", patience=16, mode="min", verbose=1)
@@ -133,12 +130,13 @@ def forward(self, inputs):
 # create model object that will handle training and testing of the network
 model = Model(
     custom_model,
-    loss = CTCLoss(blank=len(configs.vocab)),
-    optimizer = torch.optim.AdamW(custom_model.parameters(), lr=configs.init_lr, weight_decay=1e-5),
+    loss = CTCLoss(blank=len(configs.vocab), zero_infinity=True),
+    optimizer = torch.optim.AdamW(custom_model.parameters(), lr=configs.init_lr, weight_decay=configs.weight_decay),
     metrics=[
         CERMetric(configs.vocab),
         WERMetric(configs.vocab)
     ],
+    mixed_precision=configs.mixed_precision,
 )
 
 # Save training and validation datasets as csv files
```
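
The refactor above is worth spelling out: instead of taking the bare `.wav2vec2` encoder and stacking a custom `Dropout` + `Linear` head, the new code asks Hugging Face to build `Wav2Vec2ForCTC` with the target vocabulary size; `ignore_mismatched_sizes=True` lets the pretrained checkpoint load while the `lm_head` is re-initialized for the new vocab, and `freeze_feature_encoder()` freezes only the convolutional feature extractor. A self-contained sketch (the vocab size and input length are illustrative):

```python
import torch
import torch.nn.functional as F
from transformers import Wav2Vec2ForCTC

vocab_size = 29 + 1  # e.g. 29 characters + CTC blank (illustrative)

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base-960h",
    vocab_size=vocab_size,
    ignore_mismatched_sizes=True,  # lm_head is re-initialized for the new vocab
)
model.freeze_feature_encoder()  # only the CNN feature extractor is frozen

waveform = torch.randn(1, 246000)  # dummy batch of raw 16 kHz audio
log_probs = F.log_softmax(model(waveform).logits, dim=-1)
print(log_probs.shape)  # (1, num_frames, vocab_size) - per-frame inputs for CTCLoss
```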

Tutorials/10_wav2vec2_torch/train_tf.py

Lines changed: 12 additions & 16 deletions
```diff
@@ -24,12 +24,12 @@
 from configs import ModelConfigs
 
 configs = ModelConfigs()
-from transformers import TFWav2Vec2Model
+from transformers import TFWav2Vec2ForCTC
 from mltu.preprocessors import AudioReader
 
 
-train_dataset = pd.read_csv("Models/11_wav2vec2_torch/202309141138/train.csv").values.tolist()
-validation_dataset = pd.read_csv("Models/11_wav2vec2_torch/202309141138/val.csv").values.tolist()
+train_dataset = pd.read_csv("Models/10_wav2vec2_torch/202309171434/train.csv").values.tolist()
+validation_dataset = pd.read_csv("Models/10_wav2vec2_torch/202309171434/val.csv").values.tolist()
 
 # Create a data provider for the dataset
 train_dataProvider = DataProvider(
@@ -71,28 +71,24 @@
     use_cache=True,
 )
 
-class TFWav2Vec2ForCTC(layers.Layer):
-    def __init__(self, output_dim, dropout_rate=0.2, **kwargs):
+class CustomWav2Vec2Model(layers.Layer):
+    def __init__(self, output_dim, **kwargs):
         super().__init__(**kwargs)
 
-        self.wav2vec2 = TFWav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
-        self.dropout = layers.Dropout(dropout_rate)
-        self.final_layer = layers.Dense(output_dim, activation="softmax")
+        pretrained_name = "facebook/wav2vec2-base-960h"
+        self.model = TFWav2Vec2ForCTC.from_pretrained(pretrained_name, vocab_size=output_dim, ignore_mismatched_sizes=True)
+        self.model.freeze_feature_encoder()  # https://huggingface.co/blog/fine-tune-wav2vec2-english
 
     def __call__(self, inputs):
-        outputs = self.wav2vec2(inputs)
+        outputs = self.model(inputs)
 
-        hidden_states = outputs.last_hidden_state
-
-        dropout = self.dropout(hidden_states)
-
-        final_state = self.final_layer(dropout)
+        final_state = tf.nn.softmax(outputs.logits, axis=-1)
 
         return final_state
 
 custom_model = tf.keras.Sequential([
     layers.Input(shape=(None,), name="input", dtype=tf.float32),
-    TFWav2Vec2ForCTC(len(configs.vocab)+1, dropout_rate=0.2)
+    CustomWav2Vec2Model(len(configs.vocab)+1)
 ])
 
 for data in train_dataProvider:
@@ -105,7 +101,7 @@ def __call__(self, inputs):
 
 # Compile the model and print summary
 custom_model.compile(
-    optimizer=tf.keras.optimizers.AdamW(learning_rate=configs.init_lr, weight_decay=1e-5),
+    optimizer=tf.keras.optimizers.AdamW(learning_rate=configs.init_lr, weight_decay=configs.weight_decay),
     loss=CTCloss(),
     metrics=[
         CERMetric(vocabulary=configs.vocab),
```
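One detail to flag in this layer: it overrides `__call__` directly, which sidesteps Keras's own `Layer.__call__` bookkeeping (build, name scopes, masking). The conventional spelling, behaviour otherwise the same here, is to override `call` and let Keras wrap it — a sketch of the same layer in that form:

```python
import tensorflow as tf
from tensorflow.keras import layers
from transformers import TFWav2Vec2ForCTC

class CustomWav2Vec2Model(layers.Layer):
    def __init__(self, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.model = TFWav2Vec2ForCTC.from_pretrained(
            "facebook/wav2vec2-base-960h",
            vocab_size=output_dim,
            ignore_mismatched_sizes=True,
        )
        self.model.freeze_feature_encoder()

    def call(self, inputs):  # Keras invokes this through Layer.__call__
        # per-frame probabilities over the vocabulary, as expected by CTCloss
        return tf.nn.softmax(self.model(inputs).logits, axis=-1)
```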

mltu/__init__.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,4 +1,4 @@
-__version__ = "1.1.0"
+__version__ = "1.1.1"
 
 from .annotations.images import Image
 from .annotations.images import CVImage
```

mltu/dataProvider.py

Lines changed: 23 additions & 9 deletions
```diff
@@ -205,10 +205,21 @@ def get_batch_annotations(self, index: int) -> typing.List:
 
         return batch_annotations
 
+    def start_executor(self) -> None:
+        """ Start the executor to process data """
+        def executor(batch_data):
+            for data in batch_data:
+                yield self.process_data(data)
+
+        if not hasattr(self, "_executor"):
+            self._executor = executor
+
     def __iter__(self):
         """ Create a generator that iterates over the Sequence."""
-        for item in (self[i] for i in range(len(self))):
-            yield item
+        self.start_executor()
+        for index in range(len(self)):
+            results = self[index]
+            yield results
 
     def process_data(self, batch_data):
         """ Process a batch of data """
@@ -250,19 +261,22 @@ def process_data(self, batch_data):
         return data, annotation
 
     def __getitem__(self, index: int):
-        """ Returns a batch of data by batch index"""
+        """ Returns a batch of processed data by index
+
+        Args:
+            index (int): index of batch
+
+        Returns:
+            tuple: batch of data and batch of annotations
+        """
         dataset_batch = self.get_batch_annotations(index)
 
         # First read and preprocess the batch data
         batch_data, batch_annotations = [], []
-        for index, batch in enumerate(dataset_batch):
-
-            data, annotation = self.process_data(batch)
-
+        for data, annotation in self._executor(dataset_batch):
             if data is None or annotation is None:
                 self.logger.warning("Data or annotation is None, skipping.")
                 continue
-
             batch_data.append(data)
             batch_annotations.append(annotation)
 
@@ -272,4 +286,4 @@ def __getitem__(self, index: int):
 
         return batch_data, batch_annotations
 
-        return np.array(batch_data), np.array(batch_annotations)
+        return np.array(batch_data), np.array(batch_annotations)
```
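
The practical payoff of routing `__getitem__` through `self._executor` (the changelog's "modify batch preprocessing without changing original code") is that a subclass can install its own generator before the default one is created. A minimal sketch of that extension point, using only the interface shown above (the subclass name and hook are hypothetical):

```python
from mltu.dataProvider import DataProvider

class CustomExecutorDataProvider(DataProvider):
    """Hypothetical subclass: replaces the default per-item generator."""

    def start_executor(self) -> None:
        def executor(batch_data):
            # custom batch-level hook could go here (filtering, timing, ...)
            for data in batch_data:
                yield self.process_data(data)

        # install unconditionally, before the hasattr() default would apply
        self._executor = executor
```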

mltu/torch/callbacks.py

Lines changed: 23 additions & 2 deletions
```diff
@@ -440,6 +440,7 @@ class WarmupCosineDecay(Callback):
         decay_epochs (int): Number of decay epochs
         initial_lr (float, optional): Initial learning rate. Defaults to 0.0.
         verbose (bool, optional): Whether to print learning rate. Defaults to False.
+        warmup_steps (int, optional): Number of warmup steps. Defaults to None.
     """
     def __init__(
         self,
@@ -448,6 +449,7 @@ def __init__(
         warmup_epochs: int,
         decay_epochs: int,
         initial_lr: float=0.0,
+        warmup_steps: int=None,
         verbose=False
     ) -> None:
         super(WarmupCosineDecay, self).__init__()
@@ -456,24 +458,43 @@ def __init__(
         self.warmup_epochs = warmup_epochs
         self.decay_epochs = decay_epochs
         self.initial_lr = initial_lr
+        self.warmup_steps = warmup_steps
         self.verbose = verbose
+        self.step = None
+
+        self.warmup_lrs = np.linspace(self.initial_lr, self.lr_after_warmup, self.warmup_epochs)
+        if warmup_steps:
+            self.step = 0
+            self.warmup_epochs = 0
+            self.warmup_lrs = np.linspace(self.initial_lr, self.lr_after_warmup, warmup_steps)
 
     def on_epoch_begin(self, epoch: int, logs: dict=None):
         """ Adjust learning rate at the beginning of each epoch """
 
+        if self.warmup_steps:
+            return logs
+
         if epoch >= self.warmup_epochs + self.decay_epochs:
             return logs
 
         if epoch <= self.warmup_epochs:
-            lr = np.linspace(self.initial_lr, self.lr_after_warmup, 5)[epoch-1]
+            lr = self.warmup_lrs[epoch-1]
         else:
             progress = (epoch - self.warmup_epochs) / self.decay_epochs
             lr = self.final_lr + 0.5 * (self.lr_after_warmup - self.final_lr) * (1 + np.cos(np.pi * progress))
 
         self.model.optimizer.param_groups[0]["lr"] = lr
 
         if self.verbose:
-            print(f"Epoch {epoch + 1} - Learning Rate: {lr}")
+            self.logger.info(f"Epoch {epoch} - Learning Rate: {lr}")
+
+    def on_train_batch_begin(self, batch: int, logs: dict=None):
+        if self.warmup_steps and self.step is not None:
+            if self.step < self.warmup_steps:
+                self.model.optimizer.param_groups[0]["lr"] = self.warmup_lrs[self.step]
+                self.step += 1
+            else:
+                self.step = None
 
     def on_epoch_end(self, epoch: int, logs: dict=None):
         logs = logs or {}
```
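
Standalone, the schedule this callback implements is easy to sanity-check: a linear warmup from `initial_lr` to `lr_after_warmup`, then a half-cosine decay down to `final_lr`. A small sketch reproducing the formulas above with this commit's config values (`init_lr=1e-8`, `lr_after_warmup=1e-5`, `final_lr=5e-6`, `warmup_epochs=10`, `decay_epochs=40`):

```python
import numpy as np

initial_lr, lr_after_warmup, final_lr = 1e-8, 1e-5, 5e-6
warmup_epochs, decay_epochs = 10, 40

warmup_lrs = np.linspace(initial_lr, lr_after_warmup, warmup_epochs)

def lr_at(epoch: int) -> float:
    # mirrors on_epoch_begin above (epoch is 1-based, hence epoch-1)
    if epoch <= warmup_epochs:
        return warmup_lrs[epoch - 1]
    progress = (epoch - warmup_epochs) / decay_epochs
    return final_lr + 0.5 * (lr_after_warmup - final_lr) * (1 + np.cos(np.pi * progress))

for epoch in (1, 10, 30, 50):
    print(epoch, f"{lr_at(epoch):.2e}")
# 1  1.00e-08   (start of warmup)
# 10 1.00e-05   (peak after warmup)
# 30 7.50e-06   (halfway down the cosine)
# 50 5.00e-06   (final_lr)
```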
