
Commit 40c4106

Merge branch 'develop'
2 parents a6acf47 + 30d5280

File tree

25 files changed: 1368 additions & 183 deletions

.vscode/launch.json

Lines changed: 2 additions & 1 deletion
```diff
@@ -10,7 +10,8 @@
             "request": "launch",
             "program": "${file}",
             "console": "integratedTerminal",
-            "justMyCode": false
+            "justMyCode": false,
+            "subProcess": true,
         }
     ]
 }
```
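
The added `"subProcess": true` flag tells the VS Code Python debugger (debugpy) to attach to child processes as well, which is what makes the new multiprocessing data provider (see `mltu.torch.dataProvider` below) debuggable. A minimal, hypothetical script to verify the behavior: a breakpoint inside `worker()` should now be hit even though it runs in a separate process.

```python
# Hypothetical check (not part of this commit): launch under the
# configuration above and set a breakpoint inside worker(); with
# "subProcess": true the debugger follows the child processes.
import multiprocessing

def worker(x):
    return x * 2  # breakpoint here

if __name__ == "__main__":
    with multiprocessing.Pool(processes=2) as pool:
        print(pool.map(worker, [1, 2, 3]))
```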

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
```diff
@@ -1,3 +1,13 @@
+## [1.1.1] - 2022-09-26
+### Changed
+- Included `self._executor` as a generator in the `mltu.dataProvider.DataProvider` object, so batch preprocessing can be modified without changing the original code
+- Introduced changes in `mltu.torch.dataProvider.py` to handle data in multiprocessing and multithreading modes, for faster preprocessing while training torch models
+- Modified the `mltu.transformers.AudioPadding` object to work with batches of raw audio data
+
+### Added
+- Created tutorial `10_wav2vec2_torch` (Audio to Text model) that shows how to train a wav2vec2 model with mltu
+
+
 ## [1.1.0] - 2022-08-28
 ### Changed
 - Changed `mltu.transformers.SpectrogramPadding` object, to pad spectrogram end with zeros instead of start
@@ -10,6 +20,7 @@
 - Created `mltu.tensorflow.transformer.callbacks` module, that contains `EncDecSplitCallback` callback, to split Transformer model into separate encoder and decoder models
 - Created `mltu.tensorflow.transformer.utils` module, that contains `MaskedLoss` loss and `MaskedAccuracy` metric, used for training Transformer models
 
+
 ## [1.0.15] - 2022-07-15
 ### Changed
 - Fixed bug in `mltu.dataProvider.DataProvider` to work with `batch_postprocessors`.
```
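
The `AudioPadding` change is easiest to see as a batch operation: instead of padding one waveform at a time, it can now take a list of variable-length raw-audio arrays and return one fixed-shape array (it is used this way as a `batch_postprocessor` with `use_on_batch=True` in the new tutorial's train.py below). A rough numpy sketch of the idea, not mltu's exact implementation:

```python
import numpy as np

def pad_audio_batch(audios, max_audio_length, padding_value=0):
    # Sketch of the batched behavior: right-pad (and truncate) each
    # 1-D raw-audio array so the whole batch shares one shape.
    batch = np.full((len(audios), max_audio_length), padding_value, dtype=np.float32)
    for i, audio in enumerate(audios):
        length = min(len(audio), max_audio_length)
        batch[i, :length] = audio[:length]
    return batch
```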
Lines changed: 3 additions & 1 deletion
```diff
@@ -1 +1,3 @@
-beautifulsoup4
+beautifulsoup4
+tf2onnx==1.14.0
+onnx==1.12.0
```

Tutorials/09_translation_transformer/test.py

Lines changed: 9 additions & 6 deletions
```diff
@@ -1,4 +1,5 @@
 import numpy as np
+import time
 
 from mltu.tokenizers import CustomTokenizer
 from mltu.inferenceModel import OnnxInferenceModel
@@ -12,6 +13,7 @@ def __init__(self, *args, **kwargs):
         self.detokenizer = CustomTokenizer.load(self.metadata["detokenizer"])
 
     def predict(self, sentence):
+        start = time.time()
         tokenized_sentence = self.tokenizer.texts_to_sequences([sentence])[0]
         encoder_input = np.pad(tokenized_sentence, (0, self.tokenizer.max_length - len(tokenized_sentence)), constant_values=0).astype(np.int64)
 
@@ -30,8 +32,7 @@ def predict(self, sentence):
             break
 
         results = self.detokenizer.detokenize([tokenized_results])
-        return results[0]
-
+        return results[0], time.time() - start
 
 def read_files(path):
     with open(path, "r", encoding="utf-8") as f:
@@ -49,11 +50,13 @@ def read_files(path):
 max_lenght = 500
 val_examples = [[es_sentence, en_sentence] for es_sentence, en_sentence in zip(es_validation_data, en_validation_data) if len(es_sentence) <= max_lenght and len(en_sentence) <= max_lenght]
 
-translator = PtEnTranslator("Models/09_translation_transformer/202307241748/model.onnx")
+translator = PtEnTranslator("Models/09_translation_transformer/202308241514/model.onnx")
 
 val_dataset = []
 for es, en in val_examples:
-    results = translator.predict(es)
-    print(en)
-    print(results)
+    results, duration = translator.predict(es)
+    print("Spanish: ", es.lower())
+    print("English: ", en.lower())
+    print("English pred:", results)
+    print(duration)
     print()
```
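
Since `predict` now returns a `(translation, seconds)` tuple rather than a bare string, callers must unpack both values. A minimal usage sketch (the input sentence is illustrative; the model path is the one from this diff):

```python
translator = PtEnTranslator("Models/09_translation_transformer/202308241514/model.onnx")
translation, duration = translator.predict("¿Dónde está la biblioteca?")  # illustrative input
print(f"{translation} ({duration:.3f}s)")
```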

Tutorials/09_translation_transformer/train.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -123,12 +123,15 @@ def preprocess_inputs(data_batch, label_batch):
 model2onnx = Model2onnx(f"{configs.model_path}/model.h5", metadata={"tokenizer": tokenizer.dict(), "detokenizer": detokenizer.dict()}, save_on_epoch_end=False)
 encDecSplitCallback = EncDecSplitCallback(configs.model_path, encoder_metadata={"tokenizer": tokenizer.dict()}, decoder_metadata={"detokenizer": detokenizer.dict()})
 
+configs.save()
+
 # Train the model
 transformer.fit(
     train_dataProvider,
     validation_data=val_dataProvider,
     epochs=configs.train_epochs,
     callbacks=[
+        earlystopper,
         warmupCosineDecay,
         checkpoint,
         tb_callback,
```
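
The `earlystopper` added to the callbacks list is constructed elsewhere in train.py; its definition is not part of this hunk. Since this tutorial trains a Keras model, it is presumably a standard Keras `EarlyStopping`; a plausible sketch in which the monitored metric and patience are assumptions, not taken from the diff:

```python
from keras.callbacks import EarlyStopping

# Assumed construction; the real arguments live outside this hunk.
earlystopper = EarlyStopping(monitor="val_masked_accuracy", patience=5, mode="max", verbose=1)
```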
Tutorials/10_wav2vec2_torch/configs.py

Lines changed: 28 additions & 0 deletions
```python
import os
from datetime import datetime

from mltu.configs import BaseModelConfigs

class ModelConfigs(BaseModelConfigs):
    def __init__(self):
        super().__init__()
        self.model_path = os.path.join(
            "Models/10_wav2vec2_torch",
            datetime.strftime(datetime.now(), "%Y%m%d%H%M"),
        )
        self.batch_size = 8
        self.train_epochs = 60
        self.train_workers = 20

        self.init_lr = 1.0e-8
        self.lr_after_warmup = 1e-05
        self.final_lr = 5e-06
        self.warmup_epochs = 10
        self.decay_epochs = 40
        self.weight_decay = 0.005
        self.mixed_precision = True

        self.max_audio_length = 246000
        self.max_label_length = 256

        self.vocab = [' ', "'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
```
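
`BaseModelConfigs` gives this class serialization support: `configs.save()` (called from train.py below) persists the settings under `model_path`, and they can be restored later for inference. A short sketch, assuming the YAML-based `save`/`load` pattern used in other mltu tutorials (the restore path is illustrative):

```python
from mltu.configs import BaseModelConfigs
from configs import ModelConfigs

configs = ModelConfigs()
configs.save()  # persists the settings under configs.model_path

# Later, e.g. at inference time (illustrative path):
restored = BaseModelConfigs.load("Models/10_wav2vec2_torch/202309171434/configs.yaml")
print(restored.vocab)
```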
Lines changed: 3 additions & 0 deletions
```
torch==1.13.1+cu117
transformers==4.33.1
onnx
```
Tutorials/10_wav2vec2_torch/test.py

Lines changed: 43 additions & 0 deletions
```python
import numpy as np

from mltu.inferenceModel import OnnxInferenceModel
from mltu.utils.text_utils import ctc_decoder, get_cer, get_wer

class Wav2vec2(OnnxInferenceModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def predict(self, audio: np.ndarray):

        audio = np.expand_dims(audio, axis=0).astype(np.float32)

        preds = self.model.run(None, {self.input_name: audio})[0]

        text = ctc_decoder(preds, self.metadata["vocab"])[0]

        return text

if __name__ == "__main__":
    import librosa
    import pandas as pd
    from tqdm import tqdm

    model = Wav2vec2(model_path="Models/10_wav2vec2_torch/202309171434/model.onnx")

    # The list of multiple [audio_path, label] for validation
    val_dataset = pd.read_csv("Models/10_wav2vec2_torch/202309171434/val.csv").values.tolist()

    accum_cer, accum_wer = [], []
    pbar = tqdm(val_dataset)
    for vaw_path, label in pbar:
        audio, sr = librosa.load(vaw_path, sr=16000)

        prediction_text = model.predict(audio)

        cer = get_cer(prediction_text, label)
        wer = get_wer(prediction_text, label)

        accum_cer.append(cer)
        accum_wer.append(wer)

        pbar.set_description(f"Average CER: {np.average(accum_cer):.4f}, Average WER: {np.average(accum_wer):.4f}")
```
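
Beyond the CSV-driven evaluation loop, the same class transcribes a single recording; a minimal usage sketch (the wav path is illustrative):

```python
import librosa

model = Wav2vec2(model_path="Models/10_wav2vec2_torch/202309171434/model.onnx")
audio, sr = librosa.load("Datasets/LJSpeech-1.1/wavs/LJ001-0001.wav", sr=16000)
print(model.predict(audio))
```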
Tutorials/10_wav2vec2_torch/train.py

Lines changed: 157 additions & 0 deletions
```python
import os
import tarfile
import pandas as pd
from tqdm import tqdm
from io import BytesIO
from urllib.request import urlopen

import torch
from torch import nn
from transformers import Wav2Vec2ForCTC
import torch.nn.functional as F

from mltu.torch.model import Model
from mltu.torch.losses import CTCLoss
from mltu.torch.dataProvider import DataProvider
from mltu.torch.metrics import CERMetric, WERMetric
from mltu.torch.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, Model2onnx, WarmupCosineDecay
from mltu.augmentors import RandomAudioNoise, RandomAudioPitchShift, RandomAudioTimeStretch

from mltu.preprocessors import AudioReader
from mltu.transformers import LabelIndexer, LabelPadding, AudioPadding

from configs import ModelConfigs

configs = ModelConfigs()


def download_and_unzip(url, extract_to="Datasets", chunk_size=1024*1024):
    http_response = urlopen(url)

    data = b""
    iterations = http_response.length // chunk_size + 1
    for _ in tqdm(range(iterations)):
        data += http_response.read(chunk_size)

    tarFile = tarfile.open(fileobj=BytesIO(data), mode="r|bz2")
    tarFile.extractall(path=extract_to)
    tarFile.close()


dataset_path = os.path.join("Datasets", "LJSpeech-1.1")
if not os.path.exists(dataset_path):
    download_and_unzip("https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2", extract_to="Datasets")

dataset_path = "Datasets/LJSpeech-1.1"
metadata_path = dataset_path + "/metadata.csv"
wavs_path = dataset_path + "/wavs/"

# Read metadata file and parse it
metadata_df = pd.read_csv(metadata_path, sep="|", header=None, quoting=3)
dataset = []
vocab = [' ', "'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
for file_name, transcription, normalized_transcription in metadata_df.values.tolist():
    path = f"Datasets/LJSpeech-1.1/wavs/{file_name}.wav"
    new_label = "".join([l for l in normalized_transcription.lower() if l in vocab])
    dataset.append([path, new_label])

# Create a data provider for the dataset
data_provider = DataProvider(
    dataset=dataset,
    skip_validation=True,
    batch_size=configs.batch_size,
    data_preprocessors=[
        AudioReader(sample_rate=16000),
    ],
    transformers=[
        LabelIndexer(vocab),
        LabelPadding(max_word_length=configs.max_label_length, padding_value=len(vocab)),
    ],
    use_cache=False,
    batch_postprocessors=[
        AudioPadding(max_audio_length=configs.max_audio_length, padding_value=0, use_on_batch=True)
    ],
    use_multiprocessing=True,
    max_queue_size=10,
    workers=64,
)
train_dataProvider, test_dataProvider = data_provider.split(split=0.9)

# train_dataProvider.augmentors = [
#     RandomAudioNoise(),
#     RandomAudioPitchShift(),
#     RandomAudioTimeStretch()
#     ]

vocab = sorted(vocab)
configs.vocab = vocab
configs.save()


class CustomWav2Vec2Model(nn.Module):
    def __init__(self, hidden_states, dropout_rate=0.2, **kwargs):
        super(CustomWav2Vec2Model, self).__init__(**kwargs)
        pretrained_name = "facebook/wav2vec2-base-960h"
        self.model = Wav2Vec2ForCTC.from_pretrained(pretrained_name, vocab_size=hidden_states, ignore_mismatched_sizes=True)
        self.model.freeze_feature_encoder()  # this part does not need to be fine-tuned

    def forward(self, inputs):
        output = self.model(inputs, attention_mask=None).logits
        # Apply log softmax to get per-frame log-probabilities for CTC
        output = F.log_softmax(output, -1)
        return output

custom_model = CustomWav2Vec2Model(hidden_states=len(vocab)+1)

# put on cuda device if available
if torch.cuda.is_available():
    custom_model = custom_model.cuda()

# create callbacks
warmupCosineDecay = WarmupCosineDecay(
    lr_after_warmup=configs.lr_after_warmup,
    warmup_epochs=configs.warmup_epochs,
    decay_epochs=configs.decay_epochs,
    final_lr=configs.final_lr,
    initial_lr=configs.init_lr,
    verbose=True,
)
tb_callback = TensorBoard(configs.model_path + "/logs")
earlyStopping = EarlyStopping(monitor="val_CER", patience=16, mode="min", verbose=1)
modelCheckpoint = ModelCheckpoint(configs.model_path + "/model.pt", monitor="val_CER", mode="min", save_best_only=True, verbose=1)
model2onnx = Model2onnx(
    saved_model_path=configs.model_path + "/model.pt",
    input_shape=(1, configs.max_audio_length),
    verbose=1,
    metadata={"vocab": configs.vocab},
    dynamic_axes={"input": {0: "batch_size", 1: "sequence_length"}, "output": {0: "batch_size", 1: "sequence_length"}}
)

# create model object that will handle training and testing of the network
model = Model(
    custom_model,
    loss=CTCLoss(blank=len(configs.vocab), zero_infinity=True),
    optimizer=torch.optim.AdamW(custom_model.parameters(), lr=configs.init_lr, weight_decay=configs.weight_decay),
    metrics=[
        CERMetric(configs.vocab),
        WERMetric(configs.vocab)
    ],
    mixed_precision=configs.mixed_precision,
)

# Save training and validation datasets as csv files
train_dataProvider.to_csv(os.path.join(configs.model_path, "train.csv"))
test_dataProvider.to_csv(os.path.join(configs.model_path, "val.csv"))

model.fit(
    train_dataProvider,
    test_dataProvider,
    epochs=configs.train_epochs,
    callbacks=[
        warmupCosineDecay,
        tb_callback,
        earlyStopping,
        modelCheckpoint,
        model2onnx
    ]
)
```
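
One detail worth spelling out is the index bookkeeping: characters occupy indices `0..len(vocab)-1`, `LabelPadding` pads labels with `len(vocab)`, and the same index doubles as the CTC blank (`CTCLoss(blank=len(configs.vocab))`), which is why the model head gets `len(vocab)+1` outputs. A rough numpy sketch of greedy CTC decoding under this convention (an illustration, not mltu's `ctc_decoder`):

```python
import numpy as np

def greedy_ctc_decode(log_probs, vocab):
    # log_probs: (time, len(vocab)+1) per-frame log-probabilities.
    # Greedy CTC: take the best index per frame, collapse repeats,
    # then drop the blank (index len(vocab)) to recover the text.
    blank = len(vocab)
    best = np.argmax(log_probs, axis=-1)
    collapsed = [k for i, k in enumerate(best) if i == 0 or k != best[i - 1]]
    return "".join(vocab[k] for k in collapsed if k != blank)
```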
