11import os
22import copy
33import typing
4- # import librosa
54import numpy as np
65import pandas as pd
76from tqdm import tqdm
87import tensorflow as tf
9- # from scipy import signal
10- # from scipy.io import wavfile
11-
12- from tensorflow .keras .preprocessing .sequence import pad_sequences
138
149import logging
# Configure the root logger so records show timestamp, level, logger name and message.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)
@@ -151,6 +146,7 @@ def __getitem__(self, index: int):
151146 for preprocessor in self ._data_preprocessors :
152147 data , annotation = preprocessor (data , annotation )
153148
149+ # If data is None, remove it from the dataset
154150 if data is None :
155151 self ._dataset .remove (dataset_batch [index ])
156152 continue
@@ -168,89 +164,4 @@ def __getitem__(self, index: int):
168164 for transformer in self ._transformers :
169165 batch_data , batch_annotations = zip (* [transformer (data , annotation ) for data , annotation in zip (batch_data , batch_annotations )])
170166
171- return np .array (batch_data ), np .array (batch_annotations )
172-
class SoundDataProvider(DataProvider):
    """Data provider that yields batches of normalised spectrograms and encoded labels.

    Each annotation returned by ``get_batch_annotations`` is unpacked as a
    ``(file_path, label)`` pair. The wav file is decoded and converted to a
    magnitude spectrogram; the label is lower-cased, split into characters and
    mapped to integer ids. Both are post-padded to the longest item in the batch.
    """

    def __init__(
        self,
        vocab: typing.Optional[typing.List[str]] = None,
        *args,
        frame_length: int = 256,
        frame_step: int = 160,
        fft_length: int = 384,
        **kwargs
    ) -> None:
        """Build the character lookup layers and store the STFT settings.

        Args:
            vocab: Characters used to build the ``StringLookup`` layers.
            *args: Positional arguments forwarded to the parent class.
            frame_length: STFT window length in samples.
            frame_step: Number of samples to step between STFT windows.
            fft_length: Size of the FFT to apply.
            **kwargs: Keyword arguments forwarded to the parent class.
        """
        # Inherit all remaining arguments from the parent class
        super().__init__(*args, **kwargs)
        self.vocab = vocab

        # Mapping characters to integers
        self.char_to_num = tf.keras.layers.StringLookup(vocabulary=self.vocab, oov_token="")
        # Mapping integers back to original characters
        self.num_to_char = tf.keras.layers.StringLookup(
            vocabulary=self.char_to_num.get_vocabulary(), oov_token="", invert=True
        )

        # STFT parameters, previously hard-coded to 256 / 160 / 384
        self.frame_length = frame_length
        self.frame_step = frame_step
        self.fft_length = fft_length

    def _load_spectrogram(self, file_path):
        """Read a wav file and return its normalised magnitude spectrogram tensor."""
        # 1. Read and decode the wav file
        raw_file = tf.io.read_file(file_path)
        audio, _ = tf.audio.decode_wav(raw_file)
        # Drop the trailing channel axis -- assumes single-channel audio, TODO confirm
        audio = tf.squeeze(audio, axis=-1)
        # 2. Change type to float
        audio = tf.cast(audio, tf.float32)
        # 3. Get the spectrogram
        spectrogram = tf.signal.stft(
            audio,
            frame_length=self.frame_length,
            frame_step=self.frame_step,
            fft_length=self.fft_length,
        )
        # 4. We only need the magnitude; the square root then compresses its range
        spectrogram = tf.abs(spectrogram)
        spectrogram = tf.math.pow(spectrogram, 0.5)
        # 5. Normalise along axis 1; the epsilon guards against division by zero
        means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
        stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
        return (spectrogram - means) / (stddevs + 1e-10)

    def _encode_label(self, label):
        """Lower-case a label, split it into characters and map them to integer ids."""
        label = tf.strings.lower(label)
        label = tf.strings.unicode_split(label, input_encoding="UTF-8")
        return self.char_to_num(label)

    def __getitem__(self, index: int) -> typing.Tuple[np.ndarray, np.ndarray]:
        """Return the batch at ``index`` as ``(padded_spectrograms, padded_labels)`` arrays."""
        batch_annotations = self.get_batch_annotations(index)

        spectrograms, labels = [], []
        for file_path, label in batch_annotations:
            spectrograms.append(self._load_spectrogram(file_path).numpy())
            labels.append(self._encode_label(label).numpy())

        # Pad every item at the end up to the longest item in this batch
        padded_data = pad_sequences(
            spectrograms,
            maxlen=max(len(item) for item in spectrograms),
            padding='post',
            value=0,
            dtype='float32',
        )
        # NOTE(review): labels are padded with len(self.vocab). StringLookup presumably
        # reserves index 0 for the OOV token, which would give the last vocabulary
        # character the same id as the padding value -- confirm against the
        # loss/metric padding token before changing it.
        padded_labels = pad_sequences(
            labels,
            maxlen=max(len(item) for item in labels),
            padding='post',
            value=len(self.vocab),
        )

        if self._transformers:
            for transformer in self._transformers:
                padded_data, padded_labels = zip(*[
                    transformer(item, item_label)
                    for item, item_label in zip(padded_data, padded_labels)
                ])

        return np.array(padded_data), np.array(padded_labels)
167+ return np .array (batch_data ), np .array (batch_annotations )
0 commit comments