Skip to content

Commit 047fcc9

Browse files
committed
added doc & predict_proba for deep version
1 parent c9e2f2e commit 047fcc9

File tree

4 files changed

+116
-52
lines changed

4 files changed

+116
-52
lines changed

README.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
- **tqdm==4.28.1**
1717
- **matplotlib==2.2.3**
1818
- **pyaudio==0.2.11**
19-
- **[ffmpeg](https://ffmpeg.org/) (optional)**: used to add more sample audio by converting to 16000Hz sample rate and mono channel which is provided in ``convert_wavs.py``
19+
- **[ffmpeg](https://ffmpeg.org/) (optional)**: used only if you want to add more sample audio, by converting it to a 16000Hz sample rate and a single (mono) channel; the conversion is provided in ``convert_wavs.py``
2020

2121
Install these libraries by the following command:
2222
```
@@ -122,6 +122,14 @@ print(f"Prediction: {prediction}")
122122
0.7948717948717948
123123
Prediction: angry
124124
```
125+
Predicting probabilities is also possible (only for classification, of course):
126+
```python
127+
print(deeprec.predict_proba("data/emodb/wav/15b09Fa.wav"))
128+
```
129+
**Output:**
130+
```
131+
{'sad': 0.0005244638, 'neutral': 0.0016906325, 'happy': 0.9977849}
132+
```
125133
### Confusion Matrix
126134
```python
127135
print(deeprec.confusion_matrix(percentage=True, labeled=True))

create_csv.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ def write_emodb_csv(emotions=["sad", "neutral", "happy"], train_name="train_emo.
3838
target['path'].append(file)
3939
if verbose:
4040
print("[EMO-DB] Total files to write:", len(target['path']))
41+
42+
# dividing training/testing sets
4143
n_samples = len(target['path'])
4244
test_size = int((1-train_size) * n_samples)
4345
train_size = int(train_size * n_samples)
@@ -100,18 +102,26 @@ def write_custom_csv(emotions=['sad', 'neutral', 'happy'], train_name="train_cus
100102
for i, file in enumerate(glob.glob(f"data/train-custom/*_{category}.wav")):
101103
train_target["path"].append(file)
102104
train_target["emotion"].append(category)
103-
# if verbose:
104-
# print(f"[Custom Dataset] There are {i} training audio files for category:{category}")
105+
if verbose:
106+
try:
107+
print(f"[Custom Dataset] There are {i} training audio files for category:{category}")
108+
except NameError:
109+
# `i` is unbound if the glob matched no files and the loop never ran
110+
pass
105111

106112
# test data
107113
for i, file in enumerate(glob.glob(f"data/test-custom/*_{category}.wav")):
108114
test_target["path"].append(file)
109115
test_target["emotion"].append(category)
110-
# if verbose:
111-
# print(f"[Custom Dataset] There are {i} testing audio files for category:{category}")
112-
116+
if verbose:
117+
try:
118+
print(f"[Custom Dataset] There are {i} testing audio files for category:{category}")
119+
except NameError:
120+
pass
121+
122+
# write CSVs
113123
if train_target["path"]:
114124
pd.DataFrame(train_target).to_csv(train_name)
115125

116126
if test_target["path"]:
117-
pd.DataFrame(test_target).to_csv(test_name)
127+
pd.DataFrame(test_target).to_csv(test_name)

deep_emotion_recognition.py

Lines changed: 65 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -40,44 +40,46 @@ class DeepEmotionRecognizer(EmotionRecognizer):
4040
"""
4141
The Deep Learning version of the Emotion Recognizer.
4242
This class uses RNN (LSTM, GRU, etc.) and Dense layers.
43+
#TODO add CNNs
4344
"""
4445
def __init__(self, **kwargs):
4546
"""
4647
params:
4748
emotions (list): list of emotions to be used. Note that these emotions must be available in
4849
RAVDESS_TESS & EMODB Datasets, available nine emotions are the following:
4950
'neutral', 'calm', 'happy', 'sad', 'angry', 'fear', 'disgust', 'ps' ( pleasant surprised ), 'boredom'.
50-
tess_ravdess (bool): whether to use TESS & RAVDESS Speech datasets, default is True
51-
emodb (bool): whether to use EMO-DB Speech dataset, default is True,
51+
Default is ["sad", "neutral", "happy"].
52+
tess_ravdess (bool): whether to use TESS & RAVDESS Speech datasets, default is True.
53+
emodb (bool): whether to use EMO-DB Speech dataset, default is True.
5254
custom_db (bool): whether to use custom Speech dataset that is located in `data/train-custom`
53-
and `data/test-custom`, default is True
54-
tess_ravdess_name (str): the name of the output CSV file for TESS&RAVDESS dataset, default is "tess_ravdess.csv"
55-
emodb_name (str): the name of the output CSV file for EMO-DB dataset, default is "emodb.csv"
56-
custom_db_name (str): the name of the output CSV file for the custom dataset, default is "custom.csv"
55+
and `data/test-custom`, default is True.
56+
tess_ravdess_name (str): the name of the output CSV file for TESS&RAVDESS dataset, default is "tess_ravdess.csv".
57+
emodb_name (str): the name of the output CSV file for EMO-DB dataset, default is "emodb.csv".
58+
custom_db_name (str): the name of the output CSV file for the custom dataset, default is "custom.csv".
5759
features (list): list of speech features to use, default is ["mfcc", "chroma", "mel"]
58-
(i.e MFCC, Chroma and MEL spectrogram )
59-
classification (bool): whether to use classification or regression, default is True
60-
balance (bool): whether to balance the dataset ( both training and testing ), default is True
61-
verbose (bool/int): whether to print messages on certain tasks
60+
(i.e MFCC, Chroma and MEL spectrogram ).
61+
classification (bool): whether to use classification or regression, default is True.
62+
balance (bool): whether to balance the dataset ( both training and testing ), default is True.
63+
verbose (bool/int): whether to print messages on certain tasks.
6264
==========================================================
6365
Model params
64-
n_rnn_layers (int): number of RNN layers, default is 2
65-
cell (keras.layers.RNN instance): RNN cell used to train the model, default is LSTM
66-
rnn_units (int): number of units of `cell`, default is 128
67-
n_dense_layers (int): number of Dense layers, default is 2
68-
dense_units (int): number of units of the Dense layers, default is 128
66+
n_rnn_layers (int): number of RNN layers, default is 2.
67+
cell (keras.layers.RNN instance): RNN cell used to train the model, default is LSTM.
68+
rnn_units (int): number of units of `cell`, default is 128.
69+
n_dense_layers (int): number of Dense layers, default is 2.
70+
dense_units (int): number of units of the Dense layers, default is 128.
6971
dropout (list/float): dropout rate,
70-
- if list, it indicates the dropout rate of each layer
71-
- if float, it indicates the dropout rate for all layers
72-
default is 0.3
72+
- if list, it indicates the dropout rate of each layer.
73+
- if float, it indicates the dropout rate for all layers.
74+
Default is 0.3.
7375
==========================================================
7476
Training params
75-
batch_size (int): number of samples per gradient update, default is 64
76-
epochs (int): number of epochs, default is 1000
77-
optimizer (str/keras.optimizers.Optimizer instance): optimizer used to train, default is "adam"
78-
loss (str, callback from keras.losses): loss function that is used to minimize during training,
77+
batch_size (int): number of samples per gradient update, default is 64.
78+
epochs (int): number of epochs, default is 1000.
79+
optimizer (str/keras.optimizers.Optimizer instance): optimizer used to train, default is "adam".
80+
loss (str/callback from keras.losses): loss function that is used to minimize during training,
7981
default is "categorical_crossentropy" for classification and "mean_squared_error" for
80-
regression
82+
regression.
8183
"""
8284
# init EmotionRecognizer
8385
super().__init__(None, **kwargs)
@@ -117,6 +119,12 @@ def __init__(self, **kwargs):
117119
self.model_created = False
118120

119121
def _update_model_name(self):
122+
"""
123+
Generates a unique model name based on the parameters passed and stores it in `self.model_name`.
124+
This is used when saving the model.
125+
"""
126+
# get first letters of emotions, for instance:
127+
# ["sad", "neutral", "happy"] => 'HNS' (sorted alphabetically)
120128
emotions_str = get_first_letters(self.emotions)
121129
# 'c' for classification & 'r' for regression
122130
problem_type = 'c' if self.classification else 'r'
@@ -128,15 +136,19 @@ def _get_model_filename(self):
128136
return f"results/{self.model_name}"
129137

130138
def _model_exists(self):
131-
"""Checks if model already exists in disk, returns the filename,
132-
returns `None` otherwise"""
139+
"""
140+
Checks if the model already exists on disk and returns its filename if so,
141+
and returns `None` otherwise.
142+
"""
133143
filename = self._get_model_filename()
134144
return filename if os.path.isfile(filename) else None
135145

136146
def _compute_input_length(self):
147+
"""
148+
Calculates the input shape to be able to construct the model.
149+
"""
137150
if not self.data_loaded:
138151
self.load_data()
139-
140152
self.input_length = self.X_train[0].shape[1]
141153

142154
def _verify_emotions(self):
@@ -146,9 +158,8 @@ def _verify_emotions(self):
146158

147159
def create_model(self):
148160
"""
149-
Constructs the neural network
161+
Constructs the neural network based on parameters passed.
150162
"""
151-
152163
if self.model_created:
153164
# model already created, why call twice
154165
return
@@ -196,17 +207,23 @@ def create_model(self):
196207
print("[+] Model created")
197208

198209
def load_data(self):
210+
"""
211+
Loads and extracts features from the audio files of the specified datasets.
212+
And then reshapes the data.
213+
"""
199214
super().load_data()
200-
# reshape to 3 dims
215+
# reshape X's to 3 dims
201216
X_train_shape = self.X_train.shape
202217
X_test_shape = self.X_test.shape
203218
self.X_train = self.X_train.reshape((1, X_train_shape[0], X_train_shape[1]))
204219
self.X_test = self.X_test.reshape((1, X_test_shape[0], X_test_shape[1]))
205220

206221
if self.classification:
222+
# one-hot encode the labels when it's classification
207223
self.y_train = to_categorical([ self.emotions2int[str(e)] for e in self.y_train ])
208224
self.y_test = to_categorical([ self.emotions2int[str(e)] for e in self.y_test ])
209225

226+
# reshape labels
210227
y_train_shape = self.y_train.shape
211228
y_test_shape = self.y_test.shape
212229
if self.classification:
@@ -217,7 +234,12 @@ def load_data(self):
217234
self.y_test = self.y_test.reshape((1, y_test_shape[0], 1))
218235

219236
def train(self, override=False):
220-
237+
"""
238+
Trains the neural network.
239+
Params:
240+
override (bool): whether to override the previous identical model, can be used
241+
when you have changed the dataset, default is False.
242+
"""
221243
# if model isn't created yet, create it
222244
if not self.model_created:
223245
self.create_model()
@@ -262,6 +284,19 @@ def predict(self, audio_path):
262284
else:
263285
return self.model.predict(feature)[0][0][0]
264286

287+
def predict_proba(self, audio_path):
288+
if self.classification:
289+
feature = extract_feature(audio_path, **self.audio_config).reshape((1, 1, self.input_length))
290+
proba = self.model.predict(feature)[0][0]
291+
result = {}
292+
for prob, emotion in zip(proba, self.emotions):
293+
result[emotion] = prob
294+
return result
295+
else:
296+
raise NotImplementedError("Probability prediction doesn't make sense for regression")
297+
298+
299+
265300
def test_score(self):
266301
y_test = self.y_test[0]
267302
if self.classification:

emotion_recognition.py

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ def __init__(self, model, **kwargs):
2626
emotions (list): list of emotions to be used. Note that these emotions must be available in
2727
RAVDESS_TESS & EMODB Datasets, available nine emotions are the following:
2828
'neutral', 'calm', 'happy', 'sad', 'angry', 'fear', 'disgust', 'ps' ( pleasant surprised ), 'boredom'.
29+
Default is ["sad", "neutral", "happy"].
2930
tess_ravdess (bool): whether to use TESS & RAVDESS Speech datasets, default is True
3031
emodb (bool): whether to use EMO-DB Speech dataset, default is True,
3132
custom_db (bool): whether to use custom Speech dataset that is located in `data/train-custom`
@@ -79,7 +80,11 @@ def __init__(self, model, **kwargs):
7980
self.model_trained = False
8081

8182
def _set_metadata_filenames(self):
82-
# get first letters of selected emotions
83+
"""
84+
Protected method that collects all CSV (metadata) filenames into two instance attributes:
85+
- `self.train_desc_files` for training CSVs
86+
- `self.test_desc_files` for testing CSVs
87+
"""
8388
train_desc_files, test_desc_files = [], []
8489
if self.tess_ravdess:
8590
train_desc_files.append(f"train_{self.tess_ravdess_name}")
@@ -91,7 +96,7 @@ def _set_metadata_filenames(self):
9196
train_desc_files.append(f"train_{self.custom_db_name}")
9297
test_desc_files.append(f"test_{self.custom_db_name}")
9398

94-
# set them to be class attributes
99+
# set them to be object attributes
95100
self.train_desc_files = train_desc_files
96101
self.test_desc_files = test_desc_files
97102

@@ -169,13 +174,18 @@ def predict(self, audio_path):
169174
return self.model.predict(feature)[0]
170175

171176
def predict_proba(self, audio_path):
172-
""""""
173-
feature = extract_feature(audio_path, **self.audio_config).reshape(1, -1)
174-
proba = self.model.predict_proba(feature)[0]
175-
result = {}
176-
for emotion, prob in zip(self.emotions, proba):
177-
result[emotion] = prob
178-
return result
177+
"""
178+
Predicts the probability of each emotion.
179+
"""
180+
if self.classification:
181+
feature = extract_feature(audio_path, **self.audio_config).reshape(1, -1)
182+
proba = self.model.predict_proba(feature)[0]
183+
result = {}
184+
for emotion, prob in zip(self.emotions, proba):
185+
result[emotion] = prob
186+
return result
187+
else:
188+
raise NotImplementedError("Probability prediction doesn't make sense for regression")
179189

180190
def grid_search(self, params, n_jobs=2):
181191
"""
@@ -193,7 +203,7 @@ def determine_best_model(self, train=True):
193203
Loads the best estimators and determines which is best for the test data,
194204
and then sets it to `self.model`.
195205
if `train` is True, then train that model on train data, so the model
196-
will be ready for testing/predicting.
206+
will be ready for inference.
197207
In case of regression, the metric used is MSE and accuracy for classification.
198208
Note that the execution of this method may take several minutes due
199209
to training all estimators (stored in `grid` folder) for determining the best possible one.
@@ -277,11 +287,12 @@ def test_fbeta_score(self, beta):
277287
return fbeta_score(self.y_test, y_pred, beta, average='micro')
278288

279289
def confusion_matrix(self, percentage=True, labeled=True):
280-
"""Compute confusion matrix to evaluate the test accuracy of the classification
281-
and returns it as numpy matrix or pandas dataframe (depends on params)
290+
"""
291+
Computes confusion matrix to evaluate the test accuracy of the classification
292+
and returns it as numpy matrix or pandas dataframe (depends on params).
282293
params:
283-
percentage (bool): whether to use percentage instead of number of samples, default is True
284-
labeled (bool): whether to label the columns and indexes in the dataframe
294+
percentage (bool): whether to use percentage instead of number of samples, default is True.
295+
labeled (bool): whether to label the columns and indexes in the dataframe.
285296
"""
286297
if not self.classification:
287298
raise NotImplementedError("Confusion matrix works only when it is a classification problem")
@@ -339,7 +350,7 @@ def get_samples_by_class(self):
339350

340351
def get_random_emotion(self, emotion, partition="train"):
341352
"""
342-
Returns random `emotion` data sample index on `partition`
353+
Returns the index of a random `emotion` data sample in `partition`.
343354
"""
344355
if partition == "train":
345356
index = random.choice(list(range(len(self.y_train))))

0 commit comments

Comments
 (0)