added doc & predict_proba for deep version

x4nth055 · x4nth055 · commit 047fcc9a59e6 · 2019-08-22T12:18:13.000+02:00
diff --git a/README.md b/README.md
@@ -16,7 +16,7 @@
 - **tqdm==4.28.1**
 - **matplotlib==2.2.3**
 - **pyaudio==0.2.11**
-- **[ffmpeg](https://ffmpeg.org/) (optional)**: used to add more sample audio by converting to 16000Hz sample rate and mono channel which is provided in ``convert_wavs.py``
+- **[ffmpeg](https://ffmpeg.org/) (optional)**: only needed if you want to add more sample audio; the conversion to a 16000Hz sample rate and mono channel is provided in ``convert_wavs.py``
 
 Install these libraries by the following command:
 ```
@@ -122,6 +122,14 @@ print(f"Prediction: {prediction}")
 0.7948717948717948
 Prediction: angry
 ```
+Predicting probabilities is also possible (for classification only):
+```python
+print(deeprec.predict_proba("data/emodb/wav/15b09Fa.wav"))
+```
+**Output:**
+```
+{'sad': 0.0005244638, 'neutral': 0.0016906325, 'happy': 0.9977849}
+```
 ### Confusion Matrix
 ```python
 print(deeprec.confusion_matrix(percentage=True, labeled=True))
diff --git a/create_csv.py b/create_csv.py
@@ -38,6 +38,8 @@ def write_emodb_csv(emotions=["sad", "neutral", "happy"], train_name="train_emo.
         target['path'].append(file)
     if verbose:
         print("[EMO-DB] Total files to write:", len(target['path']))
+        
+    # dividing training/testing sets
     n_samples = len(target['path'])
     test_size = int((1-train_size) * n_samples)
     train_size = int(train_size * n_samples)
@@ -100,18 +102,26 @@ def write_custom_csv(emotions=['sad', 'neutral', 'happy'], train_name="train_cus
         for i, file in enumerate(glob.glob(f"data/train-custom/*_{category}.wav")):
             train_target["path"].append(file)
             train_target["emotion"].append(category)
-        # if verbose:
-        #     print(f"[Custom Dataset] There are {i} training audio files for category:{category}")
+        if verbose:
+            try:
+                print(f"[Custom Dataset] There are {i} training audio files for category:{category}")
+            except NameError:
+                # in case {i} doesn't exist
+                pass
         
         # test data
         for i, file in enumerate(glob.glob(f"data/test-custom/*_{category}.wav")):
             test_target["path"].append(file)
             test_target["emotion"].append(category)
-        # if verbose:
-        #     print(f"[Custom Dataset] There are {i} testing audio files for category:{category}")
-        
+        if verbose:
+            try:
+                print(f"[Custom Dataset] There are {i} testing audio files for category:{category}")
+            except NameError:
+                pass
+    
+    # write CSVs
     if train_target["path"]:
         pd.DataFrame(train_target).to_csv(train_name)
 
     if test_target["path"]:
-            pd.DataFrame(test_target).to_csv(test_name)
+        pd.DataFrame(test_target).to_csv(test_name)
diff --git a/deep_emotion_recognition.py b/deep_emotion_recognition.py
@@ -40,44 +40,46 @@ class DeepEmotionRecognizer(EmotionRecognizer):
     """
     The Deep Learning version of the Emotion Recognizer.
     This class uses RNN (LSTM, GRU, etc.) and Dense layers.
+    #TODO add CNNs
     """
     def __init__(self, **kwargs):
         """
         params:
             emotions (list): list of emotions to be used. Note that these emotions must be available in
                 RAVDESS_TESS & EMODB Datasets, available nine emotions are the following:
                     'neutral', 'calm', 'happy', 'sad', 'angry', 'fear', 'disgust', 'ps' ( pleasant surprised ), 'boredom'.
-            tess_ravdess (bool): whether to use TESS & RAVDESS Speech datasets, default is True
-            emodb (bool): whether to use EMO-DB Speech dataset, default is True,
+                Default is ["sad", "neutral", "happy"].
+            tess_ravdess (bool): whether to use TESS & RAVDESS Speech datasets, default is True.
+            emodb (bool): whether to use EMO-DB Speech dataset, default is True.
             custom_db (bool): whether to use custom Speech dataset that is located in `data/train-custom`
-                and `data/test-custom`, default is True
-            tess_ravdess_name (str): the name of the output CSV file for TESS&RAVDESS dataset, default is "tess_ravdess.csv"
-            emodb_name (str): the name of the output CSV file for EMO-DB dataset, default is "emodb.csv"
-            custom_db_name (str): the name of the output CSV file for the custom dataset, default is "custom.csv"
+                and `data/test-custom`, default is True.
+            tess_ravdess_name (str): the name of the output CSV file for TESS&RAVDESS dataset, default is "tess_ravdess.csv".
+            emodb_name (str): the name of the output CSV file for EMO-DB dataset, default is "emodb.csv".
+            custom_db_name (str): the name of the output CSV file for the custom dataset, default is "custom.csv".
             features (list): list of speech features to use, default is ["mfcc", "chroma", "mel"]
-                (i.e MFCC, Chroma and MEL spectrogram )
-            classification (bool): whether to use classification or regression, default is True
-            balance (bool): whether to balance the dataset ( both training and testing ), default is True
-            verbose (bool/int): whether to print messages on certain tasks
+                (i.e MFCC, Chroma and MEL spectrogram ).
+            classification (bool): whether to use classification or regression, default is True.
+            balance (bool): whether to balance the dataset ( both training and testing ), default is True.
+            verbose (bool/int): whether to print messages on certain tasks.
             ==========================================================
             Model params
-            n_rnn_layers (int): number of RNN layers, default is 2
-            cell (keras.layers.RNN instance): RNN cell used to train the model, default is LSTM
-            rnn_units (int): number of units of `cell`, default is 128
-            n_dense_layers (int): number of Dense layers, default is 2
-            dense_units (int): number of units of the Dense layers, default is 128
+            n_rnn_layers (int): number of RNN layers, default is 2.
+            cell (keras.layers.RNN instance): RNN cell used to train the model, default is LSTM.
+            rnn_units (int): number of units of `cell`, default is 128.
+            n_dense_layers (int): number of Dense layers, default is 2.
+            dense_units (int): number of units of the Dense layers, default is 128.
             dropout (list/float): dropout rate,
-                - if list, it indicates the dropout rate of each layer
-                - if float, it indicates the dropout rate for all layers
-                default is 0.3
+                - if list, it indicates the dropout rate of each layer.
+                - if float, it indicates the dropout rate for all layers.
+                Default is 0.3.
             ==========================================================
             Training params
-            batch_size (int): number of samples per gradient update, default is 64
-            epochs (int): number of epochs, default is 1000
-            optimizer (str/keras.optimizers.Optimizer instance): optimizer used to train, default is "adam"
-            loss (str, callback from keras.losses): loss function that is used to minimize during training,
+            batch_size (int): number of samples per gradient update, default is 64.
+            epochs (int): number of epochs, default is 1000.
+            optimizer (str/keras.optimizers.Optimizer instance): optimizer used to train, default is "adam".
+            loss (str/callback from keras.losses): loss function that is used to minimize during training,
                 default is "categorical_crossentropy" for classification and "mean_squared_error" for 
-                regression
+                regression.
         """
         # init EmotionRecognizer
         super().__init__(None, **kwargs)
@@ -117,6 +119,12 @@ def __init__(self, **kwargs):
         self.model_created = False
 
     def _update_model_name(self):
+        """
+        Generates a unique model name based on parameters passed and put it on `self.model_name`.
+        This is used when saving the model.
+        """
+        # get first letters of emotions, for instance:
+        # ["sad", "neutral", "happy"] => 'HNS' (sorted alphabetically)
         emotions_str = get_first_letters(self.emotions)
         # 'c' for classification & 'r' for regression
         problem_type = 'c' if self.classification else 'r'
@@ -128,15 +136,19 @@ def _get_model_filename(self):
         return f"results/{self.model_name}"
 
     def _model_exists(self):
-        """Checks if model already exists in disk, returns the filename,
-        returns `None` otherwise"""
+        """
+        Checks if model already exists in disk, returns the filename,
+        and returns `None` otherwise.
+        """
         filename = self._get_model_filename()
         return filename if os.path.isfile(filename) else None
 
     def _compute_input_length(self):
+        """
+        Calculates the input shape to be able to construct the model.
+        """
         if not self.data_loaded:
             self.load_data()
-
         self.input_length = self.X_train[0].shape[1]
 
     def _verify_emotions(self):
@@ -146,9 +158,8 @@ def _verify_emotions(self):
 
     def create_model(self):
         """
-        Constructs the neural network
+        Constructs the neural network based on parameters passed.
         """
-
         if self.model_created:
             # model already created, why call twice
             return
@@ -196,17 +207,23 @@ def create_model(self):
             print("[+] Model created")
 
     def load_data(self):
+        """
+        Loads and extracts features from the audio files for the db's specified.
+        And then reshapes the data.
+        """
         super().load_data()
-        # reshape to 3 dims
+        # reshape X's to 3 dims
         X_train_shape = self.X_train.shape
         X_test_shape = self.X_test.shape
         self.X_train = self.X_train.reshape((1, X_train_shape[0], X_train_shape[1]))
         self.X_test = self.X_test.reshape((1, X_test_shape[0], X_test_shape[1]))
 
         if self.classification:
+            # one-hot encode when its classification
             self.y_train = to_categorical([ self.emotions2int[str(e)] for e in self.y_train ])
             self.y_test = to_categorical([ self.emotions2int[str(e)] for e in self.y_test ])
         
+        # reshape labels
         y_train_shape = self.y_train.shape
         y_test_shape = self.y_test.shape
         if self.classification:
@@ -217,7 +234,12 @@ def load_data(self):
             self.y_test = self.y_test.reshape((1, y_test_shape[0], 1))
 
     def train(self, override=False):
-        
+        """
+        Trains the neural network.
+        Params:
+            override (bool): whether to override the previous identical model, can be used
+                when you changed the dataset, default is False
+        """
         # if model isn't created yet, create it
         if not self.model_created:
             self.create_model()
@@ -262,6 +284,19 @@ def predict(self, audio_path):
         else:
             return self.model.predict(feature)[0][0][0]
 
+    def predict_proba(self, audio_path):
+        if self.classification:
+            feature = extract_feature(audio_path, **self.audio_config).reshape((1, 1, self.input_length))
+            proba = self.model.predict(feature)[0][0]
+            result = {}
+            for prob, emotion in zip(proba, self.emotions):
+                result[emotion] = prob
+            return result
+        else:
+            raise NotImplementedError("Probability prediction doesn't make sense for regression")
+
+
+
     def test_score(self):
         y_test = self.y_test[0]
         if self.classification:
diff --git a/emotion_recognition.py b/emotion_recognition.py
@@ -26,6 +26,7 @@ def __init__(self, model, **kwargs):
             emotions (list): list of emotions to be used. Note that these emotions must be available in
                 RAVDESS_TESS & EMODB Datasets, available nine emotions are the following:
                     'neutral', 'calm', 'happy', 'sad', 'angry', 'fear', 'disgust', 'ps' ( pleasant surprised ), 'boredom'.
+                Default is ["sad", "neutral", "happy"].
             tess_ravdess (bool): whether to use TESS & RAVDESS Speech datasets, default is True
             emodb (bool): whether to use EMO-DB Speech dataset, default is True,
             custom_db (bool): whether to use custom Speech dataset that is located in `data/train-custom`
@@ -79,7 +80,11 @@ def __init__(self, model, **kwargs):
         self.model_trained = False
 
     def _set_metadata_filenames(self):
-        # get first letters of selected emotions
+        """
+        Protected method to get all CSV (metadata) filenames into two instance attributes:
+        - `self.train_desc_files` for training CSVs
+        - `self.test_desc_files` for testing CSVs
+        """
         train_desc_files, test_desc_files = [], []
         if self.tess_ravdess:
             train_desc_files.append(f"train_{self.tess_ravdess_name}")
@@ -91,7 +96,7 @@ def _set_metadata_filenames(self):
             train_desc_files.append(f"train_{self.custom_db_name}")
             test_desc_files.append(f"test_{self.custom_db_name}")
 
-        # set them to be class attributes
+        # set them to be object attributes
         self.train_desc_files = train_desc_files
         self.test_desc_files  = test_desc_files
 
@@ -169,13 +174,18 @@ def predict(self, audio_path):
         return self.model.predict(feature)[0]
 
     def predict_proba(self, audio_path):
-        """"""
-        feature = extract_feature(audio_path, **self.audio_config).reshape(1, -1)
-        proba = self.model.predict_proba(feature)[0]
-        result = {}
-        for emotion, prob in zip(self.emotions, proba):
-            result[emotion] = prob
-        return result
+        """
+        Predicts the probability of each emotion.
+        """
+        if self.classification:
+            feature = extract_feature(audio_path, **self.audio_config).reshape(1, -1)
+            proba = self.model.predict_proba(feature)[0]
+            result = {}
+            for emotion, prob in zip(self.emotions, proba):
+                result[emotion] = prob
+            return result
+        else:
+            raise NotImplementedError("Probability prediction doesn't make sense for regression")
 
     def grid_search(self, params, n_jobs=2):
         """
@@ -193,7 +203,7 @@ def determine_best_model(self, train=True):
         Loads best estimators and determine which is best for test data,
         and then set it to `self.model`.
         if `train` is True, then train that model on train data, so the model
-        will be ready for testing/predicting.
+        will be ready for inference.
         In case of regression, the metric used is MSE and accuracy for classification.
         Note that the execution of this method may take several minutes due
         to training all estimators (stored in `grid` folder) for determining the best possible one.
@@ -277,11 +287,12 @@ def test_fbeta_score(self, beta):
         return fbeta_score(self.y_test, y_pred, beta, average='micro')
 
     def confusion_matrix(self, percentage=True, labeled=True):
-        """Compute confusion matrix to evaluate the test accuracy of the classification
-        and returns it as numpy matrix or pandas dataframe (depends on params)
+        """
+        Computes confusion matrix to evaluate the test accuracy of the classification
+        and returns it as numpy matrix or pandas dataframe (depends on params).
         params:
-            percentage (bool): whether to use percentage instead of number of samples, default is True
-            labeled (bool): whether to label the columns and indexes in the dataframe
+            percentage (bool): whether to use percentage instead of number of samples, default is True.
+            labeled (bool): whether to label the columns and indexes in the dataframe.
         """
         if not self.classification:
             raise NotImplementedError("Confusion matrix works only when it is a classification problem")
@@ -339,7 +350,7 @@ def get_samples_by_class(self):
 
     def get_random_emotion(self, emotion, partition="train"):
         """
-        Returns random `emotion` data sample index on `partition`
+        Returns random `emotion` data sample index on `partition`.
         """
         if partition == "train":
             index = random.choice(list(range(len(self.y_train))))