1- import numpy as np
1+ from typing import Optional , Union
22from warnings import warn
3- from . helper import next_pow2 , signal_to_frame , round_half_up , magspec
4- from . helper import mel2hz , hz2mel , is_pow2
5- from . helper import basicplot
3+
4+ import numpy as np
5+ from pyamapping import mel_to_hz , hz_to_mel
66from scipy .signal import get_window
77from scipy .fftpack import dct
8+
9+ from .helper import next_pow2 , signal_to_frame , round_half_up , magspec
10+ from .helper import is_pow2
11+ from .helper import basicplot
812import pya .asig
913import logging
1014
@@ -64,10 +68,12 @@ class Amfcc:
6468 An array of the MFCC coefficients, size: nframes x ncep
6569 """
6670
67- def __init__ (self , x , sr = None , label = '' , n_per_frame = None ,
68- hopsize = None , nfft = None , window = 'hann' , nfilters = 26 ,
69- ncep = 13 , ceplifter = 22 , preemph = 0.95 ,
70- append_energy = True , cn = None ):
71+ def __init__ (self , x : Union [pya .Asig , np .ndarray ], sr : Optional [int ] = None ,
72+ label : str = '' , n_per_frame : Optional [int ] = None ,
73+ hopsize : Optional [int ] = None , nfft : Optional [int ] = None ,
74+ window : str = 'hann' , nfilters : int = 26 ,
75+ ncep : int = 13 , ceplifter : int = 22 , preemph : float = 0.95 ,
76+ append_energy : bool = True , cn : Optional [list ] = None ):
7177 """Initialize Amfcc object
7278
7379 Parameters
@@ -103,14 +109,14 @@ def __init__(self, x, sr=None, label='', n_per_frame=None,
103109 append_energy : bool
104110 If true, the zeroth cepstral coefficient is replaced with the log
105111 of the total frame energy.
106- cn : list
107- A list of channel name based on the Asig .
112+ cn : list or None
113+ A list of channel names; its length should match the number of channels.
108114 """
109115 # ----------Prepare attributes -------------------------
110116 # First prepare for parameters
111117 # x represent the audio signal, which can be Asig object or np.array.
112118 self .im = None
113- if type ( x ) == pya .asig .Asig :
119+ if isinstance ( x , pya .asig .Asig ) :
114120 self .sr = x .sr
115121 self .x = x .sig
116122 self .label = '' .join ([x .label , "_mfccs" ])
@@ -129,9 +135,9 @@ def __init__(self, x, sr=None, label='', n_per_frame=None,
129135 self .duration = np .shape (x )[0 ] / self .sr
130136 self .label = label
131137 self .channels = 1 if self .x .ndim == 1 else self .x .shape [1 ]
132- self .cn = None
138+ self .cn = cn
133139 else :
134- msg = "x can only be either a numpy.ndarray or pya. Asig object."
140+ msg = "x can only be either a numpy.ndarray or Asig object."
135141 raise TypeError (msg )
136142
137143 # default 25ms length window.
@@ -211,7 +217,7 @@ def __repr__(self):
211217 return f"Amfcc({ self .label } ): sr { self .sr } , length: { self .duration } s"
212218
213219 @staticmethod
214- def preemphasis (x , coeff = 0.97 ):
220+ def preemphasis (x : np . ndarray , coeff : float = 0.97 ):
215221 """Pre-emphasis filter to whiten the spectrum.
216222 Pre-emphasis is a way of compensating for the
217223 rapid decaying spectrum of speech.
@@ -233,7 +239,8 @@ def preemphasis(x, coeff=0.97):
233239 return np .append (x [0 ], x [1 :] - coeff * x [:- 1 ])
234240
235241 @staticmethod
236- def mel_filterbanks (sr , nfilters = 26 , nfft = 512 , lowfreq = 0 , highfreq = None ):
242+ def mel_filterbanks (sr : int , nfilters : int = 26 , nfft : int = 512 ,
243+ lowfreq : float = 0 , highfreq : Optional [float ] = None ):
237244 """Compute a Mel-filterbank. The filters are stored in the rows,
238245 the columns correspond to fft bins. The filters are returned as
239246 an array of size nfilt * (nfft/2 + 1)
@@ -246,9 +253,9 @@ def mel_filterbanks(sr, nfilters=26, nfft=512, lowfreq=0, highfreq=None):
246253 The number of filters, default 26
247254 nfft : int
248255 The size of FFT, default 512
249- lowfreq : int or float
256+ lowfreq : float
250257 The lowest band edge of the mel filters, default 0 Hz
251- highfreq : int or float
258+ highfreq : float
252259 The highest band edge of the mel filters, default sr // 2
253260
254261 Returns
@@ -260,12 +267,12 @@ def mel_filterbanks(sr, nfilters=26, nfft=512, lowfreq=0, highfreq=None):
260267 highfreq = highfreq or sr // 2
261268
262269 # compute points evenly spaced in mels
263- lowmel = hz2mel (lowfreq )
264- highmel = hz2mel (highfreq )
270+ lowmel = hz_to_mel (lowfreq )
271+ highmel = hz_to_mel (highfreq )
265272 melpoints = np .linspace (lowmel , highmel , nfilters + 2 )
266273 # our points are in Hz, but we use fft bins, so we have to convert
267274 # from Hz to fft bin number
268- bin = np .floor ((nfft + 1 ) * mel2hz (melpoints ) / sr )
275+ bin = np .floor ((nfft + 1 ) * mel_to_hz (melpoints ) / sr )
269276
270277 filter_banks = np .zeros ([nfilters , nfft // 2 + 1 ])
271278 for j in range (0 , nfilters ):
@@ -276,7 +283,7 @@ def mel_filterbanks(sr, nfilters=26, nfft=512, lowfreq=0, highfreq=None):
276283 return filter_banks
277284
278285 @staticmethod
279- def lifter (cepstra , L = 22 ):
286+ def lifter (cepstra : np . ndarray , L : int = 22 ):
280287 Apply a cepstral lifter to the matrix of cepstra.
281288 This has the effect of increasing the magnitude of
282289 the high frequency DCT coeffs.
@@ -315,21 +322,23 @@ def lifter(cepstra, L=22):
315322 # values of L <= 0, do nothing
316323 return cepstra
317324
318- def plot (self , cmap = 'inferno' , show_bar = True ,
319- offset = 0 , scale = 1. , xlim = None , ylim = None ,
320- x_as_time = True , nxlabel = 8 , ax = None , ** kwargs ):
325+ def plot (self , show_bar : bool = True , offset : int = 0 , scale : float = 1. ,
326+ xlim : Optional [ float ] = None , ylim : Optional [ float ] = None ,
327+ x_as_time : bool = True , nxlabel : int = 8 , ax = None , ** kwargs ):
321328 """Plot Amfcc.features via matshow, x is frames/time, y is the MFCCs
322329
323330 Parameters
324331 ----------
325- figsize : (float, float), optional, default: None
326- Figure size, width, height in inches, Default = [6.4, 4.8]
327- cmap : str
328- colormap for matplotlib. Default is 'inferno'.
329332 show_bar : bool, optional
330333 Default is True, show colorbar.
331- x_as_time : bool, optional
332- Default is True, show x axis as time or sample index.
334+ offset: int
335+ The spacing between channels; without setting it, all channels are overlaid on top of each other.
336+ scale: float
337+ Visual scaling to improve visibility
338+ xlim: float, optional
339+ x axis value range limit
340+ ylim: float, optional
341+ y axis value range limit
333342 nxlabel : int, optional
334343 The number of labels on the x axis. Default is 8.
335344 """
0 commit comments