NoiseReductionProject/estnoiseg.m at master · steve3nto/NoiseReductionProject · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
 function [x,zo]=estnoiseg(yf,tz,pp)
%ESTNOISEG - estimate MMSE noise spectrum [x,zo]=(yf,tz,pp)
%
% Usage:    ninc=round(0.016*fs);   % frame increment [fs=sample frequency]
%           ovf=2;                  % overlap factor
%           f=rfft(enframe(s,hanning(ovf*ninc,'periodic'),ninc),ovf*ninc,2);
%           f=f.*conj(f);           % convert to power spectrum
%           x=estnoiseg(f,ninc/fs); % estimate the noise power spectrum
%
% Inputs:
%   yf      input power spectra (one row per frame)
%   tz      frame increment in seconds
%           Alternatively, the input state from a previous call (see below)
%   pp      algorithm parameters [optional]
%
% Outputs:
%   x       estimated noise power spectra (one row per frame)
%   zo      output state
%
% The algorithm parameters are defined in reference [1] from which equation
% numbers are given in parentheses. They are as follows:
%
%        pp.tax      % smoothing time constant for noise power estimate [0.0717 seconds](8)
%        pp.tap      % smoothing time constant for smoothed speech prob [0.152 seconds](23)
%        pp.psthr    % threshold for smoothed speech probability [0.99] (24)
%        pp.pnsaf    % noise probability safety value [0.01] (24)
%        pp.pspri    % prior speech probability [0.5] (18)
%        pp.asnr     % active SNR in dB [15] (18)
%        pp.psini    % initial speech probability [0.5] (23)
%        pp.tavini   % assumed speech absent time at start [0.064 seconds]
%
% If convenient, you can call estnoiseg in chunks of arbitrary size. Thus the following are equivalent:
%
%                   (a) dp=estnoiseg(yp(1:300),tinc);
%
%                   (b) [dp(1:100,:),z]=estnoiseg(yp(1:100,:),tinc);
%                       [dp(101:200,:),z]=estnoiseg(yp(101:200,:),z);
%                       [dp(201:300,:),z]=estnoiseg(yp(201:300,:),z);


% This is intended to be a precise implementation of [1] for a frame rate of 62.5 Hz.
% Time constants are adjusted for other frame rates.
%
% Refs:
%    [1] Gerkmann, T. & Hendriks, R. C.
%        Unbiased MMSE-Based Noise Power Estimation With Low Complexity and Low Tracking Delay
%        IEEE Trans Audio, Speech, Language Processing, 2012, 20, 1383-1393

%       Copyright (C) Mike Brookes 2012
%      Version: $Id: estnoiseg.m 3387 2013-08-23 12:32:47Z dmb $
%
%   VOICEBOX is a MATLAB toolbox for speech processing.
%   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%   This program is free software; you can redistribute it and/or modify
%   it under the terms of the GNU General Public License as published by
%   the Free Software Foundation; either version 2 of the License, or
%   (at your option) any later version.

%   This program is distributed in the hope that it will be useful,
%   but WITHOUT ANY WARRANTY; without even the implied warranty of
%   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
%   GNU General Public License for more details.
%
%   You can obtain a copy of the GNU General Public License from
%   http://www.gnu.org/copyleft/gpl.html or by writing to
%   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.

[nr,nrf]=size(yf);          % number of frames and freq bins
x=zeros(nr,nrf);            % initialize output arrays
if isempty(yf) && isstruct(tz)             % no real data
     zo=tz;                  % just keep the same state
else
     if isstruct(tz)         % take parameters from a previous call
        nrcum=tz.nrcum;     % cumulative number of frames
         xt=tz.xt;           % smoothed power spectrum
         pslp=tz.pslp;       % correction factor (9)
         tinc=tz.tinc;       % frame increment
         qq=tz.qq;           % parameter structure
     else
         tinc = tz;          % second argument is frame increment
         nrcum=0;            % no frames so far
         % default algorithm constants
         qq.tax=0.0717;      % noise output smoothing time constant = -tinc/log(0.8) (8)
         qq.tap=0.152;       % speech prob smoothing time constant = -tinc/log(0.9) (23)
         qq.psthr=0.99;      % threshold for smoothed speech probability [0.99] (24)
         qq.pnsaf=0.01;      % noise probability safety value [0.01] (24)
         qq.pspri=0.5;       % prior speech probability [0.5] (18)
         qq.asnr=15;         % active SNR in dB [15] (18)
         qq.psini=0.5;       % initial speech probability [0.5] (23)
         qq.tavini=0.064;        % assumed speech absent time at start [64 ms]

         if nargin>=3 && ~isempty(pp)  % update fields from pp input
             qqn=fieldnames(qq);
             for i=1:length(qqn)
                 if isfield(pp,qqn{i})
                     qq.(qqn{i})=pp.(qqn{i});
                 end
             end
         end
         pslp=repmat(qq.psini,1,nrf); % initialize smoothed speech presence prob
         xt=[];                       % initialize just in case the first call has no data
     end

     % unpack parameters needed within the loop

     psthr=qq.psthr;     % threshold for smoothed speech probability [0.99] (24)
     pnsaf=qq.pnsaf;     % noise probability safety value [0.01] (24)

     % derived algorithm constants

     ax=exp(-tinc/qq.tax); % noise output smoothing factor = 0.8 (8)
     axc=1-ax;
     ap=exp(-tinc/qq.tap); % noise output smoothing factor = 0.9 (23)
     apc=1-ap;
     xih1=10^(qq.asnr/10); % speech-present SNR
     xih1r=1/(1+xih1)-1;
     pfac=(1/qq.pspri-1)*(1+xih1); % p(noise)/p(speech) (18)

     if nrcum==0 && nr>0       % initialize values for first frame
         xt=qq.psini*mean(yf(1:max(1,min(nr,round(1+qq.tavini/tinc))),:),1);  % initial noise estimate
     end

     % loop for each frame
     for t=1:nr
         yft=yf(t,:);        % noisy speech power spectrum
         ph1y=(1+pfac*exp(-xih1r*yft./xt)).^(-1); % a-posteriori speech presence prob (18)
         pslp=ap*pslp+apc*ph1y; % smoothed speech presence prob (23)
         ph1y=min(ph1y,1-pnsaf*(pslp>psthr)); % limit ph1y (24)
         xtr=(1-ph1y).*yft+ph1y.*xt; % estimated raw noise spectrum (22)
         xt=ax*xt+axc*xtr;  % smooth the noise estimate (8)
         x(t,:)=xt;  % save the noise estimate
     end
     if nargout>1    % we need to store the state for next time
         zo.nrcum=nrcum+nr;      % number of frames so far
         zo.xt=xt;          % smoothed power spectrum
         zo.pslp=pslp;               % correction factor (9)
         zo.tinc=tinc;     % must be the last one
         zo.qq=qq;
     end
     if ~nargout
         clf;
         subplot(212);
         plot((1:nr)*tinc,10*log10([sum(yf,2) sum(x,2)]))
         ylabel('Frame Energy (dB)');
         xlabel(sprintf('Time (s)   [%d ms frame incr]',round(tinc*1000)));
         axisenlarge([-1 -1.05]);
         legend('input','noise','Location','Best');
         subplot(211);
         plot(1:nrf,10*log10([sum(yf,1)'/nr sum(x,1)'/nr]))
         ylabel('Power (dB)');
         xlabel('Frequency bin');
         axisenlarge([-1 -1.05]);
         legend('input','noise','Location','Best');
     end
 end