3030 http://zafarrafii.com
3131 https://github.com/zafarrafii
3232 https://www.linkedin.com/in/zafarrafii/
33- 01/12 /21
33+ 01/19 /21
3434"""
3535
3636import numpy as np
@@ -775,7 +775,7 @@ def simonline(audio_signal, sampling_frequency):
775775 plt.show()
776776 """
777777
778- # Number of samples and channels
778+ # Get the number of samples and channels in the audio signal
779779 number_samples , number_channels = np .shape (audio_signal )
780780
781781 # Set the parameters for the STFT
@@ -785,39 +785,37 @@ def simonline(audio_signal, sampling_frequency):
785785 window_function = scipy .signal .hamming (window_length , sym = False )
786786 step_length = int (window_length / 2 )
787787
788- # Number of time frames
788+ # Derive the number of time frames
789789 number_times = int (np .ceil ((number_samples - window_length ) / step_length + 1 ))
790790
791- # Buffer length in time frames
792- buffer_length2 = int (
793- round ((buffer_length * sampling_frequency - window_length ) / step_length + 1 )
794- )
791+ # Derive the number of frequency channels
792+ number_frequencies = int (window_length / 2 + 1 )
795793
796- # Initialize the buffer spectrogram
797- buffer_spectrogram = np .zeros (
798- (int (window_length / 2 + 1 ), buffer_length2 , number_channels )
799- )
794+ # Get the buffer length in time frames
795+ buffer_length2 = round ((buffer_length * sampling_frequency ) / step_length )
800796
801- # Loop over the time frames to compute the buffer spectrogram (the last frame will be the frame to be processed)
802- for time_index in range ( 0 , buffer_length2 - 1 ):
797+ # Initialize the buffer spectrogram
798+ buffer_spectrogram = np . zeros (( number_frequencies , buffer_length2 , number_channels ))
803799
804- # Sample index in the signal
805- sample_index = step_length * time_index
800+ # Loop over the time frames to compute the buffer spectrogram
801+ # (the last frame will be the frame to be processed)
802+ k = 0
803+ for j in range (buffer_length2 - 1 ):
806804
807805 # Loop over the channels
808- for channel_index in range (0 , number_channels ):
806+ for i in range (number_channels ):
809807
810808 # Compute the FT of the segment
811809 buffer_ft = np .fft .fft (
812- audio_signal [sample_index : window_length + sample_index , channel_index ]
813- * window_function ,
810+ audio_signal [k : k + window_length , i ] * window_function ,
814811 axis = 0 ,
815812 )
816813
817- # Derive the spectrum of the frame
818- buffer_spectrogram [:, time_index , channel_index ] = abs (
819- buffer_ft [0 : int (window_length / 2 + 1 )]
820- )
814+ # Derive the magnitude spectrum and save it in the buffer spectrogram
815+ buffer_spectrogram [:, j , i ] = abs (buffer_ft [0 :number_frequencies ])
816+
817+ # Update the index
818+ k = k + step_length
821819
822820 # Zero-pad the audio signal at the end
823821 audio_signal = np .pad (
@@ -827,12 +825,14 @@ def simonline(audio_signal, sampling_frequency):
827825 constant_values = 0 ,
828826 )
829827
830- # Similarity distance in time frames
831- similarity_distance2 = int (round (similarity_distance * sample_rate / step_length ))
828+ # Get the similarity distance in time frames
829+ similarity_distance2 = int (
830+ round (similarity_distance * sampling_frequency / step_length )
831+ )
832832
833- # Cutoff frequency in frequency channels for the dual high-pass filter of the foreground
833+ # Get the cutoff frequency in frequency channels for the dual high-pass filter of the foreground
834834 cutoff_frequency2 = (
835- int (np .ceil (cutoff_frequency * (window_length - 1 ) / sample_rate )) - 1
835+ int (np .ceil (cutoff_frequency * (window_length - 1 ) / sampling_frequency )) - 1
836836 )
837837
838838 # Initialize the background signal
@@ -841,41 +841,33 @@ def simonline(audio_signal, sampling_frequency):
841841 )
842842
843843 # Loop over the time frames to compute the background signal
844- for time_index in range (buffer_length2 - 1 , number_times ):
845-
846- # Sample index in the signal
847- sample_index = step_length * time_index
844+ for j in range (buffer_length2 - 1 , number_times ):
848845
849- # Time index of the current frame
850- current_index = time_index % buffer_length2
846+ # Get the time index of the current frame
847+ j0 = j % buffer_length2
851848
852849 # Initialize the FT of the current segment
853850 current_ft = np .zeros ((window_length , number_channels ), dtype = complex )
854851
855852 # Loop over the channels
856- for channel_index in range (0 , number_channels ):
853+ for i in range (number_channels ):
857854
858855 # Compute the FT of the current segment
859- current_ft [:, channel_index ] = np .fft .fft (
860- audio_signal [sample_index : window_length + sample_index , channel_index ]
861- * window_function ,
856+ current_ft [:, i ] = np .fft .fft (
857+ audio_signal [k : k + window_length , i ] * window_function ,
862858 axis = 0 ,
863859 )
864860
865- # Derive the spectrum of the current frame and update the buffer spectrogram
866- buffer_spectrogram [:, current_index , channel_index ] = np .abs (
867- current_ft [0 : int (window_length / 2 + 1 ), channel_index ]
868- )
861+ # Derive the magnitude spectrum and update the buffer spectrogram
862+ buffer_spectrogram [:, j0 , i ] = np .abs (current_ft [0 :number_frequencies , i ])
869863
870- # Cosine similarity between the spectrum of the current frame and the past frames , for all the channels
864+ # Compute the cosine similarity between the current frame and the past ones , for all the channels
871865 similarity_vector = _similaritymatrix (
872866 np .mean (buffer_spectrogram , axis = 2 ),
873- np .mean (
874- buffer_spectrogram [:, current_index : current_index + 1 , :], axis = 2
875- ),
867+ np .mean (buffer_spectrogram [:, j0 : j0 + 1 , :], axis = 2 ),
876868 )
877869
878- # Indices of the similar frames
870+ # Estimate the indices of the similar frames
879871 _ , similarity_indices = _localmaxima (
880872 similarity_vector [:, 0 ],
881873 similarity_threshold ,
@@ -884,46 +876,44 @@ def simonline(audio_signal, sampling_frequency):
884876 )
885877
886878 # Loop over the channels
887- for channel_index in range (0 , number_channels ):
879+ for i in range (number_channels ):
888880
889881 # Compute the repeating spectrum for the current frame
890882 repeating_spectrum = np .median (
891- buffer_spectrogram [:, similarity_indices , channel_index ], axis = 1
883+ buffer_spectrogram [:, similarity_indices , i ], axis = 1
892884 )
893885
894886 # Refine the repeating spectrum
895887 repeating_spectrum = np .minimum (
896- repeating_spectrum , buffer_spectrogram [:, current_index , channel_index ]
888+ repeating_spectrum , buffer_spectrogram [:, j0 , i ]
897889 )
898890
899891 # Derive the repeating mask for the current frame
900892 repeating_mask = (repeating_spectrum + np .finfo (float ).eps ) / (
901- buffer_spectrogram [:, current_index , channel_index ]
902- + np .finfo (float ).eps
893+ buffer_spectrogram [:, j0 , i ] + np .finfo (float ).eps
903894 )
904895
905- # High -pass filtering of the dual foreground
896+ # Perform a high -pass filtering of the dual foreground
906897 repeating_mask [1 : cutoff_frequency2 + 2 ] = 1
907898
908- # Mirror the frequency channels
899+ # Recover the mirrored frequencies
909900 repeating_mask = np .concatenate ((repeating_mask , repeating_mask [- 2 :0 :- 1 ]))
910901
911902 # Apply the mask to the FT of the current segment
912- background_ft = repeating_mask * current_ft [:, channel_index ]
913-
914- # Inverse FT of the current segment
915- background_signal [
916- sample_index : window_length + sample_index , channel_index
917- ] = background_signal [
918- sample_index : window_length + sample_index , channel_index
919- ] + np .real (
920- np .fft .ifft (background_ft , axis = 0 )
921- )
903+ background_ft = repeating_mask * current_ft [:, i ]
922904
923- # Truncate the signal to the original length
905+ # Take the inverse FT of the current segment
906+ background_signal [k : k + window_length , i ] = background_signal [
907+ k : k + window_length , i
908+ ] + np .real (np .fft .ifft (background_ft , axis = 0 ))
909+
910+ # Update the index
911+ k = k + step_length
912+
913+ # Truncate the signal to the original number of samples
924914 background_signal = background_signal [0 :number_samples , :]
925915
926- # Un-window the signal (just in case )
916+ # Normalize the signal by the gain introduced by the COLA (if any )
927917 background_signal = background_signal / sum (
928918 window_function [0 :window_length :step_length ]
929919 )
0 commit comments