diff --git a/README.md b/README.md index b899d05..28b74f4 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,31 @@ -# pysepm -Python implementation of performance metrics in Loizou's Speech Enhancement book +# pysepm - Python Speech Enhancement Performance Measures (Quality and Intelligibility) +[![DOI](https://zenodo.org/badge/220233987.svg)](https://zenodo.org/badge/latestdoi/220233987) + +Python implementation of objective quality and intelligibilty measures mentioned in Philipos C. Loizou's great [Speech Enhancement Book](https://www.crcpress.com/Speech-Enhancement-Theory-and-Practice-Second-Edition/Loizou/p/book/9781138075573). The Python implementations are checked with the MATLAB implementations attached to the book (see [Link](https://crcpress.com/downloads/K14513/K14513_CD_Files.zip)) + +# Install with pip +Install pysepm: +``` +pip3 install https://github.com/schmiph2/pysepm/archive/master.zip +``` +# Examples +Please find a Jupyter Notebook with examples for all implemented measures in the [examples folder](https://github.com/schmiph2/pysepm/tree/master/examples). + +# Implemented Measures +## Speech Quality Measures ++ Segmental Signal-to-Noise Ratio (SNRseg) ++ Frequency-weighted Segmental SNR (fwSNRseg) ++ Log-likelihood Ratio (LLR) ++ Weighted Spectral Slope (WSS) ++ Perceptual Evaluation of Speech Quality (PESQ), ([python-pesq](https://github.com/ludlows/python-pesq) implementation by ludlows) ++ Composite Objective Speech Quality (composite) ++ Cepstrum Distance Objective Speech Quality Measure (CD) + +## Speech Intelligibility Measures ++ Short-time objective intelligibility (STOI), ([pystoi](https://github.com/mpariente/pystoi) implementation by mpariente) ++ Coherence and speech intelligibility index (CSII) ++ Normalized-covariance measure (NCM) + +## Dereverberation Measures (TODO) ++ Bark spectral distortion (BSD) ++ Scale-invariant signal to distortion ratio (SI-SDR) diff --git a/examples/1_noisySpeech_16000_Hz.wav b/examples/1_noisySpeech_16000_Hz.wav new file mode 100644 index 0000000..cb93d43 Binary files /dev/null and b/examples/1_noisySpeech_16000_Hz.wav differ diff --git a/examples/1_processed_16000_Hz.wav b/examples/1_processed_16000_Hz.wav new file mode 100644 index 0000000..88468b6 Binary files /dev/null and b/examples/1_processed_16000_Hz.wav differ diff --git a/examples/1_speech_16000_Hz.wav b/examples/1_speech_16000_Hz.wav new file mode 100644 index 0000000..1e90d29 Binary files /dev/null and b/examples/1_speech_16000_Hz.wav differ diff --git a/examples/examplesForCalculatingMeasures.ipynb b/examples/examplesForCalculatingMeasures.ipynb new file mode 100644 index 0000000..4bb8e46 --- /dev/null +++ b/examples/examplesForCalculatingMeasures.ipynb @@ -0,0 +1,1027 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.io import wavfile\n", + "import sys\n", + "sys.path.append(\"../\") \n", + "import pysepm\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load Audio Files" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: WavFileWarning: Chunk (non-data) not understood, skipping it.\n", + " \"\"\"Entry point for launching an IPython kernel.\n", + "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:2: WavFileWarning: Chunk (non-data) not understood, skipping it.\n", + " \n", + "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:3: WavFileWarning: Chunk (non-data) not understood, skipping it.\n", + " This is separate from the ipykernel package so we can avoid doing imports until\n" + ] + } + ], + "source": [ + "fs, clean_speech = wavfile.read('1_speech_16000_Hz.wav')\n", + "fs, noisy_speech = wavfile.read('1_noisySpeech_16000_Hz.wav')\n", + "fs, enhanced_speech = wavfile.read('1_processed_16000_Hz.wav')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Calc Measures" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## fwSNRseg" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5.677122381286333" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pysepm.fwSNRseg(clean_speech, noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3.7018915637639442" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pysepm.fwSNRseg(clean_speech, enhanced_speech, fs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## SNRseg" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "-0.21686174881958412" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pysepm.SNRseg(clean_speech, noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "-1.2033705198493232" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pysepm.SNRseg(clean_speech, enhanced_speech, fs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## LLR" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.2530765533868626" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pysepm.llr(clean_speech, noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.6044180854910925" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pysepm.llr(clean_speech, enhanced_speech, fs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## WSS" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "44.69861369131994" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pysepm.wss(clean_speech, noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "66.79548150633117" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pysepm.wss(clean_speech, enhanced_speech, fs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cepstrum Distance (CD)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7.187803223674154" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pysepm.cepstrum_distance(clean_speech, noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.308018500622689" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pysepm.cepstrum_distance(clean_speech, enhanced_speech, fs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## STOI" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8389211808320151" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pysepm.stoi(clean_speech, noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.661152205056174" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pysepm.stoi(clean_speech, enhanced_speech, fs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## CSII" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0.7336988398405825, 0.4363089356143529, 0.018848854017316796)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pysepm.csii(clean_speech, noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0.527623436040412, 0.2437425452887004, 0.010118033728508934)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pysepm.csii(clean_speech, enhanced_speech, fs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## PESQ" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(nan, 1.1624178886413574)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pysepm.pesq(clean_speech, noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(nan, 1.0594704151153564)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pysepm.pesq(clean_speech, enhanced_speech, fs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Composite" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2.037992006581127, 1.8630831647556954, 1.5433156477547219)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pysepm.composite(clean_speech, noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1.0, 1.5970461451303148, 1.0)" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pysepm.composite(clean_speech, enhanced_speech, fs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## NCM" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "pysepm.ncm(clean_speech, noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "pysepm.ncm(clean_speech, enhanced_speech, fs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SRMR" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.6/dist-packages/srmrpy/hilbert.py:69: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.\n", + " h = h[ind]\n" + ] + }, + { + "data": { + "text/plain": [ + "3.2050299299144114" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pysepm.srmr(noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7.317370449764798" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pysepm.srmr(enhanced_speech, fs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BSD" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "include pre-emphasis\n" + ] + }, + { + "data": { + "text/plain": [ + "38086949616.83964" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pysepm.bsd(clean_speech, noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "include pre-emphasis\n" + ] + }, + { + "data": { + "text/plain": [ + "51047508.139671564" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pysepm.bsd(clean_speech, enhanced_speech, fs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Measure Execution Times" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "61.1 ms ± 667 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%timeit pysepm.fwSNRseg(clean_speech, noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "14.5 ms ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + ] + } + ], + "source": [ + "%timeit pysepm.SNRseg(clean_speech, noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "113 ms ± 188 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%timeit pysepm.llr(clean_speech, noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "80 ms ± 552 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%timeit pysepm.wss(clean_speech, noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "92.5 ms ± 391 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%timeit pysepm.cepstrum_distance(clean_speech, noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "143 ms ± 381 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%timeit pysepm.stoi(clean_speech, noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "82.7 ms ± 450 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%timeit pysepm.csii(clean_speech, noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "309 ms ± 510 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%timeit pysepm.pesq(clean_speech, noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "521 ms ± 1.27 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%timeit pysepm.composite(clean_speech, noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5.13 s ± 13.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%timeit pysepm.ncm(clean_speech, noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.6/dist-packages/srmrpy/hilbert.py:69: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.\n", + " h = h[ind]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.6 s ± 660 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%timeit pysepm.srmr(clean_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "include pre-emphasis\n", + "bark filter not tested\n", + "37.9 ms ± 319 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%timeit pysepm.bsd(clean_speech, noisy_speech, fs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..f1a89dd --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,2 @@ +[build-system] +requires = ["setuptools", "wheel", "Cython","numpy"] diff --git a/pysepm/__init__.py b/pysepm/__init__.py new file mode 100755 index 0000000..c8ff390 --- /dev/null +++ b/pysepm/__init__.py @@ -0,0 +1,8 @@ +__version__ = '0.1' + + +from .qualityMeasures import fwSNRseg,SNRseg,llr,wss,composite,pesq,cepstrum_distance +from .intelligibilityMeasures import stoi,csii,ncm +from .reverberationMeasures import srr_seg,bsd#,srmr + + diff --git a/pysepm/intelligibilityMeasures.py b/pysepm/intelligibilityMeasures.py new file mode 100755 index 0000000..c181dd1 --- /dev/null +++ b/pysepm/intelligibilityMeasures.py @@ -0,0 +1,278 @@ +from scipy.signal import stft,resample,butter,lfilter,hilbert +from scipy.interpolate import interp1d +from pystoi import stoi as pystoi # https://github.com/mpariente/pystoi +import numpy as np + +from .util import extract_overlapped_windows,resample_matlab_like + +stoi = pystoi + +def fwseg_noise(clean_speech, processed_speech,fs,frameLen=0.03, overlap=0.75): + + clean_length = len(clean_speech) + processed_length = len(processed_speech) + rms_all=np.linalg.norm(clean_speech)/np.sqrt(processed_length) + + winlength = round(frameLen*fs) #window length in samples + skiprate = int(np.floor((1-overlap)*frameLen*fs)) #window skip in samples + max_freq = fs/2 #maximum bandwidth + num_crit = 16 # number of critical bands + n_fft = int(2**np.ceil(np.log2(2*winlength))) + n_fftby2 = int(n_fft/2) + + cent_freq=np.zeros((num_crit,)) + bandwidth=np.zeros((num_crit,)) + + # ---------------------------------------------------------------------- + # Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz) + # ---------------------------------------------------------------------- + cent_freq[0] = 150.0000; bandwidth[0] = 100.0000; + cent_freq[1] = 250.000; bandwidth[1] = 100.0000; + cent_freq[2] = 350.000; bandwidth[2] = 100.0000; + cent_freq[3] = 450.000; bandwidth[3] = 110.0000; + cent_freq[4] = 570.000; bandwidth[4] = 120.0000; + cent_freq[5] = 700.000; bandwidth[5] = 140.0000; + cent_freq[6] = 840.000; bandwidth[6] = 150.0000; + cent_freq[7] = 1000.000; bandwidth[7] = 160.000; + cent_freq[8] = 1170.000; bandwidth[8] = 190.000; + cent_freq[9] = 1370.000; bandwidth[9] = 210.000; + cent_freq[10] = 1600.000; bandwidth[10]= 240.000; + cent_freq[11] = 1850.000; bandwidth[11]= 280.000; + cent_freq[12] = 2150.000; bandwidth[12]= 320.000; + cent_freq[13] = 2500.000; bandwidth[13]= 380.000; + cent_freq[14] = 2900.000; bandwidth[14]= 450.000; + cent_freq[15] = 3400.000; bandwidth[15]= 550.000; + + Weight=np.array([0.0192,0.0312,0.0926,0.1031,0.0735,0.0611,0.0495,0.044,0.044,0.049,0.0486,0.0493, 0.049,0.0547,0.0555,0.0493]) + + # ---------------------------------------------------------------------- + # Set up the critical band filters. Note here that Gaussianly shaped + # filters are used. Also, the sum of the filter weights are equivalent + # for each critical band filter. Filter less than -30 dB and set to + # zero. + # ---------------------------------------------------------------------- + + all_f0=np.zeros((num_crit,)) + crit_filter=np.zeros((num_crit,int(n_fftby2))) + g = np.zeros((num_crit,n_fftby2)) + + b = bandwidth; + q = cent_freq/1000; + p = 4*1000*q/b; # Eq. (7) + + #15.625=4000/256 + j = np.arange(0,n_fftby2) + + for i in range(num_crit): + g[i,:]=np.abs(1-j*(fs/n_fft)/(q[i]*1000));# Eq. (9) + crit_filter[i,:] = (1+p[i]*g[i,:])*np.exp(-p[i]*g[i,:]);# Eq. (8) + + num_frames = int(clean_length/skiprate-(winlength/skiprate)); # number of frames + start = 0 # starting sample + hannWin = 0.5*(1-np.cos(2*np.pi*np.arange(1,winlength+1)/(winlength+1))) + + f,t,clean_spec=stft(clean_speech[0:int(num_frames)*skiprate+int(winlength-skiprate)], fs=fs, window=hannWin, nperseg=winlength, noverlap=winlength-skiprate, nfft=n_fft, detrend=False, return_onesided=False, boundary=None, padded=False) + f,t,processed_spec=stft(processed_speech[0:int(num_frames)*skiprate+int(winlength-skiprate)], fs=fs, window=hannWin, nperseg=winlength, noverlap=winlength-skiprate, nfft=n_fft, detrend=False, return_onesided=False, boundary=None, padded=False) + + clean_frames = extract_overlapped_windows(clean_speech[0:int(num_frames)*skiprate+int(winlength-skiprate)],winlength,winlength-skiprate,None) + rms_seg = np.linalg.norm(clean_frames,axis=-1)/np.sqrt(winlength); + rms_db = 20*np.log10(rms_seg/rms_all); + #-------------------------------------------------------------- + # cal r2_high,r2_middle,r2_low + highInd = np.where(rms_db>=0) + highInd = highInd[0] + middleInd = np.where((rms_db>=-10) & (rms_db<0)) + middleInd = middleInd[0] + lowInd = np.where(rms_db<-10) + lowInd = lowInd[0] + + num_high = np.sum(clean_spec[0:n_fftby2,highInd]*np.conj(processed_spec[0:n_fftby2,highInd]),axis=-1) + denx_high = np.sum(np.abs(clean_spec[0:n_fftby2,highInd])**2,axis=-1) + deny_high = np.sum(np.abs(processed_spec[0:n_fftby2,highInd])**2,axis=-1); + + num_middle = np.sum(clean_spec[0:n_fftby2,middleInd]*np.conj(processed_spec[0:n_fftby2,middleInd]),axis=-1) + denx_middle = np.sum(np.abs(clean_spec[0:n_fftby2,middleInd])**2,axis=-1) + deny_middle = np.sum(np.abs(processed_spec[0:n_fftby2,middleInd])**2,axis=-1); + + num_low = np.sum(clean_spec[0:n_fftby2,lowInd]*np.conj(processed_spec[0:n_fftby2,lowInd]),axis=-1) + denx_low = np.sum(np.abs(clean_spec[0:n_fftby2,lowInd])**2,axis=-1) + deny_low = np.sum(np.abs(processed_spec[0:n_fftby2,lowInd])**2,axis=-1); + + num2_high = np.abs(num_high)**2; + r2_high = num2_high/(denx_high*deny_high); + + num2_middle = np.abs(num_middle)**2; + r2_middle = num2_middle/(denx_middle*deny_middle); + + num2_low = np.abs(num_low)**2; + r2_low = num2_low/(denx_low*deny_low); + #-------------------------------------------------------------- + # cal distortion frame by frame + + clean_spec = np.abs(clean_spec); + processed_spec = np.abs(processed_spec)**2; + + W_freq=Weight + + processed_energy = crit_filter.dot((processed_spec[0:n_fftby2,highInd].T*r2_high).T) + de_processed_energy= crit_filter.dot((processed_spec[0:n_fftby2,highInd].T*(1-r2_high)).T) + SDR = processed_energy/de_processed_energy;# Eq 13 in Kates (2005) + SDRlog=10*np.log10(SDR); + SDRlog_lim = SDRlog + SDRlog_lim[SDRlog_lim<-15]=-15 + SDRlog_lim[SDRlog_lim>15]=15 # limit between [-15, 15] + Tjm = (SDRlog_lim+15)/30; + distortionh = W_freq.dot(Tjm)/np.sum(W_freq,axis=0) + distortionh[distortionh<0]=0 + + + processed_energy = crit_filter.dot((processed_spec[0:n_fftby2,middleInd].T*r2_middle).T) + de_processed_energy= crit_filter.dot((processed_spec[0:n_fftby2,middleInd].T*(1-r2_middle)).T) + SDR = processed_energy/de_processed_energy;# Eq 13 in Kates (2005) + SDRlog=10*np.log10(SDR); + SDRlog_lim = SDRlog + SDRlog_lim[SDRlog_lim<-15]=-15 + SDRlog_lim[SDRlog_lim>15]=15 # limit between [-15, 15] + Tjm = (SDRlog_lim+15)/30; + distortionm = W_freq.dot(Tjm)/np.sum(W_freq,axis=0) + distortionm[distortionm<0]=0 + + processed_energy = crit_filter.dot((processed_spec[0:n_fftby2,lowInd].T*r2_low).T) + de_processed_energy= crit_filter.dot((processed_spec[0:n_fftby2,lowInd].T*(1-r2_low)).T) + SDR = processed_energy/de_processed_energy;# Eq 13 in Kates (2005) + SDRlog=10*np.log10(SDR); + SDRlog_lim = SDRlog + SDRlog_lim[SDRlog_lim<-15]=-15 + SDRlog_lim[SDRlog_lim>15]=15 # limit between [-15, 15] + Tjm = (SDRlog_lim+15)/30; + distortionl = W_freq.dot(Tjm)/np.sum(W_freq,axis=0) + distortionl[distortionl<0]=0 + + return distortionh,distortionm,distortionl + + +def csii(clean_speech, processed_speech,sample_rate): + sampleLen= min(len( clean_speech), len( processed_speech)) + clean_speech= clean_speech[0: sampleLen] + processed_speech= processed_speech[0: sampleLen] + vec_CSIIh,vec_CSIIm,vec_CSIIl = fwseg_noise(clean_speech, processed_speech, sample_rate) + + CSIIh=np.mean(vec_CSIIh) + CSIIm=np.mean(vec_CSIIm) + CSIIl=np.mean(vec_CSIIl) + return CSIIh,CSIIm,CSIIl + + + +def get_band(M,Fs): + # This function sets the bandpass filter band edges. + # It assumes that the sampling frequency is 8000 Hz. + A = 165 + a = 2.1 + K = 1 + L = 35 + CF = 300; + x_100 = (L/a)*np.log10(CF/A + K) + CF = Fs/2-600 + x_8000 = (L/a)*np.log10(CF/A + K); + LX = x_8000 - x_100 + x_step = LX / M + x = np.arange(x_100,x_8000+x_step+1e-20,x_step) + if len(x) == M: + np.append(x,x_8000) + + BAND = A*(10**(a*x/L) - K) + return BAND + +def get_ansis(BAND): + fcenter=(BAND[0:-1]+BAND[1:])/2; + + # Data from Table B.1 in "ANSI (1997). S3.5–1997 Methods for Calculation of the Speech Intelligibility + # Index. New York: American National Standards Institute." + f=np.array([150,250,350,450,570,700,840,1000,1170,1370,1600,1850,2150,2500,2900,3400,4000,4800,5800,7000,8500]) + BIF=np.array([0.0192,0.0312,0.0926,0.1031,0.0735,0.0611,0.0495,0.0440,0.0440,0.0490,0.0486,0.0493,0.0490,0.0547,0.0555,0.0493,0.0359,0.0387,0.0256,0.0219,0.0043]) + f_ANSI = interp1d(f,BIF) + ANSIs= f_ANSI(fcenter); + return fcenter,ANSIs + + +def ncm(clean_speech,processed_speech,fs): + + if fs != 8000 and fs != 16000: + raise ValueError('fs must be either 8 kHz or 16 kHz') + + + + x= clean_speech # clean signal + y= processed_speech # noisy signal + F_SIGNAL = fs + + F_ENVELOPE = 32 # limits modulations to 0 Ly: + x = x[0:Ly] + if Ly > Lx: + y = y[0:Lx] + + Lx = len(x); + Ly = len(y); + + X_BANDS = np.zeros((Lx,M_CHANNELS)) + Y_BANDS = np.zeros((Lx,M_CHANNELS)) + + # DESIGN BANDPASS FILTERS + for a in range(M_CHANNELS): + B_bp,A_bp = butter( 4 , np.array([BAND[a],BAND[a+1]])*(2/F_SIGNAL),btype='bandpass') + X_BANDS[:,a] = lfilter( B_bp , A_bp , x ) + Y_BANDS[:,a] = lfilter( B_bp , A_bp , y ) + + gcd = np.gcd(F_SIGNAL, F_ENVELOPE) + # CALCULATE HILBERT ENVELOPES, and resample at F_ENVELOPE Hz + analytic_x = hilbert( X_BANDS,axis=0); + X = np.abs( analytic_x ); + #X = resample( X , round(len(x)/F_SIGNAL*F_ENVELOPE)); + X = resample_matlab_like(X,F_ENVELOPE,F_SIGNAL) + analytic_y = hilbert( Y_BANDS,axis=0); + Y = np.abs( analytic_y ); + #Y = resample( Y , round(len(x)/F_SIGNAL*F_ENVELOPE)); + Y = resample_matlab_like(Y,F_ENVELOPE,F_SIGNAL) + ## ---compute weights based on clean signal's rms envelopes----- + # + Ldx, pp=X.shape + p=3 # power exponent - see Eq. 12 + + ro2 = np.zeros((M_CHANNELS,)) + asnr = np.zeros((M_CHANNELS,)) + TI = np.zeros((M_CHANNELS,)) + + for k in range(M_CHANNELS): + x_tmp= X[ :, k] + y_tmp= Y[ :, k] + lambda_x= np.linalg.norm( x_tmp- np.mean( x_tmp))**2 + lambda_y= np.linalg.norm( y_tmp- np.mean( y_tmp))**2 + lambda_xy= np.sum( (x_tmp- np.mean( x_tmp))*(y_tmp- np.mean( y_tmp))) + ro2[k]= (lambda_xy**2)/ (lambda_x*lambda_y) + asnr[k]= 10*np.log10( (ro2[k]+ 1e-20)/ (1- ro2[k]+ 1e-20)); # Eq.9 in [1] + + if asnr[k]< -15: + asnr[k]= -15 + elif asnr[k]> 15: + asnr[k]= 15 + + TI[k]= (asnr[k]+ 15)/ 30 # Eq.10 in [1] + + ncm_val= WEIGHT.dot(TI)/np.sum(WEIGHT) # Eq.11 + return ncm_val diff --git a/pysepm/qualityMeasures.py b/pysepm/qualityMeasures.py new file mode 100755 index 0000000..f2c657c --- /dev/null +++ b/pysepm/qualityMeasures.py @@ -0,0 +1,446 @@ +from scipy.signal import stft,get_window,correlate,resample +from scipy.linalg import solve_toeplitz,toeplitz +import scipy +import pesq as pypesq # https://github.com/ludlows/python-pesq +import numpy as np +from numba import jit +from .util import extract_overlapped_windows + +def SNRseg(clean_speech, processed_speech,fs, frameLen=0.03, overlap=0.75): + eps=np.finfo(np.float64).eps + + winlength = round(frameLen*fs) #window length in samples + skiprate = int(np.floor((1-overlap)*frameLen*fs)) #window skip in samples + MIN_SNR = -10 # minimum SNR in dB + MAX_SNR = 35 # maximum SNR in dB + + hannWin=0.5*(1-np.cos(2*np.pi*np.arange(1,winlength+1)/(winlength+1))) + clean_speech_framed=extract_overlapped_windows(clean_speech,winlength,winlength-skiprate,hannWin) + processed_speech_framed=extract_overlapped_windows(processed_speech,winlength,winlength-skiprate,hannWin) + + signal_energy = np.power(clean_speech_framed,2).sum(-1) + noise_energy = np.power(clean_speech_framed-processed_speech_framed,2).sum(-1) + + segmental_snr = 10*np.log10(signal_energy/(noise_energy+eps)+eps) + segmental_snr[segmental_snrMAX_SNR]=MAX_SNR + segmental_snr=segmental_snr[:-1] # remove last frame -> not valid + return np.mean(segmental_snr) + +def fwSNRseg(cleanSig, enhancedSig, fs, frameLen=0.03, overlap=0.75): + if cleanSig.shape!=enhancedSig.shape: + raise ValueError('The two signals do not match!') + eps=np.finfo(np.float64).eps + cleanSig=cleanSig.astype(np.float64)+eps + enhancedSig=enhancedSig.astype(np.float64)+eps + winlength = round(frameLen*fs) #window length in samples + skiprate = int(np.floor((1-overlap)*frameLen*fs)) #window skip in samples + max_freq = fs/2 #maximum bandwidth + num_crit = 25# number of critical bands + n_fft = 2**np.ceil(np.log2(2*winlength)) + n_fftby2 = int(n_fft/2) + gamma=0.2 + + cent_freq=np.zeros((num_crit,)) + bandwidth=np.zeros((num_crit,)) + + cent_freq[0] = 50.0000; bandwidth[0] = 70.0000; + cent_freq[1] = 120.000; bandwidth[1] = 70.0000; + cent_freq[2] = 190.000; bandwidth[2] = 70.0000; + cent_freq[3] = 260.000; bandwidth[3] = 70.0000; + cent_freq[4] = 330.000; bandwidth[4] = 70.0000; + cent_freq[5] = 400.000; bandwidth[5] = 70.0000; + cent_freq[6] = 470.000; bandwidth[6] = 70.0000; + cent_freq[7] = 540.000; bandwidth[7] = 77.3724; + cent_freq[8] = 617.372; bandwidth[8] = 86.0056; + cent_freq[9] = 703.378; bandwidth[9] = 95.3398; + cent_freq[10] = 798.717; bandwidth[10] = 105.411; + cent_freq[11] = 904.128; bandwidth[11] = 116.256; + cent_freq[12] = 1020.38; bandwidth[12] = 127.914; + cent_freq[13] = 1148.30; bandwidth[13] = 140.423; + cent_freq[14] = 1288.72; bandwidth[14] = 153.823; + cent_freq[15] = 1442.54; bandwidth[15] = 168.154; + cent_freq[16] = 1610.70; bandwidth[16] = 183.457; + cent_freq[17] = 1794.16; bandwidth[17] = 199.776; + cent_freq[18] = 1993.93; bandwidth[18] = 217.153; + cent_freq[19] = 2211.08; bandwidth[19] = 235.631; + cent_freq[20] = 2446.71; bandwidth[20] = 255.255; + cent_freq[21] = 2701.97; bandwidth[21] = 276.072; + cent_freq[22] = 2978.04; bandwidth[22] = 298.126; + cent_freq[23] = 3276.17; bandwidth[23] = 321.465; + cent_freq[24] = 3597.63; bandwidth[24] = 346.136; + + + W=np.array([0.003,0.003,0.003,0.007,0.010,0.016,0.016,0.017,0.017,0.022,0.027,0.028,0.030,0.032,0.034,0.035,0.037,0.036,0.036,0.033,0.030,0.029,0.027,0.026, + 0.026]) + + bw_min=bandwidth[0] + min_factor = np.exp (-30.0 / (2.0 * 2.303));# % -30 dB point of filter + + all_f0=np.zeros((num_crit,)) + crit_filter=np.zeros((num_crit,int(n_fftby2))) + j = np.arange(0,n_fftby2) + + + for i in range(num_crit): + f0 = (cent_freq[i] / max_freq) * (n_fftby2) + all_f0[i] = np.floor(f0); + bw = (bandwidth[i] / max_freq) * (n_fftby2); + norm_factor = np.log(bw_min) - np.log(bandwidth[i]); + crit_filter[i,:] = np.exp (-11 *(((j - np.floor(f0))/bw)**2) + norm_factor) + crit_filter[i,:] = crit_filter[i,:]*(crit_filter[i,:] > min_factor) + + num_frames = len(cleanSig)/skiprate-(winlength/skiprate)# number of frames + start = 1 # starting sample + #window = 0.5*(1 - cos(2*pi*(1:winlength).T/(winlength+1))); + + + hannWin=0.5*(1-np.cos(2*np.pi*np.arange(1,winlength+1)/(winlength+1))) + f,t,Zxx=stft(cleanSig[0:int(num_frames)*skiprate+int(winlength-skiprate)], fs=fs, window=hannWin, nperseg=winlength, noverlap=winlength-skiprate, nfft=n_fft, detrend=False, return_onesided=True, boundary=None, padded=False) + clean_spec=np.abs(Zxx) + clean_spec=clean_spec[:-1,:] + clean_spec=(clean_spec/clean_spec.sum(0)) + f,t,Zxx=stft(enhancedSig[0:int(num_frames)*skiprate+int(winlength-skiprate)], fs=fs, window=hannWin, nperseg=winlength, noverlap=winlength-skiprate, nfft=n_fft, detrend=False, return_onesided=True, boundary=None, padded=False) + enh_spec=np.abs(Zxx) + enh_spec=enh_spec[:-1,:] + enh_spec=(enh_spec/enh_spec.sum(0)) + + clean_energy=(crit_filter.dot(clean_spec)) + processed_energy=(crit_filter.dot(enh_spec)) + error_energy=np.power(clean_energy-processed_energy,2) + error_energy[error_energy35]=35 + + return np.mean(distortion) +@jit +def lpcoeff(speech_frame, model_order): + eps=np.finfo(np.float64).eps + # ---------------------------------------------------------- + # (1) Compute Autocorrelation Lags + # ---------------------------------------------------------- + winlength = max(speech_frame.shape) + R = np.zeros((model_order+1,)) + for k in range(model_order+1): + if k==0: + R[k]=np.sum(speech_frame[0:]*speech_frame[0:]) + else: + R[k]=np.sum(speech_frame[0:-k]*speech_frame[k:]) + + + #R=scipy.signal.correlate(speech_frame,speech_frame) + #R=R[len(speech_frame)-1:len(speech_frame)+model_order] + # ---------------------------------------------------------- + # (2) Levinson-Durbin + # ---------------------------------------------------------- + a = np.ones((model_order,)) + a_past = np.ones((model_order,)) + rcoeff = np.zeros((model_order,)) + E = np.zeros((model_order+1,)) + + E[0]=R[0] + + for i in range(0,model_order): + a_past[0:i] = a[0:i] + + sum_term = np.sum(a_past[0:i]*R[i:0:-1]) + + if E[i]==0.0: # prevents zero division error, numba doesn't allow try/except statements + rcoeff[i]= np.inf + else: + rcoeff[i]=(R[i+1] - sum_term) / (E[i]) + + a[i]=rcoeff[i] + #if i==0: + # a[0:i] = a_past[0:i] - rcoeff[i]*np.array([]) + #else: + if i>0: + a[0:i] = a_past[0:i] - rcoeff[i]*a_past[i-1::-1] + + E[i+1]=(1-rcoeff[i]*rcoeff[i])*E[i] + + acorr = R; + refcoeff = rcoeff; + lpparams = np.ones((model_order+1,)) + lpparams[1:] = -a + return(lpparams,R) + +def llr(clean_speech, processed_speech, fs, used_for_composite=False, frameLen=0.03, overlap=0.75): + eps=np.finfo(np.float64).eps + alpha = 0.95 + winlength = round(frameLen*fs) #window length in samples + skiprate = int(np.floor((1-overlap)*frameLen*fs)) #window skip in samples + if fs<10000: + P = 10 # LPC Analysis Order + else: + P = 16 # this could vary depending on sampling frequency. + + hannWin=0.5*(1-np.cos(2*np.pi*np.arange(1,winlength+1)/(winlength+1))) + clean_speech_framed=extract_overlapped_windows(clean_speech+eps,winlength,winlength-skiprate,hannWin) + processed_speech_framed=extract_overlapped_windows(processed_speech+eps,winlength,winlength-skiprate,hannWin) + numFrames=clean_speech_framed.shape[0] + numerators = np.zeros((numFrames-1,)) + denominators = np.zeros((numFrames-1,)) + + for ii in range(numFrames-1): + A_clean,R_clean=lpcoeff(clean_speech_framed[ii,:],P) + A_proc,R_proc=lpcoeff(processed_speech_framed[ii,:],P) + + numerators[ii]=A_proc.dot(toeplitz(R_clean).dot(A_proc.T)) + denominators[ii]=A_clean.dot(toeplitz(R_clean).dot(A_clean.T)) + + + frac=numerators/(denominators) + frac[np.isnan(frac)]=np.inf + frac[frac<=0]=1000 + distortion = np.log(frac) + if not used_for_composite: + distortion[distortion>2]=2 # this line is not in composite measure but in llr matlab implementation of loizou + distortion = np.sort(distortion) + distortion = distortion[:int(round(len(distortion)*alpha))] + return np.mean(distortion) + + +@jit +def find_loc_peaks(slope,energy): + num_crit = len(energy) + + loc_peaks=np.zeros_like(slope) + + for ii in range(len(slope)): + n=ii + if slope[ii]>0: + while ((n 0)): + n=n+1 + loc_peaks[ii]=energy[n-1] + else: + while ((n>=0) and (slope[n] <= 0)): + n=n-1 + loc_peaks[ii]=energy[n+1] + + return loc_peaks + + + +def wss(clean_speech, processed_speech, fs, frameLen=0.03, overlap=0.75): + + Kmax = 20 # value suggested by Klatt, pg 1280 + Klocmax = 1 # value suggested by Klatt, pg 1280 + alpha = 0.95 + if clean_speech.shape!=processed_speech.shape: + raise ValueError('The two signals do not match!') + eps=np.finfo(np.float64).eps + clean_speech=clean_speech.astype(np.float64)+eps + processed_speech=processed_speech.astype(np.float64)+eps + winlength = round(frameLen*fs) #window length in samples + skiprate = int(np.floor((1-overlap)*frameLen*fs)) #window skip in samples + max_freq = fs/2 #maximum bandwidth + num_crit = 25# number of critical bands + n_fft = 2**np.ceil(np.log2(2*winlength)) + n_fftby2 = int(n_fft/2) + + cent_freq=np.zeros((num_crit,)) + bandwidth=np.zeros((num_crit,)) + + cent_freq[0] = 50.0000; bandwidth[0] = 70.0000; + cent_freq[1] = 120.000; bandwidth[1] = 70.0000; + cent_freq[2] = 190.000; bandwidth[2] = 70.0000; + cent_freq[3] = 260.000; bandwidth[3] = 70.0000; + cent_freq[4] = 330.000; bandwidth[4] = 70.0000; + cent_freq[5] = 400.000; bandwidth[5] = 70.0000; + cent_freq[6] = 470.000; bandwidth[6] = 70.0000; + cent_freq[7] = 540.000; bandwidth[7] = 77.3724; + cent_freq[8] = 617.372; bandwidth[8] = 86.0056; + cent_freq[9] = 703.378; bandwidth[9] = 95.3398; + cent_freq[10] = 798.717; bandwidth[10] = 105.411; + cent_freq[11] = 904.128; bandwidth[11] = 116.256; + cent_freq[12] = 1020.38; bandwidth[12] = 127.914; + cent_freq[13] = 1148.30; bandwidth[13] = 140.423; + cent_freq[14] = 1288.72; bandwidth[14] = 153.823; + cent_freq[15] = 1442.54; bandwidth[15] = 168.154; + cent_freq[16] = 1610.70; bandwidth[16] = 183.457; + cent_freq[17] = 1794.16; bandwidth[17] = 199.776; + cent_freq[18] = 1993.93; bandwidth[18] = 217.153; + cent_freq[19] = 2211.08; bandwidth[19] = 235.631; + cent_freq[20] = 2446.71; bandwidth[20] = 255.255; + cent_freq[21] = 2701.97; bandwidth[21] = 276.072; + cent_freq[22] = 2978.04; bandwidth[22] = 298.126; + cent_freq[23] = 3276.17; bandwidth[23] = 321.465; + cent_freq[24] = 3597.63; bandwidth[24] = 346.136; + + + W=np.array([0.003,0.003,0.003,0.007,0.010,0.016,0.016,0.017,0.017,0.022,0.027,0.028,0.030,0.032,0.034,0.035,0.037,0.036,0.036,0.033,0.030,0.029,0.027,0.026, + 0.026]) + + bw_min=bandwidth[0] + min_factor = np.exp (-30.0 / (2.0 * 2.303));# % -30 dB point of filter + + all_f0=np.zeros((num_crit,)) + crit_filter=np.zeros((num_crit,int(n_fftby2))) + j = np.arange(0,n_fftby2) + + + for i in range(num_crit): + f0 = (cent_freq[i] / max_freq) * (n_fftby2) + all_f0[i] = np.floor(f0); + bw = (bandwidth[i] / max_freq) * (n_fftby2); + norm_factor = np.log(bw_min) - np.log(bandwidth[i]); + crit_filter[i,:] = np.exp (-11 *(((j - np.floor(f0))/bw)**2) + norm_factor) + crit_filter[i,:] = crit_filter[i,:]*(crit_filter[i,:] > min_factor) + + num_frames = len(clean_speech)/skiprate-(winlength/skiprate)# number of frames + start = 1 # starting sample + + hannWin=0.5*(1-np.cos(2*np.pi*np.arange(1,winlength+1)/(winlength+1))) + scale = np.sqrt(1.0 / hannWin.sum()**2) + + f,t,Zxx=stft(clean_speech[0:int(num_frames)*skiprate+int(winlength-skiprate)], fs=fs, window=hannWin, nperseg=winlength, noverlap=winlength-skiprate, nfft=n_fft, detrend=False, return_onesided=True, boundary=None, padded=False) + clean_spec=np.power(np.abs(Zxx)/scale,2) + clean_spec=clean_spec[:-1,:] + + f,t,Zxx=stft(processed_speech[0:int(num_frames)*skiprate+int(winlength-skiprate)], fs=fs, window=hannWin, nperseg=winlength, noverlap=winlength-skiprate, nfft=n_fft, detrend=False, return_onesided=True, boundary=None, padded=False) + proc_spec=np.power(np.abs(Zxx)/scale,2) + proc_spec=proc_spec[:-1,:] + + clean_energy=(crit_filter.dot(clean_spec)) + log_clean_energy=10*np.log10(clean_energy) + log_clean_energy[log_clean_energy<-100]=-100 + proc_energy=(crit_filter.dot(proc_spec)) + log_proc_energy=10*np.log10(proc_energy) + log_proc_energy[log_proc_energy<-100]=-100 + + log_clean_energy_slope=np.diff(log_clean_energy,axis=0) + log_proc_energy_slope=np.diff(log_proc_energy,axis=0) + + dBMax_clean = np.max(log_clean_energy,axis=0) + dBMax_processed = np.max(log_proc_energy,axis=0) + + numFrames=log_clean_energy_slope.shape[-1] + + clean_loc_peaks=np.zeros_like(log_clean_energy_slope) + proc_loc_peaks=np.zeros_like(log_proc_energy_slope) + for ii in range(numFrames): + clean_loc_peaks[:,ii]=find_loc_peaks(log_clean_energy_slope[:,ii],log_clean_energy[:,ii]) + proc_loc_peaks[:,ii]=find_loc_peaks(log_proc_energy_slope[:,ii],log_proc_energy[:,ii]) + + + Wmax_clean = Kmax / (Kmax + dBMax_clean - log_clean_energy[:-1,:]) + Wlocmax_clean = Klocmax / ( Klocmax + clean_loc_peaks - log_clean_energy[:-1,:]) + W_clean = Wmax_clean * Wlocmax_clean + + Wmax_proc = Kmax / (Kmax + dBMax_processed - log_proc_energy[:-1]) + Wlocmax_proc = Klocmax / ( Klocmax + proc_loc_peaks - log_proc_energy[:-1,:]) + W_proc = Wmax_proc * Wlocmax_proc + + W = (W_clean + W_proc)/2.0 + + distortion=np.sum(W*(log_clean_energy_slope- log_proc_energy_slope)**2,axis=0) + distortion=distortion/np.sum(W,axis=0) + distortion = np.sort(distortion) + distortion = distortion[:int(round(len(distortion)*alpha))] + return np.mean(distortion) + +def pesq(clean_speech, processed_speech, fs, mode=None): + if (fs != 8000) and (fs != 16000): + raise ValueError('fs must be either 8 kHz or 16 kHz') + + if mode is None: + if fs == 8000: + mode = 'nb' + elif fs == 16000: + mode = 'wb' + + if mode == 'nb': + mos_lqo = pypesq.pesq(fs,clean_speech, processed_speech, mode) + pesq_mos = 46607/14945 - (2000*np.log(1/(mos_lqo/4 - 999/4000) - 1))/2989#0.999 + ( 4.999-0.999 ) / ( 1+np.exp(-1.4945*pesq_mos+4.6607) ) + elif mode == 'wb': + mos_lqo = pypesq.pesq(fs,clean_speech, processed_speech, mode) + pesq_mos = np.NaN + else: + print('mode should be "nb" for narrowband or "wb" for wideband') + + return pesq_mos,mos_lqo + + +def composite(clean_speech, processed_speech, fs): + wss_dist=wss(clean_speech, processed_speech, fs) + llr_mean=llr(clean_speech, processed_speech, fs,used_for_composite=True) + segSNR=SNRseg(clean_speech, processed_speech, fs) + pesq_mos,mos_lqo = pesq(clean_speech, processed_speech,fs) + + if fs >= 16e3: + used_pesq_val = mos_lqo + else: + used_pesq_val = pesq_mos + + Csig = 3.093 - 1.029*llr_mean + 0.603*used_pesq_val-0.009*wss_dist + Csig = np.max((1,Csig)) + Csig = np.min((5, Csig)) # limit values to [1, 5] + Cbak = 1.634 + 0.478 *used_pesq_val - 0.007*wss_dist + 0.063*segSNR + Cbak = np.max((1, Cbak)) + Cbak = np.min((5,Cbak)) # limit values to [1, 5] + Covl = 1.594 + 0.805*used_pesq_val - 0.512*llr_mean - 0.007*wss_dist + Covl = np.max((1, Covl)) + Covl = np.min((5, Covl)) # limit values to [1, 5] + return Csig,Cbak,Covl + +@jit +def lpc2cep(a): + # + # converts prediction to cepstrum coefficients + # + # Author: Philipos C. Loizou + + M=len(a); + cep=np.zeros((M-1,)); + + cep[0]=-a[1] + + for k in range(2,M): + ix=np.arange(1,k) + vec1=cep[ix-1]*a[k-1:0:-1]*(ix) + cep[k-1]=-(a[k]+np.sum(vec1)/k); + return cep + + +def cepstrum_distance(clean_speech, processed_speech, fs, frameLen=0.03, overlap=0.75): + + + clean_length = len(clean_speech) + processed_length = len(processed_speech) + + winlength = round(frameLen*fs) #window length in samples + skiprate = int(np.floor((1-overlap)*frameLen*fs)) #window skip in samples + + if fs<10000: + P = 10 # LPC Analysis Order + else: + P=16; # this could vary depending on sampling frequency. + + C=10*np.sqrt(2)/np.log(10) + + numFrames = int(clean_length/skiprate-(winlength/skiprate)); # number of frames + + hannWin=0.5*(1-np.cos(2*np.pi*np.arange(1,winlength+1)/(winlength+1))) + clean_speech_framed=extract_overlapped_windows(clean_speech[0:int(numFrames)*skiprate+int(winlength-skiprate)],winlength,winlength-skiprate,hannWin) + processed_speech_framed=extract_overlapped_windows(processed_speech[0:int(numFrames)*skiprate+int(winlength-skiprate)],winlength,winlength-skiprate,hannWin) + distortion = np.zeros((numFrames,)) + + for ii in range(numFrames): + A_clean,R_clean=lpcoeff(clean_speech_framed[ii,:],P) + A_proc,R_proc=lpcoeff(processed_speech_framed[ii,:],P) + + C_clean=lpc2cep(A_clean) + C_processed=lpc2cep(A_proc) + distortion[ii] = min((10,C*np.linalg.norm(C_clean-C_processed))) + + IS_dist = distortion + alpha=0.95 + IS_len= round( len( IS_dist)* alpha) + IS = np.sort(IS_dist) + cep_mean= np.mean( IS[ 0: IS_len]) + return cep_mean diff --git a/pysepm/reverberationMeasures.py b/pysepm/reverberationMeasures.py new file mode 100755 index 0000000..6a400c6 --- /dev/null +++ b/pysepm/reverberationMeasures.py @@ -0,0 +1,117 @@ +from scipy.signal import resample,stft +import scipy +#import srmrpy #https://github.com/jfsantos/SRMRpy +import numpy as np +from .qualityMeasures import SNRseg + +def srr_seg(clean_speech, processed_speech,fs): + return SNRseg(clean_speech, processed_speech,fs) + + +#def srmr(speech,fs, n_cochlear_filters=23, low_freq=125, min_cf=4, max_cf=128, fast=False, norm=False): +# if fs == 8000: +# srmRatio,energy=srmrpy.srmr(speech, fs, n_cochlear_filters=n_cochlear_filters, low_freq=low_freq, min_cf=min_cf, max_cf=max_cf, fast=fast, norm=norm) +# return srmRatio +# +# elif fs == 16000: +# srmRatio,energy=srmrpy.srmr(speech, fs, n_cochlear_filters=n_cochlear_filters, low_freq=low_freq, min_cf=min_cf, max_cf=max_cf, fast=fast, norm=norm) +# return srmRatio +# else: +# numSamples=round(len(speech)/fs*16000) +# fs = 16000 +# srmRatio,energy=srmrpy.srmr(resample(speech, numSamples), fs, n_cochlear_filters=n_cochlear_filters, low_freq=low_freq, min_cf=min_cf, max_cf=max_cf, fast=fast, norm=norm) +# return srmRatio + + +def hz_to_bark(freqs_hz): + freqs_hz = np.asanyarray([freqs_hz]) + barks = (26.81*freqs_hz)/(1960+freqs_hz)-0.53 + barks[barks<2]=barks[barks<2]+0.15*(2-barks[barks<2]) + barks[barks>20.1]=barks[barks>20.1]+0.22*(barks[barks>20.1]-20.1) + return np.squeeze(barks) + +def bark_to_hz(barks): + barks = barks.copy() + barks = np.asanyarray([barks]) + barks[barks<2]=(barks[barks<2]-0.3)/0.85 + barks[barks>20.1]=(barks[barks>20.1]+4.422)/1.22 + freqs_hz = 1960 * (barks+0.53)/(26.28-barks) + return np.squeeze(freqs_hz) + +def bark_frequencies(n_barks=128, fmin=0.0, fmax=11025.0): + # 'Center freqs' of bark bands - uniformly spaced between limits + min_bark = hz_to_bark(fmin) + max_bark = hz_to_bark(fmax) + + barks = np.linspace(min_bark, max_bark, n_barks) + + return bark_to_hz(barks) + +def barks(fs, n_fft, n_barks=128, fmin=0.0, fmax=None, norm='area', dtype=np.float32): + + if fmax is None: + fmax = float(fs) / 2 + + + # Initialize the weights + n_barks = int(n_barks) + weights = np.zeros((n_barks, int(1 + n_fft // 2)), dtype=dtype) + + # Center freqs of each FFT bin + fftfreqs = np.linspace(0,float(fs) / 2,int(1 + n_fft//2), endpoint=True) + + # 'Center freqs' of mel bands - uniformly spaced between limits + bark_f = bark_frequencies(n_barks + 2, fmin=fmin, fmax=fmax) + + fdiff = np.diff(bark_f) + ramps = np.subtract.outer(bark_f, fftfreqs) + + for i in range(n_barks): + # lower and upper slopes for all bins + lower = -ramps[i] / fdiff[i] + upper = ramps[i+2] / fdiff[i+1] + + # .. then intersect them with each other and zero + weights[i] = np.maximum(0, np.minimum(lower, upper)) + + if norm in (1, 'area'): + weightsPerBand=np.sum(weights,1); + for i in range(weights.shape[0]): + weights[i,:]=weights[i,:]/weightsPerBand[i] + return weights + +def bsd(clean_speech, processed_speech, fs, frameLen=0.03, overlap=0.75): + + pre_emphasis_coeff = 0.95 + b = np.array([1]) + a = np.array([1,pre_emphasis_coeff]) + clean_speech = scipy.signal.lfilter(b,a,clean_speech) + processed_speech = scipy.signal.lfilter(b,a,processed_speech) + + winlength = round(frameLen*fs) #window length in samples + skiprate = int(np.floor((1-overlap)*frameLen*fs)) #window skip in samples + max_freq = fs/2 #maximum bandwidth + n_fft = 2**np.ceil(np.log2(2*winlength)) + n_fftby2 = int(n_fft/2) + num_frames = len(clean_speech)/skiprate-(winlength/skiprate)# number of frames + + hannWin=scipy.signal.windows.hann(winlength)#0.5*(1-np.cos(2*np.pi*np.arange(1,winlength+1)/(winlength+1))) + f,t,Zxx=stft(clean_speech[0:int(num_frames)*skiprate+int(winlength-skiprate)], fs=fs, window=hannWin, nperseg=winlength, noverlap=winlength-skiprate, nfft=n_fft, detrend=False, return_onesided=True, boundary=None, padded=False) + clean_power_spec=np.square(np.sum(hannWin)*np.abs(Zxx)) + f,t,Zxx=stft(processed_speech[0:int(num_frames)*skiprate+int(winlength-skiprate)], fs=fs, window=hannWin, nperseg=winlength, noverlap=winlength-skiprate, nfft=n_fft, detrend=False, return_onesided=True, boundary=None, padded=False) + enh_power_spec=np.square(np.sum(hannWin)*np.abs(Zxx)) + + bark_filt = barks(fs, n_fft, n_barks=32) + clean_power_spec_bark= np.dot(bark_filt,clean_power_spec) + enh_power_spec_bark= np.dot(bark_filt,enh_power_spec) + + clean_power_spec_bark_2=np.square(clean_power_spec_bark) + diff_power_spec_2 = np.square(clean_power_spec_bark-enh_power_spec_bark) + + bsd = np.mean(np.sum(diff_power_spec_2,axis=0)/np.sum(clean_power_spec_bark_2,axis=0)) + return bsd + + + + + diff --git a/pysepm/util.py b/pysepm/util.py new file mode 100755 index 0000000..4da85c3 --- /dev/null +++ b/pysepm/util.py @@ -0,0 +1,56 @@ +import numpy as np +from scipy.signal import firls,upfirdn +from scipy.signal.windows import kaiser +from fractions import Fraction + +def extract_overlapped_windows(x,nperseg,noverlap,window=None): + # source: https://github.com/scipy/scipy/blob/v1.2.1/scipy/signal/spectral.py + step = nperseg - noverlap + shape = x.shape[:-1]+((x.shape[-1]-noverlap)//step, nperseg) + strides = x.strides[:-1]+(step*x.strides[-1], x.strides[-1]) + result = np.lib.stride_tricks.as_strided(x, shape=shape, + strides=strides) + if window is not None: + result = window * result + return result + +def resample_matlab_like(x_orig,p,q): + if len(x_orig.shape)>2: + raise ValueError('x must be a vector or 2d matrix') + + if x_orig.shape[0]