diff --git a/contrib/audio/Spoken_Digit_Recognition/MNIST_Speech.bson b/contrib/audio/Spoken_Digit_Recognition/MNIST_Speech.bson
new file mode 100644
index 000000000..2e6d9088d
Binary files /dev/null and b/contrib/audio/Spoken_Digit_Recognition/MNIST_Speech.bson differ
diff --git a/contrib/audio/Spoken_Digit_Recognition/Readme.md b/contrib/audio/Spoken_Digit_Recognition/Readme.md
new file mode 100644
index 000000000..bf20322bd
--- /dev/null
+++ b/contrib/audio/Spoken_Digit_Recognition/Readme.md
@@ -0,0 +1,25 @@
+# Spoken Digit Recognition
+This is an implementation of Spoken Digit Recognition by applying Convolutional Neural Networks using Spectrogram values of WAV audio files.
+The [Spoken_Digit_Dataset](https://www.kaggle.com/divyanshu99/spoken-digit-dataset/download) from Kaggle is used for training.
+
+## Steps for loading the data
+First Download the data from the given link : https://www.kaggle.com/divyanshu99/spoken-digit-dataset/download
+After downloading the data to your workspace, extract it and rename the extracted folder to Spoken_Digit, so that the audio files are located in Spoken_Digit/recordings. Now, the data is ready to be processed.
+
+## About the Dataset
+Dataset consists of:
+- 4 speakers
+- 2,000 recordings (50 of each digit per speaker) at 8KHz frequency
+- Digits from 0 to 9
+- English pronunciations
+
+## About the Model
+In the model, spectrogram values of the WAV audio files are obtained and, after proper normalisation, used to train the model. The model consists of two convolution layers, each followed by MaxPool and BatchNorm, and two Dense layers.
+
+## Test Accuracy
+The test data is a 15% split of the total available dataset. A test accuracy of 92.33% was achieved after 20 iterations. The model with the best performance was saved as MNIST_Speech.bson
+
+## References
+
+ https://github.com/FluxML/model-zoo/blob/master/vision/mnist/conv.jl
+
diff --git a/contrib/audio/Spoken_Digit_Recognition/Spoken_Digit/Readme.md b/contrib/audio/Spoken_Digit_Recognition/Spoken_Digit/Readme.md
new file mode 100644
index 000000000..175d57fea
--- /dev/null
+++ b/contrib/audio/Spoken_Digit_Recognition/Spoken_Digit/Readme.md
@@ -0,0 +1 @@
+Here, the recordings folder is to be kept after unzipping the dataset file downloaded using the provided link.
diff --git a/contrib/audio/Spoken_Digit_Recognition/Src.jl b/contrib/audio/Spoken_Digit_Recognition/Src.jl
new file mode 100644
index 000000000..b5dd17b52
--- /dev/null
+++ b/contrib/audio/Spoken_Digit_Recognition/Src.jl
@@ -0,0 +1,166 @@
+using DSP,WAV
+using PyCall
+using PyPlot
+using MFCC
+using FFTW
+using Flux
+using Printf,BSON
+using Flux: onehotbatch, onecold, crossentropy, throttle, Conv,relu
+using Base.Iterators: partition
+using StatsBase
+using MLLabelUtils,MLDataPattern
+IpY = pyimport("IPython")
+using Images
+
+# Load every recording; the label is taken from the first character of each file name
+cd(@__DIR__) # Or cd into the directory that contains the unzipped Spoken_Digit folder
+A = readdir("./Spoken_Digit/recordings")
+cd("./Spoken_Digit/recordings")
+X = []
+X_fs = []
+Y = []
+for file in A
+    s,fs = wavread(file) # Samples and sampling rate of one recording
+    push!(X,s)
+    push!(X_fs,fs)
+    push!(Y,Int(file[1]-'0')) # Leading character '0'..'9' -> integer label 0..9
+end
+cd("./../../") # Go back up to the directory we started from
+
+# Turn each recording into a spectrogram "image": the power matrix of its first sample column
+imgs = []
+for signal in X
+    spec = spectrogram(signal[:,1])
+    push!(imgs, spec.power)
+end
+
+labels = Y;
+# Shuffle spectrograms and labels jointly before the minibatches are formed
+imgs_,labels_ = shuffleobs((imgs,labels));
+
+# Normalise each spectrogram (dims=2). NOTE(review): runs after the shuffle; imgs_ only sees it if shuffleobs returns views - confirm
+for k in eachindex(imgs)
+    imgs[k] = Flux.normalise(imgs[k],dims=2)
+end
+
+# Use 85% of the data (the first 1700 of the 2000 shuffled observations) as train data; 1701:2000 is the test data
+train_X,train_Y = imgs_[1:1700],labels_[1:1700]
+
+# Spectrograms of different recordings differ in size, so they are all resized to one common size
+img_size = (256,32)
+m,n = img_size
+
+# Build one minibatch: resized Float32 images stacked along dim 4, plus one-hot labels for classes 0:9
+function make_minibatch(X,Y,idxs)
+    images = Array{Float32}(undef, img_size..., 1, length(idxs))
+    for (slot, idx) in enumerate(idxs)
+        # Bring the observation to the common img_size before storing it in its slot
+        images[:, :, :, slot] = Float32.(imresize(X[idx], img_size...))
+    end
+    targets = onehotbatch(Y[idxs], 0:9)
+    return (images, targets)
+end
+
+
+# Split the training data into minibatches of consecutive indices
+mb_indices = [] # One index list per minibatch
+batch_size = 32
+for first_idx in 1:batch_size:length(train_Y)-1
+    # Each list holds batch_size consecutive indices starting at first_idx
+    push!(mb_indices, Any[j for j in first_idx:first_idx+batch_size-1])
+end
+# The last index list is left out below: it can run past the end of the training data
+train_set = [
+    make_minibatch(train_X,train_Y,batch) for batch in mb_indices[1:end-1]
+];
+
+# Test data: shuffled observations 1701 to 2000, evaluated as one single batch
+batch_size=300
+# Indices of the test observations
+ind = Any[i for i in 1701:2000]
+test_set = [
+    make_minibatch(imgs_,labels_,ind)
+];
+
+@info("Constructing model...")
+model = Chain(
+ # First convolution, operating upon a m*n image (1 input channel => 64 feature maps)
+ Conv((3, 3), 1=>64, pad=(1,1), relu),
+ MaxPool((2,2)),
+ BatchNorm(64,relu),
+
+ # Second convolution, operating upon a m/2*n/2 image (64 => 32 feature maps)
+ Conv((3, 3), 64=>32, pad=(1,1), relu),
+ MaxPool((2,2)),
+ BatchNorm(32,relu),
+ Dropout(0.10),
+
+ # Flatten the 4d tensor into a 2d one, at this point it should be (m/4,n/4,32, N)
+ # which gives the (m/4)*(n/4)*32 inputs of the `Dense` layer below (16384 for m,n = 256,32):
+ x -> reshape(x, :, size(x, 4)),
+ Dense(Int(floor(m/4)*floor(n/4)*32), 128,relu),
+
+ Dense(128,10),
+
+ # Finally, softmax to turn the 10 outputs into class probabilities
+ softmax,
+)
+
+function loss(x, y)
+    # Light augmentation: perturb the input with Gaussian noise scaled by 0.1
+    noise = randn(eltype(x), size(x))
+    noisy_x = x .+ 0.1f0*noise
+    # Cross entropy between the model output for the noisy input and the targets
+    return crossentropy(model(noisy_x), y)
+end
+# Accuracy: fraction of samples whose predicted class (onecold of the model output) equals the label class
+accuracy(x, y) = mean(onecold(y) .== onecold(model(x)))
+
+# Train the model, saving it whenever the test accuracy matches or beats the best seen so far
+opt = ADAM(0.001)
+epochs = 15
+
+@info("Beginning training loop...")
+best_acc = 0.0
+last_improvement = 0
+for epoch_idx in 1:epochs
+    global best_acc, last_improvement
+    # Train for a single epoch
+    Flux.train!(loss, params(model), train_set, opt)
+    x,y = train_set[1] # The reported loss is computed on the first minibatch only
+    print("Epoch[$epoch_idx]: Train_Loss: ",loss(x,y),"\n")
+
+    # Calculate accuracy on the test batch:
+    acc = accuracy(test_set[1]...)
+    @info(@sprintf("[%d]: Test accuracy: %.4f", epoch_idx, acc))
+
+    # Save before the early-exit check, so a model that reaches the target accuracy is stored too
+    if acc >= best_acc
+        @info(" -> New best accuracy! Saving model out to MNIST_Speech.bson") #Here, model is saved as MNIST_Speech.bson
+        BSON.@save joinpath(dirname(@__FILE__), "./MNIST_Speech.bson") model epoch_idx acc
+        best_acc = acc
+        last_improvement = epoch_idx
+    end
+
+    # If our accuracy is good enough, quit out.
+    if acc >= 0.95
+        @info(" -> Early-exiting: We reached our target accuracy of 95.0%")
+        break
+    end
+
+    # If we haven't seen improvement in 5 epochs, drop our learning rate:
+    if epoch_idx - last_improvement >= 5 && opt.eta > 1e-4
+        opt.eta /= 10.0
+        @warn(" -> Haven't improved in a while, dropping learning rate to $(opt.eta)!")
+
+        # After dropping learning rate, give it a few epochs to improve
+        last_improvement = epoch_idx
+    end
+    # Stop once 10 epochs have passed since last_improvement was last updated
+    if epoch_idx - last_improvement >= 10
+        @warn(" -> We're calling this converged.")
+        break
+    end
+end
+
+
diff --git a/contrib/audio/Spoken_Digit_Recognition/Test_Reproducability.ipynb b/contrib/audio/Spoken_Digit_Recognition/Test_Reproducability.ipynb
new file mode 100644
index 000000000..91ebee3b0
--- /dev/null
+++ b/contrib/audio/Spoken_Digit_Recognition/Test_Reproducability.ipynb
@@ -0,0 +1,303 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Testing Model Reproducibility\n",
+ "Test whether the results are reproducible or not using the same dataset and trained model weights"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "using DSP,WAV\n",
+ "using PyCall\n",
+ "using PyPlot\n",
+ "using MFCC\n",
+ "using FFTW\n",
+ "using Flux\n",
+ "using Printf,BSON\n",
+ "using Flux: onehotbatch, onecold, crossentropy, throttle, Conv,relu\n",
+ "using Base.Iterators: partition\n",
+ "using StatsBase\n",
+ "using MLLabelUtils,MLDataPattern\n",
+ "IpY = pyimport(\"IPython\")\n",
+ "using Images,MLBase\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Loading the data\n",
+ "We will be testing our model on the same dataset, though different audio data can also be tried."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cd(\"./Downloads\") # Or the path of folder containing the dataset Folder(Spoken_Digit) after extraction\n",
+ "A = readdir(\"./Spoken_Digit/recordings\")\n",
+ "cd(\"./Spoken_Digit/recordings\")\n",
+ "X = []\n",
+ "X_fs = []\n",
+ "Y = []\n",
+ "for i in 1:length(A)\n",
+ " s,fs = wavread(A[i])\n",
+ " push!(X,s)\n",
+ " push!(X_fs,fs)\n",
+ " push!(Y,Int(A[i][1]-'0'))\n",
+ "end\n",
+ "cd(\"./../../\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Data Preprocessing\n",
+ "Similar to training data we will change audio files to spectrogram, normalise the data and then use that as test set. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "imgs = []\n",
+ "for i in 1:length(X)\n",
+ " b = spectrogram(X[i][:,1])\n",
+ " push!(imgs,b.power)\n",
+ "end\n",
+ "labels = Y;\n",
+ "for i in 1:length(imgs)\n",
+ " imgs[i] = Flux.normalise(imgs[i],dims=2)\n",
+ "end\n",
+ "\n",
+ "imgs_,labels_ = shuffleobs((imgs,labels));"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Loading the complete data as a single minibatch to test the model performance on the data as a whole"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "make_minibatch (generic function with 1 method)"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "img_size = (256,32)\n",
+ "m,n = img_size\n",
+ "\n",
+ "function make_minibatch(X,Y,idxs)\n",
+ " X_batch = Array{Float32}(undef,(img_size)..., 1, length(idxs))\n",
+ " for i in 1:length(idxs)\n",
+ " img = Float32.(imresize((X[idxs[i]]),(img_size)...))\n",
+ " X_batch[:, :, :, i] = img\n",
+ " end\n",
+ " Y_batch = onehotbatch(Y[idxs], 0:9)\n",
+ " return (X_batch, Y_batch)\n",
+ "end"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Take any random small chunk from the dataset for testing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mb_indices = [] \n",
+ "ind = []\n",
+ "for i in 500:800\n",
+ " push!(ind,i)\n",
+ "end\n",
+ "test_set = [make_minibatch(imgs_,labels_,ind)];"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "accuracy (generic function with 1 method)"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "accuracy(x, y) = mean(onecold(model(x)) .== onecold(y))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Model Performance Testing of Trained Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Model evaluation on Training Data\n",
+ "BSON.@load \"./MNIST_Speech.bson\" model\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.9833887043189369"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "accuracy(test_set[1]...)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "10×301 Array{Float32,2}:\n",
+ " 3.8437e-13 1.91665e-9 1.57188e-7 … 1.43238e-6 2.79504e-8 \n",
+ " 4.82582e-9 0.999623 0.000236523 2.06655e-8 0.000121791\n",
+ " 4.39358e-7 5.5426e-9 0.000238135 0.000882037 0.00011656 \n",
+ " 3.30992e-11 7.50751e-9 6.39884e-6 2.86941e-5 3.69492e-5 \n",
+ " 0.999999 1.28694e-6 0.995579 0.000281947 0.999325 \n",
+ " 9.67511e-7 8.0464e-5 0.00353203 … 8.95394e-9 0.000340723\n",
+ " 3.90226e-10 1.40138e-9 0.000303763 0.998564 5.52701e-5 \n",
+ " 4.46966e-11 7.38176e-8 2.74527e-5 8.30553e-8 2.76701e-6 \n",
+ " 2.86833e-11 1.60256e-9 6.0581e-5 0.000241771 8.11351e-8 \n",
+ " 2.36902e-11 0.000294738 1.61166e-5 3.64062e-10 6.61961e-7 "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y_label = model(test_set[1][1])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Confusion Matrix:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "10×10 Array{Int64,2}:\n",
+ " 28 0 0 0 0 0 0 0 0 0\n",
+ " 0 32 0 0 0 0 0 0 0 0\n",
+ " 0 0 24 0 0 0 0 0 0 0\n",
+ " 0 0 0 27 0 0 0 0 0 0\n",
+ " 0 0 0 0 35 0 0 0 0 0\n",
+ " 0 0 0 0 0 28 0 0 0 1\n",
+ " 0 0 0 0 0 0 26 0 0 0\n",
+ " 0 0 0 0 0 0 0 27 0 0\n",
+ " 0 0 0 1 1 0 1 0 32 0\n",
+ " 0 0 0 0 0 0 0 1 0 37"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y_true = onecold(test_set[1][2])\n",
+ "y_pred = onecold(y_label)\n",
+ "\n",
+ "MLBase.confusmat(10,y_true,y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "name": "Untitled.ipynb",
+ "provenance": [],
+ "version": "0.3.2"
+ },
+ "kernelspec": {
+ "display_name": "Julia 1.3.1",
+ "language": "julia",
+ "name": "julia-1.3"
+ },
+ "language_info": {
+ "file_extension": ".jl",
+ "mimetype": "application/julia",
+ "name": "julia",
+ "version": "1.3.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/contrib/audio/Spoken_Digit_Recognition/Tutorial.ipynb b/contrib/audio/Spoken_Digit_Recognition/Tutorial.ipynb
new file mode 100644
index 000000000..4ce84460f
--- /dev/null
+++ b/contrib/audio/Spoken_Digit_Recognition/Tutorial.ipynb
@@ -0,0 +1,1040 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Spoken Digit Recognition\n",
+ "Let's have a walk through on using Julia for Spoken Digit Recognition. Here, we will use CNN model on the spectrogram values to predict the Digit. \n",
+ "We begin with importing all the important packages"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: CUDAdrv.jl failed to initialize, GPU functionality unavailable (set JULIA_CUDA_SILENT or JULIA_CUDA_VERBOSE to silence or expand this message)\n",
+ "└ @ CUDAdrv /home/blackforest/.julia/packages/CUDAdrv/mCr0O/src/CUDAdrv.jl:69\n",
+ "WARNING: using StatsBase.crossentropy in module Main conflicts with an existing identifier.\n"
+ ]
+ }
+ ],
+ "source": [
+ "using DSP,WAV\n",
+ "using PyCall\n",
+ "using PyPlot\n",
+ "using MFCC\n",
+ "using FFTW\n",
+ "using Flux\n",
+ "using Printf,BSON\n",
+ "using Flux: onehotbatch, onecold, crossentropy, throttle, Conv,relu\n",
+ "using Base.Iterators: partition\n",
+ "using StatsBase\n",
+ "using MLLabelUtils,MLDataPattern\n",
+ "IpY = pyimport(\"IPython\")\n",
+ "using Images\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Loading the Data\n",
+ "Once we have imported the packages, let's load the data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cd(\"./Downloads\")\n",
+ "A = readdir(\"./Spoken_Digit/recordings\")\n",
+ "cd(\"./Spoken_Digit/recordings\")\n",
+ "X = []\n",
+ "X_fs = []\n",
+ "Y = []\n",
+ "for i in 1:length(A)\n",
+ " s,fs = wavread(A[i])\n",
+ " push!(X,s)\n",
+ " push!(X_fs,fs)\n",
+ " push!(Y,Int(A[i][1]-'0'))\n",
+ "end\n",
+ "cd(\"./../../\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analysing the Data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's first listen to what these audio WAV files contain. For this, we will use IPython.display.Audio."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "PyObject "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "IpY = pyimport(\"IPython\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " Your browser does not support the audio element.\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ "PyObject "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Warning: `getindex(o::PyObject, s::AbstractString)` is deprecated in favor of dot overloading (`getproperty`) so elements should now be accessed as e.g. `o.\"s\"` instead of `o[\"s\"]`.\n",
+ "│ caller = show(::IOContext{Base.GenericIOBuffer{Array{UInt8,1}}}, ::MIME{Symbol(\"text/html\")}, ::PyObject) at PyCall.jl:895\n",
+ "└ @ PyCall /home/blackforest/.julia/packages/PyCall/ttONZ/src/PyCall.jl:895\n"
+ ]
+ }
+ ],
+ "source": [
+ "IpY.display.Audio(A[453])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Hmm, that does sound like two. So, similar kind of audio data, we need to classify through our model. Before going into building the model, first pre-analyse the audio files\n",
+ " \n",
+ "First, let's plot our data to see what it looks like. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "Figure(PyObject )"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "1-element Array{PyObject,1}:\n",
+ " PyObject "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "PyPlot.plot(X[1])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "How about an FFT plot?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "Figure(PyObject )"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "1-element Array{PyObject,1}:\n",
+ " PyObject "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "PyPlot.plot(fft(X[1]))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now, we will plot the spectrogram of the audio file zero"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "Figure(PyObject )"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "([5.921396503211639e-11 6.144594757755415e-10 … 8.451382701637563e-12 1.756504456288814e-12; 1.8568523198236493e-9 1.0536784714698365e-8 … 2.8469931152361586e-9 5.448721894513743e-10; … ; 1.9158576126187585e-11 1.5013662229371582e-10 … 5.123048834666636e-12 1.0419974773609225e-11; 4.380290940223674e-12 6.62498304638197e-11 … 7.044207901278974e-13 1.4417284840034432e-12], [0.0, 31.25, 62.5, 93.75, 125.0, 156.25, 187.5, 218.75, 250.0, 281.25 … 3718.75, 3750.0, 3781.25, 3812.5, 3843.75, 3875.0, 3906.25, 3937.5, 3968.75, 4000.0], [0.016, 0.032, 0.048, 0.064, 0.08, 0.096, 0.112, 0.128, 0.144, 0.16 … 0.48, 0.496, 0.512, 0.528, 0.544, 0.56, 0.576, 0.592, 0.608, 0.624], PyObject )"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "c = PyPlot.specgram(X[1][:,1],Fs = X_fs[1])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We will use here, spectrogram function of the DSP.jl to calculate the spectrogram values of the given audio files"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "DSP.Periodograms.Spectrogram{Float64,AbstractFFTs.Frequencies{Float64}}([1.1931071772691992e-5 1.2794204112795591e-5 … 8.094883618521913e-6 3.1319986816039646e-7; 2.8106639028433923e-5 5.084930866656072e-5 … 1.2141545106504503e-5 3.206747897719205e-6; … ; 5.1947719534789895e-6 3.854394968184458e-5 … 3.116387268187047e-7 5.890268409838297e-7; 1.0890568868333794e-5 1.4311608044292973e-6 … 8.204829844873038e-8 9.867262443729622e-8], [0.0, 0.0015432098765432098, 0.0030864197530864196, 0.004629629629629629, 0.006172839506172839, 0.007716049382716049, 0.009259259259259259, 0.010802469135802469, 0.012345679012345678, 0.013888888888888888 … 0.4861111111111111, 0.48765432098765427, 0.4891975308641975, 0.4907407407407407, 0.4922839506172839, 0.49382716049382713, 0.49537037037037035, 0.49691358024691357, 0.4984567901234568, 0.5], 321.5:322.0:4507.5)"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "Y1_spec = spectrogram(X[1][:,1])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(:power, :freq, :time)"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "fieldnames(typeof(Y1_spec))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "What does it return? It returns an array of power values of the audio over different frequencies and time frames."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "325×14 Array{Float64,2}:\n",
+ " 1.19311e-5 1.27942e-5 0.00011184 … 8.09488e-6 3.132e-7 \n",
+ " 2.81066e-5 5.08493e-5 0.000233652 1.21415e-5 3.20675e-6 \n",
+ " 4.0484e-5 0.000131717 0.000272207 1.73776e-5 7.20905e-6 \n",
+ " 6.07955e-5 0.000288517 0.000365978 1.14003e-5 1.46983e-5 \n",
+ " 0.000143712 0.000636645 0.00057366 2.13868e-6 4.70806e-5 \n",
+ " 0.000151799 0.000645508 0.000853296 … 0.00012986 1.36235e-5 \n",
+ " 0.000532065 0.00209559 0.00180169 2.45591e-5 1.48478e-5 \n",
+ " 0.00170353 0.00590174 0.00495622 0.000241027 0.000286993\n",
+ " 0.0141123 0.039983 0.0309246 0.00337389 0.0041309 \n",
+ " 0.260717 0.247433 0.247253 0.0691463 0.0274001 \n",
+ " 0.0209742 0.0233435 0.0205739 … 0.00312731 0.0129292 \n",
+ " 0.0031157 0.00721872 0.0106745 0.000916135 0.0025006 \n",
+ " 0.00265383 0.00263328 0.00643533 0.00131455 0.00261374 \n",
+ " ⋮ ⋱ \n",
+ " 7.07666e-8 9.66128e-5 6.4111e-5 1.96808e-7 1.016e-6 \n",
+ " 9.62174e-6 5.79443e-5 8.89168e-5 8.37519e-8 2.48709e-7 \n",
+ " 1.96166e-6 5.70858e-7 2.52865e-5 … 4.86845e-7 2.51407e-7 \n",
+ " 2.97862e-6 5.49076e-6 0.000142155 3.25063e-7 3.7205e-7 \n",
+ " 1.20512e-5 5.46596e-6 4.24111e-5 7.01023e-7 7.32593e-7 \n",
+ " 3.61834e-6 9.55499e-6 0.00019572 9.9248e-7 1.72722e-7 \n",
+ " 3.73843e-6 7.71323e-5 8.17831e-5 1.00604e-6 7.68411e-7 \n",
+ " 4.92912e-7 2.97574e-5 3.97462e-5 … 1.4376e-7 3.09651e-7 \n",
+ " 7.55749e-7 5.81224e-6 1.39426e-5 3.78868e-7 7.33412e-7 \n",
+ " 1.59308e-6 6.75469e-5 7.42301e-5 4.35238e-7 1.09979e-6 \n",
+ " 5.19477e-6 3.85439e-5 7.91456e-5 3.11639e-7 5.89027e-7 \n",
+ " 1.08906e-5 1.43116e-6 0.000109007 8.20483e-8 9.86726e-8 "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "Y1_spec.power"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Data Preprocessing\n",
+ "As discussed above, we will be using spectrogram values of the audio files as input to our CNN model. Let's generate the spectrograms of all the audio files. Here, we will be using the power values of the spectrogram at different frequencies as our image data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "imgs = []\n",
+ "for i in 1:length(X)\n",
+ " b = spectrogram(X[i][:,1])\n",
+ " push!(imgs,b.power)\n",
+ "end"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "labels = Y;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As our data is arranged in order,we first need to shuffle the data."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Normalising the data\n",
+ "Normalising plays a very crucial role in easy convergence of the data, otherwise there are more chances of the model getting stuck in the local optimum. So we will normalise the data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for i in 1:length(imgs)\n",
+ " imgs[i] = Flux.normalise(imgs[i],dims=2)\n",
+ "end"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "imgs_,labels_ = shuffleobs((imgs,labels));"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now, we will use only 85% of the data as our training data in this example. And the rest of the data, we will use to test our model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_X,train_Y = imgs_[1:1701],labels_[1:1701];"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Our spectrogram values will be of different sizes, so we need to bring them to a common size before making batches. Here, we will be resizing the spectrograms to an image size of (256,32). It is preferable not to keep the size too small, as some data may get lost while shrinking. A very large size is not preferable either, as it will increase the training time."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(256, 32)"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "img_size = (256,32)\n",
+ "m,n = img_size"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Using the following function, we will divide the array into minibatches along with resizing the images to the required shape."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "make_minibatch (generic function with 1 method)"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "function make_minibatch(X,Y,idxs)\n",
+ " X_batch = Array{Float32}(undef,(img_size)..., 1, length(idxs))\n",
+ " for i in 1:length(idxs)\n",
+ " img = Float32.(imresize((X[idxs[i]]),(img_size)...))\n",
+ " X_batch[:, :, :, i] = img\n",
+ " end\n",
+ " Y_batch = onehotbatch(Y[idxs], 0:9)\n",
+ " return (X_batch, Y_batch)\n",
+ "end"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Note that here we are using a batch size of 32, that seems reasonable in this case as the training data size is 1700 which isn't that big. So, batch_size of 32 seems to be an appropriate choice"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mb_indices = [] \n",
+ "batch_size = 32\n",
+ "\n",
+ "for i in range(1,length(train_Y)-1,step = batch_size)\n",
+ " idxs = []\n",
+ " for j in i:i+batch_size-1\n",
+ " push!(idxs,j)\n",
+ " end\n",
+ " push!(mb_indices,idxs)\n",
+ "end\n",
+ "train_set = [make_minibatch(train_X,train_Y,mb_indices[i]) for i in 1:(size(mb_indices)[1]-1)];"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(256, 32, 1, 32)"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "size(train_set[1][1])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We will be supplying our whole test data as a single batch. So let's create our test set"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "batch_size=300\n",
+ "ind = []\n",
+ "for i in 1701:2000\n",
+ " push!(ind,i)\n",
+ "end\n",
+ "test_set = [make_minibatch(imgs_,labels_,ind)];"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Model\n",
+ "Once we are done pre-processing the data and it is ready for training, it's time to create our model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: Constructing model...\n",
+ "└ @ Main In[36]:1\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "Chain(Conv((3, 3), 1=>64, relu), MaxPool((2, 2), pad = (0, 0, 0, 0), stride = (2, 2)), BatchNorm(64, λ = relu), Conv((3, 3), 64=>32, relu), MaxPool((2, 2), pad = (0, 0, 0, 0), stride = (2, 2)), BatchNorm(32, λ = relu), Dropout(0.1), #7, Dense(16384, 128, relu), Dense(128, 10), softmax)"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "@info(\"Constructing model...\")\n",
+ "model = Chain(\n",
+ " # First convolution, operating upon a m*n image\n",
+ " Conv((3, 3), 1=>64, pad=(1,1), relu),\n",
+ " MaxPool((2,2)),\n",
+ " BatchNorm(64,relu),\n",
+ "\n",
+ " # Second convolution, operating upon a m/2*n/2 image\n",
+ " Conv((3, 3), 64=>32, pad=(1,1), relu),\n",
+ " MaxPool((2,2)),\n",
+ " BatchNorm(32,relu),\n",
+ " Dropout(0.10),\n",
+ " \n",
+ " # Reshape 3d tensor into a 2d one, at this point it should be (m/4,n/4,32, N)\n",
+ " # which is where we get the 2048 in the `Dense` layer below:\n",
+ " x -> reshape(x, :, size(x, 4)),\n",
+ " Dense(Int(floor(m/4)*floor(n/4)*32), 128,relu),\n",
+ " \n",
+ " Dense(128,10),\n",
+ "\n",
+ " # Finally, softmax to get nice probabilities\n",
+ " softmax,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In this model, we will use crossentropy loss. While calculating loss, we will simply add some random noise to our data, so as to prevent the overfitting"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "accuracy (generic function with 1 method)"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "function loss(x, y)\n",
+ " # We augment `x` a little bit here, adding in random noise\n",
+ " x_aug = x .+ 0.1f0*(randn(eltype(x), size(x)))\n",
+ "\n",
+ " y_hat = model(x_aug)\n",
+ " return crossentropy(y_hat, y)\n",
+ "end\n",
+ "accuracy(x, y) = mean(onecold(model(x)) .== onecold(y))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "What is accuracy before training the model????? \n",
+ "Let's check......."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.08333333333333333"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "accuracy(test_set[1]...)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Training\n",
+ "Let's begin training our model. We will use the ADAM optimiser here, with a learning rate of 0.001. We will train our model for 15 epochs."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: Beginning training loop...\n",
+ "└ @ Main In[39]:4\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch[1]: Train_Loss: 0.39074132\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: [1]: Test accuracy: 0.6767\n",
+ "└ @ Main In[39]:16\n",
+ "┌ Info: -> New best accuracy! Saving model out to MNIST_Speech.bson\n",
+ "└ @ Main In[39]:26\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch[2]: Train_Loss: 0.013040102\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: [2]: Test accuracy: 0.8567\n",
+ "└ @ Main In[39]:16\n",
+ "┌ Info: -> New best accuracy! Saving model out to MNIST_Speech.bson\n",
+ "└ @ Main In[39]:26\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch[3]: Train_Loss: 0.033996206\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: [3]: Test accuracy: 0.8600\n",
+ "└ @ Main In[39]:16\n",
+ "┌ Info: -> New best accuracy! Saving model out to MNIST_Speech.bson\n",
+ "└ @ Main In[39]:26\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch[4]: Train_Loss: 0.0029821368\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: [4]: Test accuracy: 0.9200\n",
+ "└ @ Main In[39]:16\n",
+ "┌ Info: -> New best accuracy! Saving model out to MNIST_Speech.bson\n",
+ "└ @ Main In[39]:26\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch[5]: Train_Loss: 0.0010024133\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: [5]: Test accuracy: 0.9167\n",
+ "└ @ Main In[39]:16\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch[6]: Train_Loss: 0.00067962485\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: [6]: Test accuracy: 0.9200\n",
+ "└ @ Main In[39]:16\n",
+ "┌ Info: -> New best accuracy! Saving model out to MNIST_Speech.bson\n",
+ "└ @ Main In[39]:26\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch[7]: Train_Loss: 0.0005679158\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: [7]: Test accuracy: 0.9233\n",
+ "└ @ Main In[39]:16\n",
+ "┌ Info: -> New best accuracy! Saving model out to MNIST_Speech.bson\n",
+ "└ @ Main In[39]:26\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch[8]: Train_Loss: 0.00038421655\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: [8]: Test accuracy: 0.9233\n",
+ "└ @ Main In[39]:16\n",
+ "┌ Info: -> New best accuracy! Saving model out to MNIST_Speech.bson\n",
+ "└ @ Main In[39]:26\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch[9]: Train_Loss: 0.00037415794\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: [9]: Test accuracy: 0.9233\n",
+ "└ @ Main In[39]:16\n",
+ "┌ Info: -> New best accuracy! Saving model out to MNIST_Speech.bson\n",
+ "└ @ Main In[39]:26\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch[10]: Train_Loss: 0.00028696572\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: [10]: Test accuracy: 0.9233\n",
+ "└ @ Main In[39]:16\n",
+ "┌ Info: -> New best accuracy! Saving model out to MNIST_Speech.bson\n",
+ "└ @ Main In[39]:26\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch[11]: Train_Loss: 0.00027806658\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: [11]: Test accuracy: 0.9267\n",
+ "└ @ Main In[39]:16\n",
+ "┌ Info: -> New best accuracy! Saving model out to MNIST_Speech.bson\n",
+ "└ @ Main In[39]:26\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch[12]: Train_Loss: 0.00019297587\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: [12]: Test accuracy: 0.9267\n",
+ "└ @ Main In[39]:16\n",
+ "┌ Info: -> New best accuracy! Saving model out to MNIST_Speech.bson\n",
+ "└ @ Main In[39]:26\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch[13]: Train_Loss: 0.00020029144\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: [13]: Test accuracy: 0.9267\n",
+ "└ @ Main In[39]:16\n",
+ "┌ Info: -> New best accuracy! Saving model out to MNIST_Speech.bson\n",
+ "└ @ Main In[39]:26\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch[14]: Train_Loss: 0.00018595633\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: [14]: Test accuracy: 0.9233\n",
+ "└ @ Main In[39]:16\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch[15]: Train_Loss: 0.00021959198\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Info: [15]: Test accuracy: 0.9233\n",
+ "└ @ Main In[39]:16\n"
+ ]
+ }
+ ],
+ "source": [
+ "opt = ADAM(0.001)\n",
+ "epochs = 15\n",
+ "\n",
+ "@info(\"Beginning training loop...\")\n",
+ "best_acc = 0.0\n",
+ "last_improvement = 0\n",
+ "for epoch_idx in 1:epochs\n",
+ " global best_acc, last_improvement\n",
+ " # Train for a single epoch\n",
+ " Flux.train!(loss, params(model), train_set, opt)\n",
+ " x,y = train_set[1] \n",
+ " print(\"Epoch[$epoch_idx]: Train_Loss: \",loss(x,y),\"\\n\")\n",
+ " \n",
+ " # Calculate accuracy:\n",
+ " acc = accuracy(test_set[1]...)\n",
+ " @info(@sprintf(\"[%d]: Test accuracy: %.4f\", epoch_idx, acc))\n",
+ "\n",
+ " # If our accuracy is good enough, quit out.\n",
+ " if acc >= 0.95\n",
+ " @info(\" -> Early-exiting: We reached our target accuracy of 95.0%\")\n",
+ " break\n",
+ " end\n",
+ "\n",
+ " # If this is the best accuracy we've seen so far, save the model out\n",
+ " if acc >= best_acc\n",
+ " @info(\" -> New best accuracy! Saving model out to MNIST_Speech.bson\") #Here, model is saved as MNIST_Speech.bson \n",
+ " BSON.@save joinpath(dirname(@__FILE__), \"./MNIST_Speech.bson\") model epoch_idx acc\n",
+ " best_acc = acc\n",
+ " last_improvement = epoch_idx\n",
+ " end\n",
+ "\n",
+ " # If we haven't seen improvement in 5 epochs, drop our learning rate:\n",
+ " if epoch_idx - last_improvement >= 5 && opt.eta > 1e-4\n",
+ " opt.eta /= 10.0\n",
+ " @warn(\" -> Haven't improved in a while, dropping learning rate to $(opt.eta)!\")\n",
+ "\n",
+ " # After dropping learning rate, give it a few epochs to improve\n",
+ " last_improvement = epoch_idx\n",
+ " end\n",
+ "\n",
+ " if epoch_idx - last_improvement >= 10\n",
+ " @warn(\" -> We're calling this converged.\")\n",
+ " break\n",
+ " end\n",
+ "end"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now our model is trained and ready to be tested"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Julia 1.3.1",
+ "language": "julia",
+ "name": "julia-1.3"
+ },
+ "language_info": {
+ "file_extension": ".jl",
+ "mimetype": "application/julia",
+ "name": "julia",
+ "version": "1.3.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}