lecture polishing

smidl · smidl · commit 086fa86fece8 · 2021-12-16T13:54:15.000+01:00
diff --git a/docs/src/lecture_12/lab_2.jl b/docs/src/lecture_12/lab_2.jl
@@ -0,0 +1,128 @@
+abstract type AbstractODEProblem end  # common supertype; `solve` below dispatches on it
+
+struct ODEProblem{F,T,U,P} <: AbstractODEProblem  # immutable bundle describing one ODE problem
+    f::F      # right-hand side, invoked by the solvers as f(u, θ)
+    tspan::T  # time span; solve reads tspan[1] as start time and tspan[2] as end time
+    u0::U     # initial state; becomes the first stored solution value in solve
+    θ::P      # parameters handed to f; loss overwrites them in place via `.=`
+end
+
+abstract type ODESolver end  # supertype of the fixed-step solvers; Euler and RK2 are callable as solver(prob, u, t)
+struct Euler{T} <: ODESolver  # explicit Euler stepper
+    dt::T  # step size added to t on every step
+end
+struct RK2{T} <: ODESolver  # two-stage predictor-corrector (Heun-type) stepper
+    dt::T  # step size added to t on every step
+end
+
+function f(x,θ)  # Lotka-Volterra right-hand side: returns the derivative vector at state x
+    α, β, γ, δ = θ
+    prey, predator = x
+    # first component: α-growth minus β-interaction; second: δ-interaction minus γ-decay
+    dprey = α*prey - β*prey*predator
+    dpredator = δ*prey*predator - γ*predator
+    # a fresh two-element vector is allocated on every call
+    return [dprey, dpredator]
+end
+
+function solve(prob::AbstractODEProblem, solver::ODESolver)  # step `solver` from tspan[1] until t reaches tspan[2]
+    u, t = prob.u0, prob.tspan[1]
+    states, times = [u], [t]  # both histories start with the initial condition
+    while true
+        t < prob.tspan[2] || break  # checked before each step, so the final t may lie past tspan[2]
+        u, t = solver(prob, u, t)
+        push!(states, u); push!(times, t)
+    end
+    return times, reduce(hcat, states)  # states are joined with hcat (one column per step for vector states)
+end
+
+function (solver::Euler)(prob::ODEProblem, u, t)  # one explicit Euler step; returns (next state, next time)
+    slope = prob.f(u, prob.θ)  # right-hand side at the current state
+    return u + solver.dt*slope, t + solver.dt
+end
+
+function (solver::RK2)(prob::ODEProblem, u, t)  # one Heun (predictor-corrector) step; returns (next state, next time)
+    f, θ, dt = prob.f, prob.θ, solver.dt
+    k1 = f(u,θ); uh = u + k1*dt  # slope at u is evaluated once and reused; uh is the Euler predictor
+    return u + dt/2*(k1 + f(uh,θ)), t+dt  # corrector averages the slopes at u and at uh
+end
+
+
+θ = [0.1,0.2,0.3,0.2]  # unpacked by f as α, β, γ, δ
+u0 = [1.0,1.0]  # initial state, unpacked by f as x₁, x₂
+tspan = (0.,100.)  # start and end time read by solve
+dt = 0.1  # NOTE(review): not used below; the solver is constructed with the literal 0.2
+prob = ODEProblem(f,tspan,u0,θ)
+
+t,X=solve(prob, RK2(0.2))  # X is the target data passed to loss in the optimize calls below
+
+using Plots
+p1 = plot(t, X[1,:], label="x", lw=3)  # first state component over time
+plot!(p1, t, X[2,:], label="y", lw=3)  # second state component added to the same plot
+
+display(p1)
+
+# Second problem: same setup except the first parameter (0.2 instead of 0.1)
+θ = [0.2,0.2,0.3,0.2]  # rebinds θ to a new array; prob keeps the array it was built with
+u0 = [1.0,1.0]
+tspan = (0.,100.)
+dt = 0.1  # NOTE(review): unused; RK2 is constructed with the literal 0.2
+prob2 = ODEProblem(f,tspan,u0,θ)
+
+t,X2=solve(prob2, RK2(0.2))  # NOTE(review): X2 is not referenced again in the visible code
+
+using Optim
+
+function loss(θin,prob::ODEProblem,Y)  # sum of squared differences between Y and the RK2(0.2) solution for θin
+    prob.θ .= θin  # overwrites prob's parameter array in place; the change is visible to the caller
+    Xn = last(solve(prob, RK2(0.2)))  # keep only the state matrix, drop the time vector
+    return sum((Y .- Xn).^2)
+end
+θopt = copy(θ)  # start point; θ is [0.2,0.2,0.3,0.2] at this point
+O=Optim.optimize(θ->loss(θ,prob,X),θopt)  # no method given; the lecture notes refer to this default as Nelder-Mead
+O=Optim.optimize(θ->loss(θ,prob,X),θopt,LBFGS())  # same objective with LBFGS; rebinding O drops the previous result
+
+using DiffEqFlux
+nn=FastDense(2,2)  # global layer that fnn below reads
+p = initial_params(nn)  # initial parameters, kept outside the layer
+nn([1,2],p)  # evaluate the layer at input [1,2] with the external parameters p
+
+
+function fy(x,θ)  # Lotka-Volterra right-hand side with an extra ω*x₂ term in the first equation
+    α, β, γ, δ, ω = θ
+    prey, predator = x
+    # the first component differs from f only by the added ω term
+    dprey = α*prey - β*prey*predator + ω*predator
+    dpredator = δ*prey*predator - γ*predator
+    # a fresh two-element vector is allocated on every call
+    return [dprey, dpredator]
+end
+
+# Generate the data Xy from the ω-extended model fy
+θy = [0.2,0.2,0.3,0.2,0.1]  # unpacked by fy as α, β, γ, δ, ω
+u0 = [1.0,1.0]
+tspan = (0.,100.)
+dt = 0.1  # NOTE(review): unused; RK2 is constructed with the literal 0.2
+proby = ODEProblem(fy,tspan,u0,θy)
+
+t,Xy=solve(proby, RK2(0.2))  # Xy is the target data for the probnn fit below
+
+py = plot(t, Xy[1,:], label="x", lw=3)
+plot!(py, t, Xy[2,:], label="y", lw=3)
+savefig("LV_omega.svg")  # no plot object is passed; presumably saves the current plot (py) - confirm
+
+function fnn(x,θ)  # Lotka-Volterra right-hand side plus the output of the global layer nn
+    α, β, γ, δ = θ[1:4]
+    prey, predator = x
+    # known part: same two equations as in f
+    dprey = α*prey - β*prey*predator
+    dpredator = δ*prey*predator - γ*predator
+    # entries of θ from index 5 onwards are passed to nn as its parameters, without copying
+    return [dprey, dpredator] + nn(x, @view θ[5:end])
+end
+
+θnn = [0.2,0.2,0.3,0.2,0.01*initial_params(nn)...]  # four ODE parameters, then nn's initial parameters scaled by 0.01
+probnn = ODEProblem(fnn,tspan,u0,θnn)
+
+θopt = copy(θnn)  # a copy, so the start point does not alias probnn.θ, which loss overwrites
+O=Optim.optimize(θ->loss(θ,probnn,Xy),θopt)  # fit ODE and network parameters jointly to Xy
diff --git a/docs/src/lecture_13/lecture.md b/docs/src/lecture_13/lecture.md
@@ -1,17 +1,18 @@
 # Data-driven Ordinary Differential Equations
 
-We have looked into the uncertainty propagation trough an ODE in previous lecture. The uncertainty may stem from:
-- unknown boundary consitions (e.g. initial conditions)
-- unknown parameters, 
-- missing terms (hidden dynamics) of ODE
+We have looked into the uncertainty propagation through an ODE in the previous lecture. The uncertainty may stem from:
+- unknown boundary conditions (e.g. initial conditions)
+- unknown parameters (reproduction rates, etc.)
+- missing terms (hidden dynamics) of an ODE
 
 The uncertainty in the solution can be reduced when data are available. This can be either in incremental or batch form:
-- batch form: we have a set of data 
-- incremental: common e.g. in temporal evolution, when the data are measured on the fly and systems cna change in time
+- batch form: we have a set of data and we look for their explanation 
+- incremental: common e.g. in temporal evolution, when the data are measured on the fly and systems can change in time (stochastic ODE)
+  - if done right, the incremental solution also solves the batch problem.
 
 ## Fitting ODE solution to data
 
-Since ODE solver is a function like any other, it is possible to use general-purpose optimizers to optimize parameters of the ODE to match the output.
+Since the ODE solver is a function like any other, it is possible to use general-purpose optimizers to optimize parameters of the ODE to match the output.
 ```julia
 using Optim
 
@@ -22,14 +23,15 @@ function loss(θin,prob::ODEProblem,Y)
 end
 θopt = copy(θ)
 O=Optim.optimize(θ->loss(θ,prob,X),θopt)
+Olb=Optim.optimize(θ->loss(θ,prob,X),θopt,LBFGS())
 ```
 
-- show various optimizers?
-- gradient optimizers?
+- using the power of automatic differentiation (of the numerical solver)
+- in the case of ODE, the gradients can be modified to use the information about exact derivatives (adjoints) 
 
 ## Extending the ODE
 
-The previous approach will work only if the data were generated by the exact ODE. If the structure of ODE is different, e.g. soem terms are missing, we can never find an exact fit.
+The previous approach will work only if the data were generated by the exact ODE. If the structure of ODE is different, e.g. some terms are missing, we can never find an exact fit.
 
 ```math
 \begin{align}
@@ -40,13 +42,13 @@ dot{y}&=-\delta y+\gamma xy,
 
 We could "guess" what is the missing term or add a black box (neural network). The whole problem will become finding parameters ``\theta = [\theta_{ODE},\theta_{NN}]``. 
 ```math
-\frac{d\mathbf{x}}{dt}=f(\mathbf{x},\theta) + NN(\mathbf{x},\theta)
+\frac{d\mathbf{x}}{dt}=f(\mathbf{x},\theta_{ODE}) + NN(\mathbf{x},\theta_{NN})
 ```
 In the limiting case, we may learn only the network:
 ```math
-\frac{d\mathbf{x}}{dt}=f(\mathbf{x},\theta) + NN(\mathbf{x},\theta)
+\frac{d\mathbf{x}}{dt}= NN(\mathbf{x},\theta_{NN})
 ```
-known as the "Neural ODE" [citation needed].
+known as the "Neural ODE" (Chen et al. 2018).
 
 # Neural Networks in Julia
 Many possible packages implementing Neural Networks (Flux, Knets, MXnets, tensorFlow) etc. By far the most used package is the ```Flux.jl```
@@ -82,7 +84,7 @@ function (a::Dense)(x::AbstractArray)
 end
 ```
 
-While it is straightforward compose a MLP:
+Building an MLP is straightforward:
 ```julia
 nx = 2
 nn = Chain(Dense(rand(nx,nx),rand(nx)))
@@ -107,7 +109,9 @@ gs[ps[1]]
 
 This approach has benefits and drawbacks:
 - it allows to write a very general code easily, 
-- removing parameter from optimization can be done by removing it from the parameter list
+- the list of parameters is accessible for modifications:
+  - removing parameter from optimization can be done by removing it from the parameter list
+  - adding a parameter (e.g. from the ODE) allows composition of NN with other code
 - it introduces and overhead
   - may be negligible for large models (hundrets of hidden neurons) that are dominated by matrix manipulation.
   - becomes significant for low dimensional models (ODEs)
@@ -140,7 +144,9 @@ end
 ```
 It does not store its parameters but operates on an external parameter vector:
 ```julia
-(f::FastDense)(x,p) = ((f.bias == true ) ? (f.σ.(reshape(p[1:(f.out*f.in)],f.out,f.in)*x .+ p[(f.out*f.in+1):end])) : (f.σ.(reshape(p[1:(f.out*f.in)],f.out,f.in)*x)))
+(f::FastDense)(x,p) = ((f.bias == true ) 
+  ? (f.σ.(reshape(p[1:(f.out*f.in)],f.out,f.in)*x .+ p[(f.out*f.in+1):end])) 
+  : (f.σ.(reshape(p[1:(f.out*f.in)],f.out,f.in)*x)))
 ```
 
 The same behavior is replicated in FastChain:
@@ -150,7 +156,7 @@ struct FastChain{T<:Tuple} <: FastLayer
   # function FastChain(xs...)...
 end
 ```
-Since it is a Layer, it implemnets interfaces:
+Since it is a Layer, it implements interfaces:
 ```julia
 paramlength(c::FastChain) = sum(paramlength(x) for x in c.layers)
 initial_params(c::FastChain) = vcat(initial_params.(c.layers)...)
@@ -163,13 +169,20 @@ applychain(::Tuple{}, x, p) = x
 applychain(fs::Tuple, x, p) = applychain(Base.tail(fs), first(fs)(x,p[1:paramlength(first(fs))]), p[(paramlength(first(fs))+1):end])
 ```
 
+This allows layers to be implemented with StaticArrays (allocated on the stack).
+
 The same 2x2 network can be implemented as:
 ```julia
 nn=FastDense(2,2)
 p = initial_params(nn)
 nn([1,2],p)
 ```
 
+Effects of code composition in Julia:
+- Toolboxes of Neural Networks in Julia are often lightweight
+- the tools necessary for their training are not specific to NN (AD: Zygote, Enzyme)
+- Combination with ODEs is straightforward
+
 # Neural Networks in ODEs:
 
 Neural networks are universal approximators. They can approaximate:
@@ -179,17 +192,38 @@ Neural networks are universal approximators. They can approaximate:
 ## Neural Lotka-Volterra
 Consider an extension of the LV ODE by a MLP:
 ```julia
-function fnn(x,θ,nn,p)
-  α,β,γ,δ = θ
-  x1,x2=x
-   dx1 = α*x1 - β*x1*x2
-   dx2 = δ*x1*x2 - γ*x2
-  [dx1,dx2]+nn(x,p)
+
+function fnn(x,θ)
+    α, β, γ, δ = θ[1:4]
+    x₁, x₂ = x
+
+    dx₁ = α*x₁ - β*x₁*x₂ 
+    dx₂ = δ*x₁*x₂ - γ*x₂
+
+    [dx₁, dx₂]+nn(x,@view θ[5:end])
 end
 ```
+This can also be implemented via a closure (closing over `nn`).
+
+Optimize using the same approach as before:
+```julia
+θnn = [0.2,0.2,0.3,0.2,0.01*initial_params(nn)...]
+probnn = ODEProblem(fnn,tspan,u0,θnn)
+
+θopt = copy(θnn)
+O=Optim.optimize(θ->loss(θ,probnn,Xy),θopt,Optim.Options(iterations=10000))
+```
+
+The ``Xy`` data were generated with the ω version of the ODE, with parameters ``\theta=[0.2,0.2,0.3,0.2,0.1]``.
+
+Optimization difficulties:
+- the number of iterations in Nelder-Mead had to be increased
+- the LBFGS() optimizer is extremely slow
+
+Why?
+
+
 
-- run through solver (RK2)
-- optimize, show ambiguity
 
 ## Physics-informed Neural Network
 
@@ -221,6 +255,15 @@ This straightforward approach was proposed relatively recently (2019).
 - need for higher-order derivatives
 - numerical issues
 
+Very simple extension for known data:
+```math
+\begin{align}
+\mathcal{L}=&||nn(0)-x_0|| + \frac{1}{N}\sum_{i=1}^N||f(x_i)-\nabla_x nn(x_i) ||\\
+            & + \frac{1}{M}\sum_{i=1}^M||y_i - h(nn(x_i))||
+\end{align}
+```
+where ``h()`` is a function transforming ODE solution to observations (e.g. identity, or selection of the relevant observations).
+
 Worked out in the lab.
 
 Can be combined with Neural ODE.
@@ -231,7 +274,7 @@ So far, we have seen optimizations of the ODEs in the form of point estimate. We
 - the measurement are uncertain with large possible error
 - the number of measurements is insufficient to fit the model.
 
-Consider the Monte Carlo simulation from the previous lecture:
+Consider the Monte Carlo simulation from the previous lecture, extended to an unknown parameter:
 ```julia
 K=100
 X0 = [x0 .+ 0.1*randn(2) for k=1:K]
@@ -248,7 +291,7 @@ Point estimate is the trajectory with the thick color.
 - it is the one with minimum error
 - is it really the solution?
 
-Lets, select all trajectories withing a selected tolerance:
+Let us select all trajectories within a selected tolerance:
 
 ![](LV_MC_param_assim.svg)
 
@@ -258,7 +301,17 @@ When the data are collected sequentially, the process of reduction of the uncert
 1. prediction - use ODE with uncertainty propagation to the next step,
 2. correction - use the acquired measurement to reduce the uncertainty
 
-How exactly are these steps implemented depends on the assumptions made on the type of model uncertainty (initial conditions, parameters, noise) and the measurment uncertainty (noise).
+Mathematically, it is a direct application of Bayes' rule:
+```math
+\begin{align}
+p(\mathbf{x},\mathbf{y})&=p(\mathbf{y}|\mathbf{x})p(\mathbf{x})=p(\mathbf{x}|\mathbf{y})p(\mathbf{y}),\\
+p(\mathbf{x}|\mathbf{y})&=\frac{p(\mathbf{y}|\mathbf{x})p(\mathbf{x})}{p(\mathbf{y})}=\frac{p(\mathbf{y}|\mathbf{x})p(\mathbf{x})}{\int p(\mathbf{y}|\mathbf{x})p(\mathbf{x})d\mathbf{x}}.
+\end{align}
+```
+
+Tradeoff between generality and speed
+- The whole procedure can be implemented at a general level using types for probability distributions and operations on them.
+- How exactly these steps are implemented depends on the assumptions made about the type of model uncertainty (initial conditions, parameters, noise) and the measurement uncertainty (noise).
 
 We have done propagation of the Gaussian uncertainty through an ODE (GaussNum, Cubature rules). We will complement it by the correcton step here. 
 
@@ -273,6 +326,11 @@ p(\mathbf{x},\mathbf{y})&=\mathcal{N}\left(\begin{bmatrix}\mu_{x}\\
 \end{align}
 ```
 
+![](mvgaussian.png)
+
+- marginal distributions are unaffected by the correlation
+- the correlation determines the reduction of uncertainty in the conditional case
+
 We have uncertainty in all our unknowns ``p(\mathbf{x})`` in the form of quadrature points. We assume that the probability of observation of ``p(\mathbf{y}|\mathbf{x})`` has mean given by ``x`` and variance ``\sigma_y``.
 Hence, the means can be obtained by empirical samples of the cubature points ``X_p`` and measurements corresponding to cubature points.
 ```math
@@ -290,7 +348,9 @@ The covariance matrices can be obtained by empirical samples:
 The uncertainty reduction is then application of the conditional distribution using the obtained means and variances. A common trick is to define the Kalman gain:
 ```math
 \begin{align}
-\mu_{x|y}&=\mu_{x}+K(\mathbf{y}-\mu_{y}),\,\,\,\,K=\Sigma_{xy}\Sigma_{yy}^{-1},\\\Sigma_{x|y}&=\Sigma_{xx}-K\Sigma_{yx},
+K&=\Sigma_{xy}\Sigma_{yy}^{-1},\\
+\mu_{x|y}&=\mu_{x}+K(\mathbf{y}-\mu_{y}),\\
+\Sigma_{x|y}&=\Sigma_{xx}-K\Sigma_{yx},
 \end{align}
 ```
 
diff --git a/docs/src/lecture_13/mvgaussian.png b/docs/src/lecture_13/mvgaussian.png