
Commit 3966deb — Merge pull request #70 from ArndtLab/diagnostics
improve interface and diagnostics
2 parents 93c6304 + 7b8df27

File tree: 13 files changed, +528 -311 lines

Project.toml (1 addition, 1 deletion)

@@ -1,6 +1,6 @@
 name = "HetDister"
 uuid = "50651ce3-0423-45d2-b99c-8ea4267d2717"
-version = "0.10.7"
+version = "0.11.0"
 authors = ["Tommaso Stentella <stentell@molgen.mpg.de> and contributors"]

 [deps]

docs/make.jl (1 addition)

@@ -15,6 +15,7 @@ makedocs(;
     pages=[
         "Home" => "index.md",
         "Tutorial" => "tutorial.md",
+        "Diagnostics" => "diagnostics.md"
     ],
     warnonly=[:missing_docs],
 )

docs/src/diagnostics.md (new file, 36 additions)
@@ -0,0 +1,36 @@
+# Diagnostics
+
+Let's say that we have inferred some demographic models and
+obtained the most probable one as explained in the [Tutorial](@ref):
+```julia
+results = demoinfer(segments, 1:8, mu, rho)
+best = compare_models(results.fits)
+```
+First we can print
+```julia
+best.converged
+```
+which indicates whether the maximum likelihood optimization
+converged. If that is not the case, we can inspect further
+```julia
+best.opt.optim_result.original
+```
+to get more details and decide whether this flag is correct.
+If the optimization truly did not converge, a greedier search may
+be needed (see [`FitOptions`](@ref)), with a larger number of
+iterations and/or more optimization time allowed.
+
+We can also compute z-score residuals to assess goodness of fit
+```julia
+wth = yth .* diff(h.edges[1])
+resid = (h.weights .- wth) ./ sqrt.(wth)
+```
+Because the probabilistic model is Poisson, bins in the tail may have
+a skewed distribution of residuals, but the others should closely
+follow a standard normal.
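The z-score construction above can be checked on synthetic data. Below is an illustrative Python sketch (not the package API); `expected` plays the role of `wth`, and the counts are drawn from the assumed Poisson model:

```python
import numpy as np

rng = np.random.default_rng(42)

# Hypothetical expected counts per bin (the role of wth above)
expected = np.linspace(1000.0, 50.0, 100)

# Observed counts drawn from the Poisson model assumed in the text
observed = rng.poisson(expected)

# z-score residuals: approximately standard normal when counts are large
resid = (observed - expected) / np.sqrt(expected)
```

With all expected counts well above ~20, `resid` should have mean near 0 and standard deviation near 1; skew only appears in sparse tail bins.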
31+
We can also assess the correlation structure of neighboring residuals
32+
```julia
33+
ps = HetDister.residstructure(resid)
34+
```
35+
This function return a vector of right tail p values from t-tests for
36+
correlation of neighbouring residuals (see the function doc).
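The sliding-window t-test for lag-1 correlation can be mirrored outside Julia. The Python sketch below (illustrative; the function name and defaults are taken from the Julia code in this commit, not an official port) uses the same statistic `t = c*sqrt((frame-2)/(1-c^2))` with a right-tail p-value from a t-distribution with `frame-2` degrees of freedom:

```python
import numpy as np
from scipy import stats

def resid_structure(residuals, frame=None):
    """Right-tail p-values from t-tests for lag-1 correlation of residuals
    in a sliding window of size `frame` (mirrors the Julia residstructure)."""
    residuals = np.asarray(residuals, dtype=float)
    n = len(residuals)
    if frame is None:
        frame = n // 20
    ps = np.ones(n - frame)
    for i in range(len(ps)):
        # correlation between a window and its one-step shift
        c = np.corrcoef(residuals[i + 1 : i + frame + 1],
                        residuals[i : i + frame])[0, 1]
        t = c * np.sqrt((frame - 2) / (1.0 - c * c))
        ps[i] = stats.t.sf(t, df=frame - 2)  # right tail
    return ps

# On uncorrelated residuals the p-values should look roughly uniform
rng = np.random.default_rng(0)
ps = resid_structure(rng.standard_normal(200), frame=40)
```

Small p-values in a region indicate positively correlated neighboring residuals there, i.e. structure the fitted model does not capture.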

docs/src/tutorial.md (25 additions, 7 deletions)
@@ -1,4 +1,4 @@
-## Tutorial
+# Tutorial

 To run the package, first install julia ([here](https://julialang.org/downloads/)).
 To create a local environment with the package `cd` into your work directory and
@@ -18,7 +18,7 @@ You can now load the installed packages:
 using HetDister, HistogramBinnings, CSV, DataFrames, DataFramesMeta
 ```

-### Preparing input data
+## Preparing input data
 For example, suppose you have a `.vcf` file with called variants you want to analyze. Then, in the most basic case, you may compute distances between heterozygous SNPs as follows:
 ```julia
 f = "/myproject/myfavouritespecies.vcf"
@@ -41,10 +41,26 @@ ils = df.POS[2:end] .- df.POS[1:end-1]
 ```
 Now we have a vector of intervals `ils`.

-### Fitting demographic models
+## Fitting and comparing demographic models
 The tool requires three inputs: a vector of IBS segment lengths, a mutation rate and
-a recombination rate (both per bp per generation). First, IBSs need to be placed into
-a histogram.
+a recombination rate (both per bp per generation). Additionally, we need to choose
+a range of demographic models with epochs of piecewise constant effective size.
+
+In the simplest use case we can just call
+```julia
+mu = 1e-8
+rho = 1e-8
+results = demoinfer(ils, 1:8, mu, rho)
+```
+and the 8 models will be saved in `results.fits`.
+Then we can obtain the most probable model in the set with
+```julia
+best = compare_models(results.fits)
+```
+
+### More advanced options
+First, the IBS spectrum is obtained as a histogram; the binning can be
+controlled by the `adapt_histogram` function and its keyword arguments
 ```julia
 h = adapt_histogram(ils)
 mu = 1e-8
@@ -53,10 +69,12 @@ rho = 1e-8
 Then we set up the `FitOptions` object that contains several parameters for the optimization.
 We stick with default values and only initialize with the required inputs:
 ```julia
-fop = FitOptions(sum(ils), mu, rho)
+fop = FitOptions(sum(ils), length(ils), mu, rho)
 ```
 And we fit 8 different models, with a number of epochs in the range 1 to 8:
 ```julia
 results = demoinfer(h_obs, 1:8, fop)
 ```
-The fitted models can be accessed with `results.fits`.
+The fitted models can be accessed with `results.fits`.
+
+See [Diagnostics](@ref) to inspect the result and assess goodness of fit and optimization convergence.
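The interval computation in the tutorial (`df.POS[2:end] .- df.POS[1:end-1]`) is just first differences of sorted variant positions; a minimal Python equivalent with made-up positions:

```python
import numpy as np

# Hypothetical sorted positions of heterozygous SNPs on one chromosome
pos = np.array([105, 230, 231, 980, 1500])

# Distances between consecutive heterozygous sites, i.e. the tutorial's
# df.POS[2:end] .- df.POS[1:end-1] written with np.diff
ils = np.diff(pos)
print(ils)  # → [125   1 749 520]
```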

src/HetDister.jl (52 additions, 12 deletions)
@@ -20,12 +20,16 @@ include("mle_optimization.jl")
 include("sequential_fit.jl")
 include("corrections.jl")

-export pre_fit, pre_fit!, demoinfer, compare_models, correctestimate!,
-    get_para, evd, sds, pop_sizes, durations,
+export pre_fit!, demoinfer, compare_models, sample_model_epochs!,
+    correctestimate!,
+    get_para, evd, sds, pop_sizes, durations, get_covar,
     compute_residuals,
     adapt_histogram,
     FitResult, FitOptions,
-    laplacekingman, mldsmcp
+    laplacekingman, mldsmcp,
+    extbps,
+    lineages, cumulative_lineages, crediblehistory,
+    sampleN, quantilesN


 function integral_ws(edges::AbstractVector{<:Real}, mu::Float64, TN::Vector)
@@ -85,6 +89,25 @@ function compute_residuals(h1::Histogram, h2::Histogram; fc1 = 1.0, fc2 = 1.0)
     return residuals
 end

+"""
+    residstructure(residuals::AbstractVector{<:Real}; frame::Int = length(residuals)÷20)
+
+Compute the p-values for the correlation between adjacent residuals in a sliding window of size `frame`.
+The p-value is the right tail of the t-distribution.
+"""
+function residstructure(residuals::AbstractVector{<:Real};
+        frame::Int = length(residuals)÷20
+    )
+    ps = ones(length(residuals) - frame)
+    for i in eachindex(ps)
+        c = cor(residuals[i+1:i+frame], residuals[i:i+frame-1])
+        t = c * sqrt((frame - 2)/(1 - c^2))
+        p = StatsAPI.pvalue(Distributions.TDist(frame - 2), t; tail=:right)
+        ps[i] = p
+    end
+    ps
+end
+
 function CustomEdgeVector(; lo = 1, hi = 10, nbins::Integer)
     @assert (lo > 0) && (hi > 0) && (nbins > 0) && (hi > lo)
     lo = floor(Int, lo)
@@ -103,14 +126,23 @@ function CustomEdgeVector(; lo = 1, hi = 10, nbins::Integer)
 end

 """
-    adapt_histogram(segments::AbstractVector{<:Integer}; lo::Int=1, hi::Int=50_000_000, nbins::Int=800, tailthr::Int=1)
+    adapt_histogram(segments::AbstractVector{<:Integer}; lo::Int=1, hi::Int=50_000_000, nbins::Int=0, tailthr::Int=0)

 Build a histogram from `segments` logbinned between `lo` and `hi`
-with `nbins` bins.
+with `nbins` bins. `nbins` is automatically determined by default.

-The upper limit is adapted to ensure logspacing with the requested `nbins`.
+The upper limit is adapted to ensure logspacing with the requested `nbins`. The adaptive strategy is such that the
+last bin has at least `tailthr` segments.
 """
-function adapt_histogram(segments::AbstractVector{<:Integer}; lo::Int=1, hi::Int=50_000_000, nbins::Int=800, tailthr::Int=1)
+function adapt_histogram(segments::AbstractVector{<:Integer}; lo::Int=1, hi::Int=50_000_000, nbins::Int=0, tailthr::Int=0)
+    if iszero(nbins)
+        if length(segments) > 1e7
+            nbins = 1600
+        else
+            nbins = 800
+        end
+    end
+    @assert nbins > 0
     h_obs = Histogram(CustomEdgeVector(;lo, hi, nbins))
     @assert !isempty(segments)
     append!(h_obs, segments)
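The log-binning that `adapt_histogram` builds on can be sketched generically. The Python below uses `numpy.geomspace` for log-spaced edges (illustrative only; the real `CustomEdgeVector` also rounds edges to integers and adapts the upper limit):

```python
import numpy as np

# Hypothetical IBS segment lengths
segments = np.array([3, 10, 47, 120, 1000, 52, 8, 300])

# Log-spaced bin edges between lo and hi, as in a log-binned histogram
lo, hi, nbins = 1, 2000, 8
edges = np.geomspace(lo, hi, nbins + 1)

# Count segments per logarithmic bin
counts, _ = np.histogram(segments, bins=edges)
```

Log-binning keeps the bin count manageable while resolving both the many short segments and the rare long ones that carry most of the demographic signal.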
@@ -149,10 +181,18 @@ function compare_mlds(segs1::AbstractVector{<:Integer}, segs2::AbstractVector{<:
     append!(h2, segs2)
     return compare_mlds!(h1, h2, theta1, theta2)
 end
-function compare_mlds!(h1, h2, theta1, theta2) # add !
+
+"""
+    compare_mlds!(h1::Histogram, h2::Histogram, theta1::Float64, theta2::Float64)
+
+The same as `compare_mlds`, except that it takes two histograms and mutates them.
+Return values are the same as `compare_mlds`.
+"""
+function compare_mlds!(h1, h2, theta1, theta2)
     # 1 is the target lattice, i.e. with biggest theta
     length(h1.weights) == length(h2.weights) && @assert any(h1.weights .!= h2.weights)
     @assert theta1 != theta2
+    @assert any(h1.weights .> 0) && any(h2.weights .> 0)
     swap = false
     if theta1 < theta2
         temp = deepcopy(h1)
@@ -166,9 +206,9 @@ function compare_mlds!(h1, h2, theta1, theta2) # add !
     tw = zeros(Float64, length(h1.weights))
     w2 = h2.weights
     factor = sum(h1.weights) / sum(h2.weights)
-    t = 1
-    f = 1
-    while t < length(edges1) && f < length(edges2) && h1.weights[t] > 0 && w2[f] > 0
+    t = 1 # target
+    f = 1 # following
+    while t < length(edges1) && f < length(edges2)
         st, en = edges1[t], edges1[t+1]
         width = edges2[f+1] - edges2[f]
         if st <= edges2[f] < edges2[f+1] < en

@@ -200,7 +240,7 @@ function compare_mlds!(h1, h2, theta1, theta2) # add !

     rs = midpoints(h1.edges[1]) * theta1
     sigmasq = h1.weights .+ tw * factor^2
-    maxl = min(t,f,length(h1.weights),length(tw))
+    maxl = min(findlast(w2 .> 0), findlast(h1.weights .> 0))
     if swap
         return rs[1:maxl], tw[1:maxl] * factor, h1.weights[1:maxl], sigmasq[1:maxl]
     else
