diff --git a/.github/workflows/generate_website.yml b/.github/workflows/generate_website.yml
index f8c4ac6..223fed5 100644
--- a/.github/workflows/generate_website.yml
+++ b/.github/workflows/generate_website.yml
@@ -86,7 +86,6 @@ jobs:
 
   collect-results:
     runs-on: ubuntu-latest
-    if: github.event_name != 'pull_request'
     needs: [setup-keys, run-models]
     steps:
@@ -106,3 +105,4 @@ jobs:
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
           publish_dir: ./html
+          destination_dir: ${{ github.event_name == 'pull_request' && 'pr' || '' }}
diff --git a/Project.toml b/Project.toml
index f789f5d..c4b461f 100644
--- a/Project.toml
+++ b/Project.toml
@@ -16,4 +16,4 @@ Statistics = "10745b16-79ce-11e8-05f9-75db01b31331"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [compat]
-DynamicPPL = "0.35"
+DynamicPPL = "0.36"
diff --git a/README.md b/README.md
index 5bd1092..2e83353 100644
--- a/README.md
+++ b/README.md
@@ -21,15 +21,17 @@
 You can edit it there.
 Note that the links-to-existing-GitHub-issues in the table are also defined in this script.
 
+## I want to see the HTML generated by a PR!
+
+The HTML generated by the most recent workflow run (from whichever PR last ran) is published to https://turinglang.org/ADTests/pr.
+
+This is a bit messy, but it works for now on the assumption that there aren't many PRs being worked on simultaneously.
+
 ## What's going on?
 
 The workflow is the most complicated part of this repository.
 This section attempts to explain it from the 'bottom up'; if you prefer a 'top down' approach start by looking at the GitHub Action workflow, `.github/workflows/test.yml`.
 
-Firstly, there is library code for running the benchmarks.
-This is in `lib.jl`; it should (in the near future) be put directly into DynamicPPL.jl.
-Until then, it has to live here.
-
 Under the hood, the main thing that actually runs the AD tests / benchmarks is `main.jl`.
 You can run `julia --project=. main.jl` and it will print some usage information.
 However, it is the Python script `ad.py` that controls how this Julia script is called.
diff --git a/lib.jl b/lib.jl
deleted file mode 100644
index cacbab0..0000000
--- a/lib.jl
+++ /dev/null
@@ -1,186 +0,0 @@
-module Lib
-
-import ADTypes: AbstractADType, AutoForwardDiff
-import Chairmarks: @be
-import DifferentiationInterface as DI
-import DynamicPPL: Model, LogDensityFunction, VarInfo, AbstractVarInfo
-import LogDensityProblems: logdensity, logdensity_and_gradient
-import Random: Random, Xoshiro
-import Statistics: median
-
-"""
-    REFERENCE_ADTYPE
-
-Reference AD backend to use for comparison. In this case, ForwardDiff.jl, since
-it's the default AD backend used in Turing.jl.
-"""
-const REFERENCE_ADTYPE = AutoForwardDiff()
-
-"""
-    ADTestResult
-
-Data structure to store the results of the AD correctness test.
-
-If you want to quickly check whether the result is a success or failure, you
-can use `isnothing(result.error)`.
-""" -struct ADTestResult - "The DynamicPPL model that was tested" - model::Model - "The values at which the model was evaluated" - params::Vector{<:Real} - "The AD backend that was tested" - adtype::AbstractADType - "The absolute tolerance for the value of logp" - value_atol::Real - "The absolute tolerance for the gradient of logp" - grad_atol::Real - "If the test ran, the expected value of logp (calculated using the reference AD backend)" - value_expected::Union{Nothing,Float64} - "If the test ran, the expected gradient of logp (calculated using the reference AD backend)" - grad_expected::Union{Nothing,Vector{Float64}} - "If the test ran, the actual value of logp (calculated using `adtype`)" - value_actual::Union{Nothing,Real} - "If the test ran, the actual gradient of logp (calculated using `adtype`)" - grad_actual::Union{Nothing,Vector{Float64}} - "If the test ran and benchmarking was requested, the time taken by the AD backend to calculate the gradient of logp, divided by the time taken to evaluate logp itself" - time_vs_primal::Union{Nothing,Float64} - "If the test did not run successfully, the error that was thrown" - error::Union{Nothing,Exception} -end - -""" - ADIncorrectException - -Represents an AD test that ran successfully, but failed due to numerical -inaccuracy. -""" -struct ADIncorrectException <: Exception -end - -""" - run_ad( - model::Model, - adtype::ADTypes.AbstractADType; - benchmark=false, - value_atol=1e-6, - grad_atol=1e-6, - varinfo::AbstractVarInfo=VarInfo(model), - params::Vector{<:Real}=varinfo[:], - reference_adtype::ADTypes.AbstractADType=REFERENCE_ADTYPE, - expected_value_and_grad::Union{Nothing,Tuple{Real,Vector{<:Real}}}=nothing, - verbose=true, - )::ADTestResult - -Test the correctness of the AD backend `adtype` for the model `model`. If -`benchmark` is set to `true`, also benchmark the AD backend. By default this is -`false`. - -Returns an [`ADTestResult`](@ref) object, which contains the results of the -test and/or benchmark. - -The signature of this function is complicated. There are two things that must -be provided: - -1. `model` - The model being tested. -2. `adtype` - The AD backend being tested. - -Everything else is optional, and can be categorised into several groups: - -1. _How to specify the VarInfo._ DynamicPPL contains several different types of -VarInfo objects which change the way model evaluation occurs. If you want to -use a specific type of VarInfo, pass it as the `varinfo` argument. Otherwise, -it will default to using a `TypedVarInfo` generated from the model. - -2. _How to specify the parameters._ For maximum control over this, generate a -vector of parameters yourself and pass this as the `params` argument. If you -don't specify this, it will be taken from the contents of the VarInfo. Note -that if the VarInfo is not specified (and thus automatically generated) the -parameters in it will have been sampled from the prior of the model. - -3. _How to specify the results to compare against._ Once logp and its gradient -has been calculated with the specified `adtype`, it must be tested for -correctness. This can be done either by specifying `reference_adtype`, in which -case logp and its gradient will also be calculated with this reference in order -to obtain the ground truth; or by using `expected_value_and_grad`, which is a -tuple of (logp, gradient) that the calculated values must match. The latter is -useful if you are testing multiple AD backends and want to avoid recalculating -the ground truth multiple times. 
-If none of these parameters are specified, that will be used to calculate the
-ground truth.
-
-4. _How to specify the tolerances._ The tolerances for the value and gradient
-can be set using `value_atol` and `grad_atol`. These default to 1e-6.
-
-5. _Whether to output extra logging information._ By default, this function
-prints a message when it runs. To silence it, set `verbose=false`.
-"""
-function run_ad(
-    model::Model,
-    adtype::AbstractADType;
-    benchmark=false,
-    value_atol=1e-6,
-    grad_atol=1e-6,
-    rng::Random.AbstractRNG=Xoshiro(468),
-    varinfo::AbstractVarInfo=VarInfo(rng, model),
-    params::Vector{<:Real}=varinfo[:],
-    reference_adtype::AbstractADType=REFERENCE_ADTYPE,
-    expected_value_and_grad::Union{Nothing,Tuple{Real,Vector{<:Real}}}=nothing,
-    verbose=true,
-    throw_error=true,
-)::ADTestResult
-    try
-        verbose && @info "Running AD on $(model.f) with $(adtype)\n"
-        ldf = LogDensityFunction(model, varinfo; adtype=adtype)
-        verbose && println(Base.stderr, "  params   : $(params)")
-
-        # Calculate ground truth to compare against
-        value_true, grad_true = if expected_value_and_grad === nothing
-            ldf_reference = LogDensityFunction(model, varinfo; adtype=reference_adtype)
-            logdensity_and_gradient(ldf_reference, params)
-        else
-            expected_value_and_grad
-        end
-        verbose && println(Base.stderr, "  expected : $((value_true, grad_true))")
-
-        value, grad = logdensity_and_gradient(ldf, params)
-        if !(grad isa Vector{Float64})
-            # https://github.com/JuliaDiff/DifferentiationInterface.jl/issues/754
-            grad = collect(grad)
-        end
-        verbose && println(Base.stderr, "  actual   : $((value, grad))")
-
-        value_is_correct = isapprox(value, value_true; atol=value_atol)
-        grad_is_correct = isapprox(grad, grad_true; atol=grad_atol)
-
-        maybe_exc = if value_is_correct && grad_is_correct
-            nothing
-        else
-            ADIncorrectException()
-        end
-
-        time_vs_primal = if benchmark
-            primal_bmark = @be (ldf, params) logdensity(_[1], _[2])
-            grad_bmark = @be (ldf, params) logdensity_and_gradient(_[1], _[2])
-            median(grad_bmark).time / median(primal_bmark).time
-        else
-            nothing
-        end
-
-        return ADTestResult(
-            model, params, adtype, value_atol, grad_atol,
-            value_true, grad_true, value, grad, time_vs_primal, maybe_exc
-        )
-    catch e
-        # If we want to throw the error, do so
-        throw_error && throw(e)
-        # otherwise capture and return it
-        println("Error: $e")
-        return ADTestResult(
-            model, params, adtype, value_atol, grad_atol,
-            nothing, nothing, nothing, nothing, nothing, e
-        )
-    end
-end
-
-end # module Lib
diff --git a/main.jl b/main.jl
index 1198168..cb063b7 100644
--- a/main.jl
+++ b/main.jl
@@ -1,5 +1,6 @@
 import Test: @test, @testset
 using DynamicPPL: DynamicPPL, VarInfo
+using DynamicPPL.TestUtils.AD: run_ad, ADResult, ADIncorrectException
 using ADTypes
 using Printf: @printf
 
@@ -26,11 +27,6 @@ ADTYPES = Dict(
 include("models.jl")
 using .Models: MODELS
 
-# Benchmarking code is defined here. In time this will be put into DynamicPPL.
-# See https://github.com/TuringLang/DynamicPPL.jl/pull/882
-include("lib.jl")
-using .Lib: run_ad, ADIncorrectException
-
 # The entry point to this script itself begins here
 if ARGS == ["--list-model-keys"]
     foreach(println, sort(collect(keys(MODELS))))
@@ -39,22 +35,37 @@ elseif ARGS == ["--list-adtype-keys"]
 elseif length(ARGS) == 3 && ARGS[1] == "--run"
     model, adtype = MODELS[ARGS[2]], ADTYPES[ARGS[3]]
 
-    if ARGS[2] == "control_flow"
-        # https://github.com/TuringLang/ADTests/issues/4
-        vi = DynamicPPL.unflatten(VarInfo(model), [0.5, -0.5])
-        params = [-0.5, 0.5]
-        result = run_ad(model, adtype; varinfo=vi, params=params, benchmark=true)
-    else
-        result = run_ad(model, adtype; benchmark=true)
-    end
-
-    if isnothing(result.error)
+    try
+        if ARGS[2] == "control_flow"
+            # https://github.com/TuringLang/ADTests/issues/4
+            vi = DynamicPPL.unflatten(VarInfo(model), [0.5, -0.5])
+            params = [-0.5, 0.5]
+            result = run_ad(model, adtype; varinfo=vi, params=params, benchmark=true)
+        else
+            result = run_ad(model, adtype; benchmark=true)
+        end
+        # If we reached this point, nothing went wrong
         @printf("%.3f", result.time_vs_primal)
-    elseif result.error isa ADIncorrectException
-        println("wrong")
-    else
-        # some other error happened
-        println("error")
+    catch e
+        if e isa ADIncorrectException
+            # First check for gradient components that are outright incorrect
+            for (a, b) in zip(e.grad_expected, e.grad_actual)
+                if !isnan(a) && !isnan(b) && abs(a - b) > 1e-6
+                    println("wrong")
+                    exit()
+                end
+            end
+            # If not, check for NaNs and report those
+            if any(isnan, e.grad_expected) || any(isnan, e.grad_actual)
+                println("NaN")
+            else
+                # Something else went wrong; this shouldn't happen
+                println("wrong")
+            end
+        else
+            # Some other error occurred; just report it as an error
+            println("error")
+        end
     end
 else
     println("Usage: julia main.jl --list-model-keys")
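For context, here is a minimal sketch of the upstream API that this diff migrates to. It assumes DynamicPPL 0.36, where `run_ad` lives in `DynamicPPL.TestUtils.AD` and signals numerical mismatches by throwing `ADIncorrectException` (which carries the expected and actual gradients) rather than returning a result with an `error` field, as the deleted `lib.jl` version did. The `demo` model below is purely illustrative.

```julia
# Sketch of the post-migration workflow, assuming DynamicPPL 0.36.
# The `demo` model is hypothetical; any DynamicPPL model works here.
using DynamicPPL, Distributions, ADTypes
using DynamicPPL.TestUtils.AD: run_ad, ADIncorrectException

@model function demo()
    x ~ Normal()
    y ~ Normal(x, 1.0)
end

try
    # benchmark=true additionally records gradient time relative to the
    # primal evaluation, which is what main.jl prints as the table entry
    result = run_ad(demo(), AutoForwardDiff(); benchmark=true)
    println(result.time_vs_primal)
catch e
    e isa ADIncorrectException || rethrow()
    # The exception carries both gradients, so callers can distinguish
    # genuine mismatches from NaN-related failures, as main.jl now does
    @show e.grad_expected e.grad_actual
end
```

This is also why `main.jl` no longer inspects `result.error`: a failed test never produces a result object, so all classification happens in the `catch` branch.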