diff --git a/.github/workflows/generate_website.yml b/.github/workflows/generate_website.yml
index f8c4ac6..223fed5 100644
--- a/.github/workflows/generate_website.yml
+++ b/.github/workflows/generate_website.yml
@@ -86,7 +86,6 @@ jobs:
 
   collect-results:
     runs-on: ubuntu-latest
-    if: github.event_name != 'pull_request'
     needs: [setup-keys, run-models]
     steps:
@@ -106,3 +105,4 @@ jobs:
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
           publish_dir: ./html
+          destination_dir: ${{ github.event_name == 'pull_request' && 'pr' || '' }}
diff --git a/Project.toml b/Project.toml
index f789f5d..c4b461f 100644
--- a/Project.toml
+++ b/Project.toml
@@ -16,4 +16,4 @@ Statistics = "10745b16-79ce-11e8-05f9-75db01b31331"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [compat]
-DynamicPPL = "0.35"
+DynamicPPL = "0.36"
diff --git a/README.md b/README.md
index 5bd1092..2e83353 100644
--- a/README.md
+++ b/README.md
@@ -21,15 +21,17 @@
 You can edit it there.
 Note that the links-to-existing-GitHub-issues in the table are also defined in this script.
 
+## I want to see the HTML generated by a PR!
+
+The HTML generated by the most recent workflow run (from whichever PR last ran) is published to https://turinglang.org/ADTests/pr.
+
+This is a bit messy, but it works for now on the assumption that there aren't many PRs being worked on simultaneously.
+
 ## What's going on?
 
 The workflow is the most complicated part of this repository.
 This section attempts to explain it from the 'bottom up'; if you prefer a 'top down' approach start by looking at the GitHub Action workflow, `.github/workflows/test.yml`.
 
-Firstly, there is library code for running the benchmarks.
-This is in `lib.jl`; it should (in the near future) be put directly into DynamicPPL.jl.
-Until then, it has to live here.
-
 Under the hood, the main thing that actually runs the AD tests / benchmarks is `main.jl`.
 You can run `julia --project=. main.jl` and it will print some usage information.
 However, it is the Python script `ad.py` that controls how this Julia script is called.
diff --git a/lib.jl b/lib.jl
deleted file mode 100644
index cacbab0..0000000
--- a/lib.jl
+++ /dev/null
@@ -1,186 +0,0 @@
-module Lib
-
-import ADTypes: AbstractADType, AutoForwardDiff
-import Chairmarks: @be
-import DifferentiationInterface as DI
-import DynamicPPL: Model, LogDensityFunction, VarInfo, AbstractVarInfo
-import LogDensityProblems: logdensity, logdensity_and_gradient
-import Random: Random, Xoshiro
-import Statistics: median
-
-"""
-    REFERENCE_ADTYPE
-
-Reference AD backend to use for comparison. In this case, ForwardDiff.jl, since
-it's the default AD backend used in Turing.jl.
-"""
-const REFERENCE_ADTYPE = AutoForwardDiff()
-
-"""
-    ADTestResult
-
-Data structure to store the results of the AD correctness test.
-
-If you want to quickly check whether the result is a success or failure, you
-can use `isnothing(result.error)`.
-""" -struct ADTestResult - "The DynamicPPL model that was tested" - model::Model - "The values at which the model was evaluated" - params::Vector{<:Real} - "The AD backend that was tested" - adtype::AbstractADType - "The absolute tolerance for the value of logp" - value_atol::Real - "The absolute tolerance for the gradient of logp" - grad_atol::Real - "If the test ran, the expected value of logp (calculated using the reference AD backend)" - value_expected::Union{Nothing,Float64} - "If the test ran, the expected gradient of logp (calculated using the reference AD backend)" - grad_expected::Union{Nothing,Vector{Float64}} - "If the test ran, the actual value of logp (calculated using `adtype`)" - value_actual::Union{Nothing,Real} - "If the test ran, the actual gradient of logp (calculated using `adtype`)" - grad_actual::Union{Nothing,Vector{Float64}} - "If the test ran and benchmarking was requested, the time taken by the AD backend to calculate the gradient of logp, divided by the time taken to evaluate logp itself" - time_vs_primal::Union{Nothing,Float64} - "If the test did not run successfully, the error that was thrown" - error::Union{Nothing,Exception} -end - -""" - ADIncorrectException - -Represents an AD test that ran successfully, but failed due to numerical -inaccuracy. -""" -struct ADIncorrectException <: Exception -end - -""" - run_ad( - model::Model, - adtype::ADTypes.AbstractADType; - benchmark=false, - value_atol=1e-6, - grad_atol=1e-6, - varinfo::AbstractVarInfo=VarInfo(model), - params::Vector{<:Real}=varinfo[:], - reference_adtype::ADTypes.AbstractADType=REFERENCE_ADTYPE, - expected_value_and_grad::Union{Nothing,Tuple{Real,Vector{<:Real}}}=nothing, - verbose=true, - )::ADTestResult - -Test the correctness of the AD backend `adtype` for the model `model`. If -`benchmark` is set to `true`, also benchmark the AD backend. By default this is -`false`. - -Returns an [`ADTestResult`](@ref) object, which contains the results of the -test and/or benchmark. - -The signature of this function is complicated. There are two things that must -be provided: - -1. `model` - The model being tested. -2. `adtype` - The AD backend being tested. - -Everything else is optional, and can be categorised into several groups: - -1. _How to specify the VarInfo._ DynamicPPL contains several different types of -VarInfo objects which change the way model evaluation occurs. If you want to -use a specific type of VarInfo, pass it as the `varinfo` argument. Otherwise, -it will default to using a `TypedVarInfo` generated from the model. - -2. _How to specify the parameters._ For maximum control over this, generate a -vector of parameters yourself and pass this as the `params` argument. If you -don't specify this, it will be taken from the contents of the VarInfo. Note -that if the VarInfo is not specified (and thus automatically generated) the -parameters in it will have been sampled from the prior of the model. - -3. _How to specify the results to compare against._ Once logp and its gradient -has been calculated with the specified `adtype`, it must be tested for -correctness. This can be done either by specifying `reference_adtype`, in which -case logp and its gradient will also be calculated with this reference in order -to obtain the ground truth; or by using `expected_value_and_grad`, which is a -tuple of (logp, gradient) that the calculated values must match. The latter is -useful if you are testing multiple AD backends and want to avoid recalculating -the ground truth multiple times. 
-If none of these parameters are specified, that will be used to calculate the
-ground truth.
-
-4. _How to specify the tolerances._ The tolerances for the value and gradient
-can be set using `value_atol` and `grad_atol`. These default to 1e-6.
-
-5. _Whether to output extra logging information._ By default, this function
-prints a message when it runs. To silence it, set `verbose=false`.
-"""
-function run_ad(
-    model::Model,
-    adtype::AbstractADType;
-    benchmark=false,
-    value_atol=1e-6,
-    grad_atol=1e-6,
-    rng::Random.AbstractRNG=Xoshiro(468),
-    varinfo::AbstractVarInfo=VarInfo(rng, model),
-    params::Vector{<:Real}=varinfo[:],
-    reference_adtype::AbstractADType=REFERENCE_ADTYPE,
-    expected_value_and_grad::Union{Nothing,Tuple{Real,Vector{<:Real}}}=nothing,
-    verbose=true,
-    throw_error=true,
-)::ADTestResult
-    try
-        verbose && @info "Running AD on $(model.f) with $(adtype)\n"
-        ldf = LogDensityFunction(model, varinfo; adtype=adtype)
-        verbose && println(Base.stderr, "  params   : $(params)")
-
-        # Calculate ground truth to compare against
-        value_true, grad_true = if expected_value_and_grad === nothing
-            ldf_reference = LogDensityFunction(model, varinfo; adtype=reference_adtype)
-            logdensity_and_gradient(ldf_reference, params)
-        else
-            expected_value_and_grad
-        end
-        verbose && println(Base.stderr, "  expected : $((value_true, grad_true))")
-
-        value, grad = logdensity_and_gradient(ldf, params)
-        if !(grad isa Vector{Float64})
-            # https://github.com/JuliaDiff/DifferentiationInterface.jl/issues/754
-            grad = collect(grad)
-        end
-        verbose && println(Base.stderr, "  actual   : $((value, grad))")
-
-        value_is_correct = isapprox(value, value_true; atol=value_atol)
-        grad_is_correct = isapprox(grad, grad_true; atol=grad_atol)
-
-        maybe_exc = if value_is_correct && grad_is_correct
-            nothing
-        else
-            ADIncorrectException()
-        end
-
-        time_vs_primal = if benchmark
-            primal_bmark = @be (ldf, params) logdensity(_[1], _[2])
-            grad_bmark = @be (ldf, params) logdensity_and_gradient(_[1], _[2])
-            median(grad_bmark).time / median(primal_bmark).time
-        else
-            nothing
-        end
-
-        return ADTestResult(
-            model, params, adtype, value_atol, grad_atol,
-            value_true, grad_true, value, grad, time_vs_primal, maybe_exc
-        )
-    catch e
-        # If we want to throw the error, do so
-        throw_error && throw(e)
-        # otherwise capture and return it
-        println("Error: $e")
-        return ADTestResult(
-            model, params, adtype, value_atol, grad_atol,
-            nothing, nothing, nothing, nothing, nothing, e
-        )
-    end
-end
-
-end # module Lib
diff --git a/main.jl b/main.jl
index 1198168..cb063b7 100644
--- a/main.jl
+++ b/main.jl
@@ -1,5 +1,6 @@
 import Test: @test, @testset
 using DynamicPPL: DynamicPPL, VarInfo
+using DynamicPPL.TestUtils.AD: run_ad, ADResult, ADIncorrectException
 using ADTypes
 using Printf: @printf
 
@@ -26,11 +27,6 @@ ADTYPES = Dict(
 include("models.jl")
 using .Models: MODELS
 
-# Benchmarking code is defined here. In time this will be put into DynamicPPL.
-# See https://github.com/TuringLang/DynamicPPL.jl/pull/882
-include("lib.jl")
-using .Lib: run_ad, ADIncorrectException
-
 # The entry point to this script itself begins here
 if ARGS == ["--list-model-keys"]
     foreach(println, sort(collect(keys(MODELS))))
@@ -39,22 +35,37 @@ elseif ARGS == ["--list-adtype-keys"]
 elseif length(ARGS) == 3 && ARGS[1] == "--run"
     model, adtype = MODELS[ARGS[2]], ADTYPES[ARGS[3]]
 
-    if ARGS[2] == "control_flow"
-        # https://github.com/TuringLang/ADTests/issues/4
-        vi = DynamicPPL.unflatten(VarInfo(model), [0.5, -0.5])
-        params = [-0.5, 0.5]
-        result = run_ad(model, adtype; varinfo=vi, params=params, benchmark=true)
-    else
-        result = run_ad(model, adtype; benchmark=true)
-    end
-
-    if isnothing(result.error)
+    try
+        if ARGS[2] == "control_flow"
+            # https://github.com/TuringLang/ADTests/issues/4
+            vi = DynamicPPL.unflatten(VarInfo(model), [0.5, -0.5])
+            params = [-0.5, 0.5]
+            result = run_ad(model, adtype; varinfo=vi, params=params, benchmark=true)
+        else
+            result = run_ad(model, adtype; benchmark=true)
+        end
+        # If we reached this point, nothing went wrong
         @printf("%.3f", result.time_vs_primal)
-    elseif result.error isa ADIncorrectException
-        println("wrong")
-    else
-        # some other error happened
-        println("error")
+    catch e
+        if e isa ADIncorrectException
+            # First check for gradient components that are outright incorrect
+            for (a, b) in zip(e.grad_expected, e.grad_actual)
+                if !isnan(a) && !isnan(b) && abs(a - b) > 1e-6
+                    println("wrong")
+                    exit()
+                end
+            end
+            # If not, check for NaNs and report those
+            if any(isnan, e.grad_expected) || any(isnan, e.grad_actual)
+                println("NaN")
+            else
+                # Something else went wrong; this shouldn't happen
+                println("wrong")
+            end
+        else
+            # Some other error occurred; just report it as an error
+            println("error")
+        end
     end
 else
     println("Usage: julia main.jl --list-model-keys")
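For context, here is a minimal sketch of the upstream API that this diff migrates to. It assumes DynamicPPL 0.36, where `run_ad` lives in `DynamicPPL.TestUtils.AD` and signals numerical mismatches by throwing `ADIncorrectException` (which carries the expected and actual gradients) rather than returning a result with an `error` field, as the deleted `lib.jl` version did. The `demo` model below is purely illustrative.

```julia
# Sketch of the post-migration workflow, assuming DynamicPPL 0.36.
# The `demo` model is hypothetical; any DynamicPPL model works here.
using DynamicPPL, Distributions, ADTypes
using DynamicPPL.TestUtils.AD: run_ad, ADIncorrectException

@model function demo()
    x ~ Normal()
    y ~ Normal(x, 1.0)
end

try
    # benchmark=true additionally records gradient time relative to the
    # primal evaluation, which is what main.jl prints as the table entry
    result = run_ad(demo(), AutoForwardDiff(); benchmark=true)
    println(result.time_vs_primal)
catch e
    e isa ADIncorrectException || rethrow()
    # The exception carries both gradients, so callers can distinguish
    # genuine mismatches from NaN-related failures, as main.jl now does
    @show e.grad_expected e.grad_actual
end
```

This is also why `main.jl` no longer inspects `result.error`: a failed test never produces a result object, so all classification happens in the `catch` branch.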