add level=4 accelerated_evaluation test

ablaom · ablaom · commit dc2b045a61ac · 2022-05-31T14:27:17.000+12:00
diff --git a/Project.toml b/Project.toml
@@ -5,7 +5,9 @@ version = "0.1.0"
 
 [deps]
 MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
+MLJTuning = "03970b2e-30c4-11ea-3135-d1576263f10f"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [compat]
 MLJ = "0.18"
diff --git a/src/MLJTestIntegration.jl b/src/MLJTestIntegration.jl
@@ -1,13 +1,22 @@
 module MLJTestIntegration
 
+const N_MODELS_FOR_REPEATABILITY_TEST = 3
+
 using MLJ
 using Pkg
+using .Threads
+using Test
 
 include("attemptors.jl")
 include("test.jl")
 include("special_cases.jl")
 include("dummy_model.jl")
 
+function __init__()  # executed by Julia each time the module is loaded
+    global RESOURCES = (CPU1(), CPUThreads())  # module-level global, assigned at load
+    @info "Testing with $(nthreads()) threads. "  # thread count of the current session
+end
+
 using .DummyModel
 
 end # module
diff --git a/src/attemptors.jl b/src/attemptors.jl
@@ -1,9 +1,12 @@
+const ERR_INCONSISTENT_RESULTS =
+    "Different computational resources are giving different results. "
+
 """
     attempt(f, message; throw=false)
 
 Return `(f(), "✓") if `f()` executes without throwing an
 exception. Otherwise, return `(ex, "×"), where `ex` is the exception
-caught. Only truly throw the exception if `throw=true`. 
+caught. Only truly throw the exception if `throw=true`.
 
 If `message` is not empty, then it is logged to `Info`, together with
 the second return value ("✓" or "×").
@@ -123,32 +126,51 @@ function threshold_prediction(model, data...; throw=false, verbosity=1)
     end
 end
 
-function evaluation(measure, model, data...; throw=false, verbosity=1)
+function evaluation(measure, model, resources, data...; throw=false, verbosity=1)
     message = "[:evaluation] Evaluating performance "
     attempt(finalize(message, verbosity); throw) do
-        evaluate(model, data...;
-                 measure=measure,
-                 resampling=Holdout(),
-                 verbosity=0)
+        es = map(resources) do accel
+            evaluate(model, data...;
+                     measure=measure,
+                     resampling=Holdout(),
+                     acceleration=accel,
+                     verbosity=0)
+        end
+        ms = map(e -> e.measurement, es)
+        # explicit check: `@assert` may be disabled, silently passing the test
+        all(≈(first(ms)), Iterators.drop(ms, 1)) || error(ERR_INCONSISTENT_RESULTS)
+        return first(es)
     end
 end
 
-function tuned_pipe_evaluation(measure, model, data...; throw=false, verbosity=1)
+function tuned_pipe_evaluation(
+    measure,
+    model,
+    data...;
+    throw=false,
+    verbosity=1,
+)
     message = "[:tuned_pipe_evaluation] Evaluating perfomance in a tuned pipeline "
     attempt(finalize(message, verbosity); throw) do
         pipe = identity |> model
-        tuned_pipe = TunedModel(models=[pipe,],
-                                measure=measure)
-        evaluate(tuned_pipe, data...;
-                 measure=measure,
-                 verbosity=0);
+        tuned_pipe = TunedModel(
+            models=[pipe,],  # one-element list containing only `pipe`
+            measure=measure,
+        )
+        evaluate(
+            tuned_pipe, data...;  # no `resampling` or `acceleration` keyword is passed
+            measure=measure,
+            verbosity=0,
+        )
     end
 end
 
 function ensemble_prediction(model, data...; throw=false, verbosity=1)
     attempt(finalize("[:ensemble_prediction] Ensembling ", verbosity); throw) do
-        imodel = EnsembleModel(model=model,
-                               n=2)
+        imodel = EnsembleModel(
+            model=model,
+            n=2,
+        )
         mach = machine(imodel, data...)
         fit!(mach, verbosity=0)
         predict(mach, first(data))
diff --git a/src/test.jl b/src/test.jl
@@ -27,11 +27,12 @@ with automatic code loading" below.
 
 The extent of testing is controlled by `level`:
 
-|`level`          | description                      | tests (full list below) |
-|:----------------|:---------------------------------|:------------------------|
-| 1               | test code loading                | `:model_type`           |
-| 2 (default)     | basic test of model interface    | first four tests        |
-| 3               | comprehensive                    | all applicable tests    |
+|`level`          | description                       | tests (full list below) |
+|:----------------|:----------------------------------|:------------------------|
+| 1               | test code loading                 | `:model_type`           |
+| 2 (default)     | basic test of model interface     | first four tests        |
+| 3               | comprehensive CPU1()              | all CPU1() tests        |
+| 4               | comprehensive CPU1()/CPUThreads() | all tests               |
 
 By default, exceptions caught in tests are not thrown. If
 `throw=true`, testing will terminate at the first execption
@@ -131,6 +132,10 @@ These additional tests are applied to `Supervised` models:
   (metric), evaluate the performance of the model using `evaluate!`
   and a `Holdout` set.
 
+- `:accelerated_evaluation`: Assuming the model appears to make
+  repeatable predictions on retraining, repeat the `:evaluation` test
+  using `CPUThreads()` acceleration and check agreement with the `CPU1()` case.
+
 - `:tuned_pipe_evaluation`: Repeat the `:evauation` test but first
   insert model in a pipeline with a trivial pre-processing step
   (applies the identity transformation) and wrap in `TunedModel` (only
@@ -156,11 +161,12 @@ function test(model_proxies, data...; mod=Main, level=2, throw=false, verbosity=
         :fitted_machine,
         :operations,
         :evaluation,
+        :accelerated_evaluation,
         :tuned_pipe_evaluation,
         :threshold_prediction,
         :ensemble_prediction,
         :iteration_prediction
-    ), NTuple{11, String}}}(undef, nproxies)
+    ), NTuple{12, String}}}(undef, nproxies)
 
     # summary table row corresponding to all tests skipped:
     row0 = (
@@ -171,6 +177,7 @@ function test(model_proxies, data...; mod=Main, level=2, throw=false, verbosity=
         fitted_machine = "-",
         operations = "-",
         evaluation = "-",
+        accelerated_evaluation = "-",
         tuned_pipe_evaluation = "-",
         threshold_prediction = "-",
         ensemble_prediction = "-",
@@ -269,10 +276,56 @@ function test(model_proxies, data...; mod=Main, level=2, throw=false, verbosity=
 
         # evaluation:
         evaluation, outcome =
-            MLJTestIntegration.evaluation(measure, model_instance, data...; throw, verbosity)
+            MLJTestIntegration.evaluation(
+                measure,
+                model_instance,
+                [CPU1(),],
+                data...;
+                throw,
+                verbosity,
+            )
         row = update(row, i, :evaluation, evaluation, outcome)
         outcome == "×" && continue
 
+        # determine computational resources to test; we only test more
+        # than CPU1() if repeated CPU1() evaluations agree across training
+        # runs (taken as evidence the model is "deterministic", eg, not
+        # dependent on an unseeded RNG):
+        resources = MLJ.AbstractResource[] # fallback
+        if level > 3
+            per_fold = evaluation.per_fold[1]
+            per_folds = map(1:(N_MODELS_FOR_REPEATABILITY_TEST - 1)) do _
+                e, o = MLJTestIntegration.evaluation(
+                    measure,
+                    model_instance,
+                    [CPU1(),],
+                    data...;
+                    throw=false,
+                    verbosity,
+                )
+                o == "✓" || return nothing # failed repeat => treated as not repeatable
+                e.per_fold[1]
+            end
+            if all(pf -> !isnothing(pf) && pf ≈ per_fold, per_folds) # guard `nothing`
+                resources = RESOURCES
+            end
+        end
+
+        if length(resources) > 1
+            # accelerated_evaluation:
+            evaluation, outcome =
+                MLJTestIntegration.evaluation(
+                    measure,
+                    model_instance,
+                    resources,
+                    data...;
+                    throw,
+                    verbosity,
+                )
+            row = update(row, i, :accelerated_evaluation, evaluation, outcome)
+            outcome == "×" && continue
+        end
+
         # tuned_pipe_evaluation:
         tuned_pipe_evaluation, outcome =
             MLJTestIntegration.tuned_pipe_evaluation(
@@ -287,15 +340,26 @@ function test(model_proxies, data...; mod=Main, level=2, throw=false, verbosity=
 
         # ensemble_prediction:
         ensemble_prediction, outcome =
-            MLJTestIntegration.ensemble_prediction(model_instance, data...; throw, verbosity)
+            MLJTestIntegration.ensemble_prediction(
+                model_instance,
+                data...;
+                throw,
+                verbosity,
+            )
         row = update(row, i, :ensemble_prediction, ensemble_prediction, outcome)
         outcome == "×" && continue
 
         isnothing(iteration_parameter(model_instance)) &&  continue
 
         # iteration prediction:
         iteration_prediction, outcome =
-            MLJTestIntegration.iteration_prediction(measure, model_instance, data...; throw, verbosity)
+            MLJTestIntegration.iteration_prediction(
+                measure,
+                model_instance,
+                data...;
+                throw,
+                verbosity,
+            )
         row = update(row, i, :iteration_prediction, iteration_prediction, outcome)
         outcome == "×" && continue
     end
diff --git a/test/attemptors.jl b/test/attemptors.jl
@@ -4,9 +4,15 @@
     good() = 42
 
     @test (@test_logs MLJTestIntegration.attempt(bad, "")) == (e, "×")
-    @test (@test_logs (:info, "look ×") MLJTestIntegration.attempt(bad, "look "))  == (e, "×")
+    @test(@test_logs(
+        (:info, "look ×"),
+        MLJTestIntegration.attempt(bad, "look "),
+    )  == (e, "×"))
     @test (@test_logs MLJTestIntegration.attempt(good, "")) == (42, "✓")
-    @test (@test_logs (:info, "look ✓") MLJTestIntegration.attempt(good, "look "))  == (42, "✓")
+    @test (@test_logs(
+        (:info, "look ✓"),
+        MLJTestIntegration.attempt(good, "look "),
+    )  == (42, "✓"))
     @test_throws e MLJTestIntegration.attempt(bad, ""; throw=true)
 end
 
diff --git a/test/test.jl b/test/test.jl
@@ -11,6 +11,7 @@ expected_summary1 = (
     fitted_machine = "✓",
     operations = "predict",
     evaluation = "✓",
+    accelerated_evaluation = "✓",
     tuned_pipe_evaluation = "✓",
     threshold_prediction = "✓",
     ensemble_prediction = "✓",
@@ -25,6 +26,7 @@ expected_summary2 = (
     fitted_machine = "✓",
     operations = "predict",
     evaluation = "✓",
+    accelerated_evaluation = "✓",
     tuned_pipe_evaluation = "✓",
     threshold_prediction = "-",
     ensemble_prediction = "✓",
@@ -41,7 +43,7 @@ expected_summary2 = (
             X,
             y;
             mod=@__MODULE__,
-            level=3,
+            level=4,
             verbosity=0
         )
     @test isempty(fails)
@@ -61,7 +63,7 @@ end
             X,
             y;
             mod=@__MODULE__,
-            level=3,
+            level=4,
             verbosity=0
         )
     @test isempty(fails)
@@ -109,6 +111,7 @@ end
         fitted_machine = "×",
         operations = "-",
         evaluation = "-",
+        accelerated_evaluation = "-",
         tuned_pipe_evaluation = "-",
         threshold_prediction = "-",
         ensemble_prediction = "-",
@@ -123,6 +126,7 @@ end
         fitted_machine = "✓",
         operations = "predict",
         evaluation = "×",
+        accelerated_evaluation = "-",
         tuned_pipe_evaluation = "-",
         threshold_prediction = "-",
         ensemble_prediction = "-",
@@ -201,6 +205,7 @@ end
         fitted_machine = "-",
         operations = "-",
         evaluation = "-",
+        accelerated_evaluation = "-",
         tuned_pipe_evaluation = "-",
         threshold_prediction = "-",
         ensemble_prediction = "-",
@@ -225,11 +230,37 @@ end
         fitted_machine = "✓",
         operations = "predict",
         evaluation = "-",
+        accelerated_evaluation = "-",
         tuned_pipe_evaluation = "-",
         threshold_prediction = "-",
         ensemble_prediction = "-",
         iteration_prediction = "-",
     )
+
+    # level=4:
+    fails, summary  =
+        @test_logs MLJTestIntegration.test(
+            classifiers,
+            X,
+            y;
+            mod=@__MODULE__,
+            level=4,
+            verbosity=0)
+    @test isempty(fails)
+    @test summary[1] == (
+        name = "ConstantClassifier",
+        package_name = "MLJModels",
+        model_type = "✓",
+        model_instance = "✓",
+        fitted_machine = "✓",
+        operations = "predict",
+        evaluation = "✓",
+        accelerated_evaluation = "✓",
+        tuned_pipe_evaluation = "✓",
+        threshold_prediction = "✓",
+        ensemble_prediction = "✓",
+        iteration_prediction = "-",
+    )
 end
 
 @testset "iterative model" begin
@@ -252,8 +283,10 @@ end
         fitted_machine = "✓",
         operations = "predict",
         evaluation = "✓",
+        accelerated_evaluation = "-",
         tuned_pipe_evaluation = "✓",
         threshold_prediction = "-",
         ensemble_prediction = "✓",
-        iteration_prediction = "✓",)
+        iteration_prediction = "✓",
+    )
 end