using Test
using MLJTuning
using MLJBase
using StatisticalMeasures
using StableRNGs
import MLJModelInterface
import StatisticalMeasures: CategoricalDistributions, Distributions

# We define a density estimator that fits a `UnivariateFinite` distribution to
# categorical data, with a Laplace smoothing parameter, α. (A quick sanity
# check of the smoothing arithmetic follows the implementation below.)

mutable struct UnivariateFiniteFitter <: MLJModelInterface.Probabilistic
    alpha::Float64
end
UnivariateFiniteFitter(; alpha=1.0) = UnivariateFiniteFitter(alpha)

function MLJModelInterface.fit(model::UnivariateFiniteFitter,
                               verbosity, X, y)

    α = model.alpha
    N = length(y)
    _classes = classes(y)
    d = length(_classes)

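    # Additive (Laplace) smoothing: every class receives a pseudocount of α, so
    # a class unobserved in training gets probability α/(N + αd) instead of zero.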
    frequency_given_class = Distributions.countmap(y)
    prob_given_class =
        Dict(c => (get(frequency_given_class, c, 0) + α)/(N + α*d) for c in _classes)

    fitresult = CategoricalDistributions.UnivariateFinite(prob_given_class)

    report = (params=Distributions.params(fitresult),)
    cache = nothing

    verbosity > 0 && @info "Fitted a $fitresult"

    return fitresult, cache, report
end

MLJModelInterface.predict(model::UnivariateFiniteFitter,
                          fitresult,
                          X) = fitresult


MLJModelInterface.input_scitype(::Type{<:UnivariateFiniteFitter}) =
    Nothing
MLJModelInterface.target_scitype(::Type{<:UnivariateFiniteFitter}) =
    AbstractVector{<:Finite}
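
# A quick sanity check of the smoothing arithmetic (a sketch; it calls
# `MLJModelInterface.fit` directly rather than going through a machine): for
# y = [a, b, b], classes {a, b} and α = 2, the smoothed probabilities are
# p(a) = (1 + 2)/(3 + 2*2) = 3/7 and p(b) = (2 + 2)/(3 + 2*2) = 4/7.
@testset "Laplace smoothing sanity check" begin
    y = coerce(collect("abb"), Multiclass)
    d, _, _ = MLJModelInterface.fit(UnivariateFiniteFitter(alpha=2.0), 0, nothing, y)
    @test Distributions.pdf(d, 'a') ≈ 3/7
    @test Distributions.pdf(d, 'b') ≈ 4/7
end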

# This test will fail if the MLJ test dependency MLJBase is < 1.11:
@testset "tuning for density estimators" begin
    y = coerce(collect("abbabbc"), Multiclass)
    X = nothing

    train, test = partition(eachindex(y), 3/7)
    # For the above train-test split, a hand calculation (sketched below) shows
    # that, when optimizing against log loss:
    best_alpha = 2.0
    best_loss = (4log(9) - log(3) - 2log(4) - log(2))/4
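    # Derivation sketch: train = [a, b, b] and test = [a, b, b, c], with classes
    # {a, b, c}. With α = 2 the fitted probabilities are p(a) = 3/9, p(b) = 4/9
    # and p(c) = 2/9, so the mean log loss on the test set is
    # -(log(3/9) + 2log(4/9) + log(2/9))/4 = (4log(9) - log(3) - 2log(4) - log(2))/4.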

    model = UnivariateFiniteFitter(alpha=0)
    r = range(model, :alpha, values=[0.1, 1, 1.5, 2, 2.5, 10])
    tmodel = TunedModel(
        model,
        tuning=Grid(shuffle=false),
        range=r,
        resampling=[(train, test),],
        measure=log_loss,
        compact_history=false,
    )

    mach = machine(tmodel, X, y)
    fit!(mach, verbosity=0)
    best = report(mach).best_history_entry
    @test best.model.alpha == best_alpha
    @test best.evaluation.measurement[1] ≈ best_loss
end

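# Returning `true` is a common Julia test-suite convention, allowing a runner
# to verify this file with `@test include(...)`.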
true