Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 1 addition & 11 deletions src/confusion_matrices.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,6 @@ const CM = "ConfusionMatrices"
# Alias matching either a `CategoricalArray{T, N}` or a `SubArray{T, N}` whose parent
# array is a `CategoricalArray`.
const CatArrOrSub{T, N} =
Union{CategoricalArray{T, N}, SubArray{T, N, <:CategoricalArray}}

# Build the message used to warn that `levels` carry no explicit ordering. The message
# reports the order in use (after unwrapping the levels) and, for exactly two levels,
# names the second one as the "positive" level.
function WARN_UNORDERED(levels)
    raw_levels = CategoricalArrays.unwrap.(levels)
    message = "Levels not explicitly ordered. "*
        "Using the order $raw_levels. "
    length(levels) == 2 || return message
    return message * "The \"positive\" level is $(raw_levels[2]). "
end

const ERR_INDEX_ACCESS_DENIED = ErrorException(
"Direct access by index of unordered confusion matrices dissallowed. "*
"Access by level, as in `some_confusion_matrix(\"male\", \"female\")` or first "*
Expand Down Expand Up @@ -343,7 +333,7 @@ Return the regular `Matrix` associated with confusion matrix `m`.
"""
matrix(cm::ConfusionMatrix{N,true}; kwargs...) where N = cm.mat
# The method above returns the stored matrix directly; it accepts and ignores any
# keyword arguments. The method below handles `ConfusionMatrix{N,false}` (presumably
# the "unordered" case, judging by the warning it emits - confirm against the type
# definition). Unless `warn=false` is passed, it warns about the level order in use
# before returning the stored matrix.
@inline function matrix(cm::ConfusionMatrix{N,false}; warn=true) where N
warn && @warn WARN_UNORDERED(levels(cm))
warn && warn_unordered(levels(cm))
cm.mat
end

Expand Down
115 changes: 115 additions & 0 deletions src/probabilistic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -536,3 +536,118 @@ $DOC_DISTRIBUTIONS
SphericalScore
"$SphericalScoreDoc"
const spherical_score = SphericalScore()


# ---------------------------------------------------------------------
# Continuous Boyce Index
# Parameters of the continuous Boyce index.
#
# - `n_bins`: number of bins the score range is divided into (at least 2, since a
#   correlation across bins needs two or more points).
# - `bin_overlap`: width of each bin as a fraction of the score range, in `(0, 1]`.
# - `min`, `max`: bounds of the score range, or `nothing` to derive each bound from the
#   scores being evaluated.
# - `cor`: function called as `cor(ratios, bin_positions)` to compute the index.
#
# The keyword constructor throws an `ArgumentError` for parameter values that cannot
# produce a meaningful binning.
struct _ContinuousBoyceIndex
    n_bins::Integer
    bin_overlap::AbstractFloat
    min::Union{AbstractFloat, Nothing}
    max::Union{AbstractFloat, Nothing}
    cor::Function
    function _ContinuousBoyceIndex(;
        n_bins = 101,
        bin_overlap = 0.1,
        min = nothing,
        max = nothing,
        cor = StatsBase.corspearman,
    )
        n_bins >= 2 ||
            throw(ArgumentError("`n_bins` must be at least 2, got $n_bins"))
        0 < bin_overlap <= 1 ||
            throw(ArgumentError("`bin_overlap` must lie in (0, 1], got $bin_overlap"))
        if !isnothing(min) && !isnothing(max)
            min < max ||
                throw(ArgumentError("`min` must be smaller than `max`, got $min and $max"))
        end
        return new(n_bins, bin_overlap, min, max, cor)
    end
end

# Public constructor: build the core measure from the keyword arguments, then pass it
# through `robust_measure` and `fussy_measure`, in that order.
function ContinuousBoyceIndex(; kw...)
    core = _ContinuousBoyceIndex(; kw...)
    return fussy_measure(robust_measure(core))
end

# Evaluate the measure on probabilistic predictions `ŷ` and ground truth `y`.
#
# The last class of the first prediction is taken as the positive class, and the
# probability assigned to it is used as the score of each observation. Bounds not fixed
# in `m` are taken from the observed scores. Pass `warn=false` to skip the
# unordered-levels warning.
function (m::_ContinuousBoyceIndex)(ŷ::UnivariateFiniteArray, y::NonMissingCatArrOrSub; warn=true)
    if warn
        warn_unordered(levels(y))
    end

    positive_class = last(classes(first(ŷ)))
    scores = pdf.(ŷ, positive_class)

    ma = m.max === nothing ? maximum(scores) : m.max
    mi = m.min === nothing ? minimum(scores) : m.min
    score_range = ma - mi
    binwidth = m.bin_overlap * score_range

    return _cbi(scores, y, positive_class, m.n_bins, binwidth, ma, mi, m.cor)
end

# Core computation of the continuous Boyce index.
#
# Arguments:
# - `scores`: score of each observation for the positive class.
# - `y`: observed labels, in the same order as `scores`.
# - `positive_class`: the label counted as positive; every other label is negative.
# - `nbins`: number of bins.
# - `binwidth`: width of each bin, in score units.
# - `ma`, `mi`: upper and lower end of the score range covered by the bins.
# - `cor`: function called as `cor(ratios, binstarts)`.
#
# Returns the value of `cor` applied to the per-bin ratio of the positive fraction to
# the negative fraction and the lower edge of the corresponding bins.
#
# Throws an `ErrorException` when no bin is usable or when `cor` returns `NaN`.
function _cbi(scores, y, positive_class, nbins, binwidth, ma, mi, cor)
    # Bin `i` is the closed interval `[binstarts[i], binends[i]]`.
    binstarts = range(mi, stop=ma-binwidth, length=nbins)
    binends = range(mi + binwidth, stop=ma, length=nbins)

    # Sort once, so that the members of each bin form a contiguous index range that can
    # be located by binary search.
    sorted_indices = sortperm(scores)
    sorted_scores = view(scores, sorted_indices)
    sorted_y = view(y, sorted_indices)

    tot_positive = count(==(positive_class), y)
    tot_negative = length(y) - tot_positive

    n_positive = zeros(Int, nbins)
    n_negative = zeros(Int, nbins)

    for i in 1:nbins
        bin_index_first = searchsortedfirst(sorted_scores, binstarts[i])
        bin_index_last = searchsortedlast(sorted_scores, binends[i])
        members = view(sorted_y, bin_index_first:bin_index_last)
        n_positive[i] = count(==(positive_class), members)
        n_negative[i] = length(members) - n_positive[i]
    end

    # omit bins with no negatives - we don't want to divide by zero
    keep = n_negative .> 0
    kept_starts = binstarts[keep]

    # ratio of the positive fraction to the negative fraction in each remaining bin
    binmeans = (n_positive[keep] ./ tot_positive) ./ (n_negative[keep] ./ tot_negative)

    msg = "Could not calculate a correlation coefficient because there are no bins "*
        "with at least one negative and one positive observation. "*
        "Try decreasing the number of bins or increasing the bin overlap."

    # Without this guard the empty vectors would reach `cor`, whose behaviour on empty
    # input is implementation specific.
    isempty(binmeans) && error(msg)

    r = cor(binmeans, kept_starts)
    isnan(r) && error(msg)
    return r
end

# Type of the wrapped measure: a `FussyMeasure` around a `RobustMeasure` around the core
# `_ContinuousBoyceIndex` (presumably what `ContinuousBoyceIndex(...)` returns, given it
# applies `robust_measure` and then `fussy_measure` - confirm against those helpers).
const ContinuousBoyceIndexType = API.FussyMeasure{<:API.RobustMeasure{<:_ContinuousBoyceIndex}}

# Display customisation for the wrapped measure type.
@fix_show ContinuousBoyceIndex::ContinuousBoyceIndexType

# Trait declarations for the core measure.
StatisticalMeasures.@trait(
_ContinuousBoyceIndex,
consumes_multiple_observations=true,
observation_scitype = Finite{2},
kind_of_proxy=StatisticalMeasures.LearnAPI.Distribution(),
orientation=Score(),
external_aggregation_mode=Mean(),
human_name = "continuous boyce index",
)

# Register the measure under the names "continuous_boyce_index" and "cbi".
register(ContinuousBoyceIndex, "continuous_boyce_index", "cbi")

# User-facing documentation for `ContinuousBoyceIndex`. The parameter descriptions
# follow the implementation: `bin_overlap` scales the score range to give the bin width,
# unset bounds are taken from the scores, and bins are represented by their lower edge.
const ContinuousBoyceIndexDoc = docstring(
"ContinuousBoyceIndex(; n_bins=101, bin_overlap=0.1, min=nothing, max=nothing, cor=StatsBase.corspearman)",
body=
"""
The Continuous Boyce Index is a measure for evaluating the performance of probabilistic predictions for binary classification,
especially for presence-background data in ecological modeling.
It compares the predicted probability scores for the positive class across bins, giving higher scores if the ratio of positive
and negative samples in each bin is strongly correlated to the value at that bin.

- `n_bins`: Number of bins to use for score partitioning.
- `bin_overlap`: Width of each bin, expressed as a fraction of the score range `max - min`.
- `min`, `max`: Optional minimum and maximum score values for binning. A bound left as `nothing` is taken from the predicted scores.
- `cor`: Correlation function (default: Spearman correlation).

The predictions `ŷ` should be a vector of `UnivariateFinite` distributions from CategoricalDistributions.jl, and `y` a vector of ground truth labels.

Returns the correlation between the ratio of positive to negative samples in each bin and the bin positions (the lower edge of each bin).

See also [Boyce Index (Wikipedia)](https://en.wikipedia.org/wiki/Boyce_index).
""",
scitype="",
)

"$ContinuousBoyceIndexDoc"
ContinuousBoyceIndex
"$ContinuousBoyceIndexDoc"
function cbi(x, y; kw...)
    # Build a measure from the keyword arguments, then apply it to the data.
    measure = ContinuousBoyceIndex(; kw...)
    return measure(x, y)
end
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ablaom what would be the right way to go here? I know the other functions don't have this interface, but here I think it would make a lot of sense to allow cbi(ŷ, y; n_bins=5)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, you have to make `n_bins` part of the struct. So you do `ContinuousBoyceIndex(n_bins=5)(yhat, y)`.

However, if you want, you can define a pure functional version Functions.continuous_boyce_index here and refactor so that your struct version calls that. And then documentation can point out the core implementation, like we do for MatthewsCorrelation.

5 changes: 1 addition & 4 deletions src/roc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,7 @@ function binary_levels(
length(classes) == 2 || throw(ERR_ROC2)
API.check_numobs(yhat, y)
API.check_pools(yhat, y)
# Warn unless both the predictions and the ground truth are explicitly ordered.
# `warn_unordered` itself stays silent when `classes` is ordered. (A bare
# `yhat isa ... && warn_unordered(classes)` would invert this condition and warn only
# for ordered predictions.)
if !(yhat isa AbstractArray{<:UnivariateFinite{<:OrderedFactor}}) ||
    !CategoricalArrays.isordered(y)
    warn_unordered(classes)
end
classes
end
binary_levels(
Expand Down
11 changes: 11 additions & 0 deletions src/tools.jl
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,14 @@ function API.check_pools(
return nothing
end

# Emit a warning reporting the level order in use, unless `levels` is explicitly
# ordered, in which case nothing happens. For exactly two levels the warning also names
# the second level as the "positive" one.
function warn_unordered(levels)
    if CategoricalArrays.isordered(levels)
        return
    end
    raw_levels = CategoricalArrays.unwrap.(levels)
    message = string("Levels not explicitly ordered. ", "Using the order $raw_levels. ")
    if length(levels) == 2
        message = string(message, "The \"positive\" level is $(raw_levels[2]). ")
    end
    @warn message
end
30 changes: 30 additions & 0 deletions test/probabilistic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -179,3 +179,33 @@ end
s = SphericalScore(alpha=1)
@test_throws StatisticalMeasures.ERR_UNSUPPORTED_ALPHA s(yhat, [1.0, 1.0])
end


@testset "ContinuousBoyceIndex" begin
    rng = srng(1234)
    # Synthetic data: the chance of a positive label grows with the predicted score
    probs = repeat(0.0:0.1:0.9, inner = 10) .+ rand(rng, 100) .* 0.1
    y = categorical(probs .> rand(rng, 100))
    ŷ = UnivariateFinite(levels(y), probs, augment=true)
    # Should be pretty high
    @test cbi(ŷ, y) ≈ 0.84 atol=0.05

    # Randomized test: shuffled labels, should be near 0
    y_shuf = copy(y)
    shuffle!(rng, y_shuf)
    @test (cbi(ŷ, y_shuf)) ≈ 0.0 atol=0.1

    # Test invariance to order (seeded, so that the permutation is reproducible)
    idx = randperm(rng, length(y))
    @test isapprox(cbi(ŷ[idx], y[idx]), cbi(ŷ, y), atol=1e-8)

    # Test with different number of bins
    @test cbi(ŷ, y; n_bins=5) < cbi(ŷ, y; n_bins=20)

    # Test with all positives or all negatives: no correlation can be computed, so the
    # call must throw. `@test_throws` needs the expected exception type as first argument.
    y_allpos = categorical(trues(100), levels = levels(y))
    y_allneg = categorical(falses(100), levels = levels(y))
    @test_throws Exception cbi(ŷ, y_allpos)
    @test_throws Exception cbi(ŷ, y_allneg)
end
2 changes: 1 addition & 1 deletion test/roc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
)

# `warn_unordered` logs as a side effect and returns `nothing`, so it cannot supply the
# expected message; match the text of the warning instead.
fprs, tprs, ts = @test_logs(
(:warn, r"Levels not explicitly ordered"),
roc_curve(ŷ, y),
)

Expand Down
Loading