From 98a1a407dd37f939df511fb3cad9c6ea3f9f1fa4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?David=20M=C3=BCller-Widmann?=
Date: Fri, 10 Oct 2025 21:49:27 +0200
Subject: [PATCH] Improve performance of unweighted `ecdf`

---
 Project.toml     |  2 +-
 src/empirical.jl | 17 +++++++++++------
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/Project.toml b/Project.toml
index e48fd5ce1..80431779a 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "StatsBase"
 uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 authors = ["JuliaStats"]
-version = "0.34.6"
+version = "0.34.7"
 
 [deps]
 AliasTables = "66dad0bd-aa9a-41b7-9441-69ab47430ed8"
diff --git a/src/empirical.jl b/src/empirical.jl
index 45f985468..fe3368bc0 100644
--- a/src/empirical.jl
+++ b/src/empirical.jl
@@ -42,7 +42,7 @@ function (ecdf::ECDF)(v::AbstractVector{<:Real})
 end
 
 """
-    ecdf(X; weights::AbstractWeights)
+    ecdf(X[; weights::AbstractVector{<:Real}])
 
 Return an empirical cumulative distribution function (ECDF) based on a vector of samples
 given in `X`. Optionally providing `weights` returns a weighted ECDF.
@@ -53,12 +53,17 @@ evaluate CDF values on other samples.
 `extrema`, `minimum`, and `maximum` are supported to for obtaining the range over which
 function is inside the interval ``(0,1)``; the function is defined for the whole real line.
 """
-function ecdf(X::AbstractVector{<:Real}; weights::AbstractVector{<:Real}=Weights(Float64[]))
+function ecdf(X::AbstractVector{<:Real}; weights::AbstractVector{<:Real}=weights(Float64[]))
     any(isnan, X) && throw(ArgumentError("ecdf can not include NaN values"))
-    isempty(weights) || length(X) == length(weights) || throw(ArgumentError("data and weight vectors must be the same size," *
-        "got $(length(X)) and $(length(weights))"))
-    ord = sortperm(X)
-    ECDF(X[ord], isempty(weights) ? weights : Weights(weights[ord]))
+    _weights = weights isa AbstractWeights ? weights : StatsBase.weights(weights)
+    if isempty(_weights)
+        return ECDF(sort(X), _weights)
+    else
+        length(X) == length(_weights) || throw(ArgumentError("data and weight vectors must be the same size," *
+            "got $(length(X)) and $(length(_weights))"))
+        ord = sortperm(X)
+        ECDF(X[ord], _weights[ord])
+    end
 end
 
 minimum(ecdf::ECDF) = first(ecdf.sorted_values)