@@ -77,25 +77,25 @@ julia> cut(-1:0.5:1, [0, 1], extend=true)
77
77
78
78
julia> cut(-1:0.5:1, 2)
79
79
5-element CategoricalArray{String,1,UInt32}:
80
- "Q1: [-1.0, 0.0)"
81
- "Q1: [-1.0, 0.0)"
82
- "Q2: [0.0, 1.0]"
83
- "Q2: [0.0, 1.0]"
84
- "Q2: [0.0, 1.0]"
80
+ "Q1: [-1.0, 0.5)"
81
+ "Q1: [-1.0, 0.5)"
82
+ "Q1: [-1.0, 0.5)"
83
+ "Q2: [0.5, 1.0]"
84
+ "Q2: [0.5, 1.0]"
85
85
86
86
julia> cut(-1:0.5:1, 2, labels=["A", "B"])
87
87
5-element CategoricalArray{String,1,UInt32}:
88
88
"A"
89
89
"A"
90
- "B"
90
+ "A"
91
91
"B"
92
92
"B"
93
93
94
94
julia> cut(-1:0.5:1, 2, labels=[-0.5, +0.5])
95
95
5-element CategoricalArray{Float64,1,UInt32}:
96
96
-0.5
97
97
-0.5
98
- 0.5
98
+ -0.5
99
99
0.5
100
100
0.5
101
101
@@ -104,11 +104,11 @@ fmt (generic function with 1 method)
104
104
105
105
julia> cut(-1:0.5:1, 3, labels=fmt)
106
106
5-element CategoricalArray{String,1,UInt32}:
107
- "grp 1 (-1.0//-0.3333333333333335)"
108
- "grp 1 (-1.0//-0.3333333333333335)"
109
- "grp 2 (-0.3333333333333335//0.33333333333333326)"
110
- "grp 3 (0.33333333333333326//1.0)"
111
- "grp 3 (0.33333333333333326//1.0)"
107
+ "grp 1 (-1.0//0.0)"
108
+ "grp 1 (-1.0//0.0)"
109
+ "grp 2 (0.0//1.0)"
110
+ "grp 2 (0.0//1.0)"
111
+ "grp 3 (1.0//1.0)"
112
112
```
113
113
"""
114
114
@inline function cut (x:: AbstractArray , breaks:: AbstractVector ;
@@ -233,17 +233,21 @@ Provide the default label format for the `cut(x, ngroups)` method.
233
233
function quantile_formatter(from, to, i; leftclosed, rightclosed)
    # Build the label for quantile group number `i` covering `from`..`to`,
    # e.g. "Q1: [-1.0, 0.5)". A square bracket marks a closed bound and a
    # parenthesis marks an open one.
    lower = leftclosed ? "[" : "("
    upper = rightclosed ? "]" : ")"
    return string("Q", i, ": ", lower, from, ", ", to, upper)
end
235
235
236
"""
    _quantile!(v::AbstractVector, ps::AbstractVector)

Return one cutpoint per probability in `ps`, each taken from the values of `v`.

`v` is indexed as if it were already sorted in increasing order: this function
does not sort it (and does not modify it), so the caller must pass sorted data.

For each probability `p`, the element at rank `ceil(n * p)` (clamped to the
valid range of `v`, with `n = length(v)`) is located, and the first following
element that differs from its predecessor is returned. When no such element
exists, the last element of `v` is returned.

Throw an `ArgumentError` if `v` is empty.
"""
function _quantile!(v::AbstractVector, ps::AbstractVector)
    n = length(v)
    n > 0 || throw(ArgumentError("cannot compute quantiles of empty data vector"))
    return map(ps) do p
        # Rank of the quantile, shifted so that it is valid for any first index
        i = clamp(ceil(Int, n * p), 1, n) + firstindex(v) - 1
        q = v[i]
        # Take next distinct value even if quantile falls in a series of duplicated values
        # (`i` is clamped to the indices of `v`, so `j` is always in bounds)
        @inbounds for j in (i + 1):lastindex(v)
            q_prev = q
            q = v[j]
            q_prev != q && break
        end
        return q
    end
end
244
- _quantile (x:: AbstractArray , p:: AbstractVector ) =
245
- _quantile! (Base. copymutable (vec (x)), p)
246
- _quantile (x, p:: AbstractVector ) = _quantile! (collect (x), p)
247
251
248
252
"""
249
253
cut(x::AbstractArray, ngroups::Integer;
@@ -253,7 +257,9 @@ _quantile(x, p::AbstractVector) = _quantile!(collect(x), p)
253
257
Cut a numeric array into `ngroups` quantiles.
254
258
255
259
Cutpoints differ from those returned by `Statistics.quantile` as they are suited
256
- for intervals closed on the left and taken from actual values in `x`.
260
+ for intervals closed on the left and taken from actual values in `x`. However,
261
+ group assignments are identical to those which would be obtained with type 1
262
+ quantiles if intervals were closed on the right.
257
263
258
264
If `x` contains `missing` values, they are automatically skipped when computing
259
265
quantiles.
@@ -273,14 +279,14 @@ function cut(x::AbstractArray, ngroups::Integer;
273
279
labels:: Union{AbstractVector{<:SupportedTypes},Function} = quantile_formatter,
274
280
allowempty:: Bool = false )
275
281
ngroups >= 1 || throw (ArgumentError (" ngroups must be strictly positive (got $ngroups )" ))
276
- xnm = eltype (x) >: Missing ? skipmissing (x) : x
277
- # Computing extrema is faster than taking 0 and 1 quantiles
278
- min_x, max_x = extrema (xnm)
282
+ xnm = eltype (x) >: Missing ? sort! (collect (skipmissing (x))) : sort (x)
283
+ min_x, max_x = first (xnm), last (xnm)
279
284
if (min_x isa Number && isnan (min_x)) ||
280
285
(max_x isa Number && isnan (max_x))
281
286
throw (ArgumentError (" NaN values are not allowed in input vector" ))
282
287
end
283
- breaks = _quantile (xnm, (0 : ngroups)/ ngroups)
288
+ qs = _quantile! (xnm, (1 : (ngroups- 1 ))/ ngroups)
289
+ breaks = [min_x; qs; max_x]
284
290
if ! allowempty && ! allunique (@view breaks[1 : end - 1 ])
285
291
throw (ArgumentError (" cannot compute $ngroups quantiles due to " *
286
292
" too many duplicated values in `x`. " *
0 commit comments