@@ -27,17 +27,56 @@ function fill_refs!(refs::AbstractArray, X::AbstractArray,
27
27
end
28
28
end
29
29
30
+ const CUT_FMT = Printf. Format (" %.*g" )
31
+
32
+ """
33
+ CategoricalArrays.default_formatter(from, to, i::Integer;
34
+ leftclosed::Bool, rightclosed::Bool,
35
+ sigdigits::Integer)
36
+
37
+ Provide the default label format for the `cut(x, breaks)` method,
38
+ which is `"[from, to)"` if `leftclosed` is `true` and `"(from, to)"` otherwise.
39
+
40
+ If they are floating-point values, breaks are turned into strings using
41
+ `@sprintf("%.*g", sigdigits, break)`
42
+ (or `to` using `@sprintf("%.*g", sigdigits, break)` for the last break).
30
43
"""
31
- default_formatter(from, to, i; leftclosed, rightclosed)
44
+ function default_formatter (from, to, i:: Integer ;
45
+ leftclosed:: Bool , rightclosed:: Bool ,
46
+ sigdigits:: Integer )
47
+ from_str = from isa AbstractFloat ?
48
+ Printf. format (CUT_FMT, sigdigits, from) :
49
+ string (from)
50
+ to_str = to isa AbstractFloat ?
51
+ Printf. format (CUT_FMT, sigdigits, to) :
52
+ string (to)
53
+ string (leftclosed ? " [" : " (" , from_str, " , " , to_str, rightclosed ? " ]" : " )" )
54
+ end
32
55
33
- Provide the default label format for the `cut(x, breaks)` method.
34
56
"""
35
- default_formatter (from, to, i; leftclosed, rightclosed) =
36
- string (leftclosed ? " [" : " (" , from, " , " , to, rightclosed ? " ]" : " )" )
57
+ CategoricalArrays.numbered_formatter(from, to, i::Integer;
58
+ leftclosed::Bool, rightclosed::Bool,
59
+ sigdigits::Integer)
60
+
61
+ Provide the default label format for the `cut(x, ngroups)` method
62
+ when `allowempty=true`, which is `"i: [from, to)"` if `leftclosed`
63
+ is `true` and `"i: (from, to)"` otherwise.
64
+
65
+ If they are floating-point values, breaks are turned into strings using
66
+ `@sprintf("%.*g", sigdigits, break)`
67
+ (or `to` using `@sprintf("%.*g", sigdigits, break)` for the last break).
68
+ """
69
+ numbered_formatter (from, to, i:: Integer ;
70
+ leftclosed:: Bool , rightclosed:: Bool ,
71
+ sigdigits:: Integer ) =
72
+ string (i, " : " ,
73
+ default_formatter (from, to, i, leftclosed= leftclosed, rightclosed= rightclosed,
74
+ sigdigits= sigdigits))
37
75
38
76
@doc raw """
39
77
cut(x::AbstractArray, breaks::AbstractVector;
40
78
labels::Union{AbstractVector,Function},
79
+ sigdigits::Integer=3,
41
80
extend::Union{Bool,Missing}=false, allowempty::Bool=false)
42
81
43
82
Cut a numeric array into intervals at values `breaks`
@@ -54,10 +93,15 @@ also accept them.
54
93
in `x` fall outside of the breaks; when `true`, breaks are automatically added to include
55
94
all values in `x`; when `missing`, values outside of the breaks generate `missing` entries.
56
95
* `labels::Union{AbstractVector, Function}`: a vector of strings, characters
57
- or numbers giving the names to use for
58
- the intervals; or a function `f(from, to, i; leftclosed, rightclosed)` that generates
96
+ or numbers giving the names to use for the intervals; or a function
97
+ `f(from, to, i::Integer; leftclosed::Bool, rightclosed::Bool, sigdigits::Integer)` that generates
59
98
the labels from the left and right interval boundaries and the group index. Defaults to
60
- `"[from, to)"` (or `"[from, to]"` for the rightmost interval if `extend == true`).
99
+ [`CategoricalArrays.default_formatter`](@ref), giving `"[from, to)"` (or `"[from, to]"`
100
+ for the rightmost interval if `extend == true`).
101
+ * `sigdigits::Integer=3`: the minimum number of significant digits to use in labels.
102
+ This value is increased automatically if necessary so that rounded breaks are unique.
103
+ Only used for floating point types and when `labels` is a function, in which case it
104
+ is passed to it as a keyword argument.
61
105
* `allowempty::Bool=false`: when `false`, an error is raised if some breaks other than
62
106
the last one appear multiple times, generating empty intervals; when `true`,
63
107
duplicate breaks are allowed and the intervals they generate are kept as
@@ -69,19 +113,19 @@ julia> using CategoricalArrays
69
113
70
114
julia> cut(-1:0.5:1, [0, 1], extend=true)
71
115
5-element CategoricalArray{String,1,UInt32}:
72
- "[-1.0, 0. 0)"
73
- "[-1.0, 0. 0)"
74
- "[0.0 , 1.0 ]"
75
- "[0.0 , 1.0 ]"
76
- "[0.0 , 1.0 ]"
116
+ "[-1, 0)"
117
+ "[-1, 0)"
118
+ "[0, 1]"
119
+ "[0, 1]"
120
+ "[0, 1]"
77
121
78
122
julia> cut(-1:0.5:1, 2)
79
123
5-element CategoricalArray{String,1,UInt32}:
80
- "Q1: [-1.0, 0. 0)"
81
- "Q1: [-1.0, 0. 0)"
82
- "Q2: [0.0 , 1.0 ]"
83
- "Q2: [0.0 , 1.0 ]"
84
- "Q2: [0.0 , 1.0 ]"
124
+ "[-1, 0)"
125
+ "[-1, 0)"
126
+ "[0, 1]"
127
+ "[0, 1]"
128
+ "[0, 1]"
85
129
86
130
julia> cut(-1:0.5:1, 2, labels=["A", "B"])
87
131
5-element CategoricalArray{String,1,UInt32}:
@@ -114,6 +158,7 @@ julia> cut(-1:0.5:1, 3, labels=fmt)
114
158
@inline function cut (x:: AbstractArray , breaks:: AbstractVector ;
115
159
extend:: Union{Bool, Missing} = false ,
116
160
labels:: Union{AbstractVector{<:SupportedTypes},Function} = default_formatter,
161
+ sigdigits:: Integer = 3 ,
117
162
allowmissing:: Union{Bool, Nothing} = nothing ,
118
163
allow_missing:: Union{Bool, Nothing} = nothing ,
119
164
allowempty:: Bool = false )
@@ -127,14 +172,15 @@ julia> cut(-1:0.5:1, 3, labels=fmt)
127
172
:cut )
128
173
extend = missing
129
174
end
130
- return _cut (x, breaks, extend, labels, allowempty)
175
+ return _cut (x, breaks, extend, labels, sigdigits, allowempty)
131
176
end
132
177
133
178
# Separate function for inferability (thanks to inlining of cut)
134
179
function _cut (x:: AbstractArray{T, N} , breaks:: AbstractVector ,
135
180
extend:: Union{Bool, Missing} ,
136
181
labels:: Union{AbstractVector{<:SupportedTypes},Function} ,
137
- allowempty:: Bool = false ) where {T, N}
182
+ sigdigits:: Integer ,
183
+ allowempty:: Bool ) where {T, N}
138
184
if ! issorted (breaks)
139
185
breaks = sort (breaks)
140
186
end
@@ -191,21 +237,55 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector,
191
237
end
192
238
end
193
239
240
+ # Find minimal number of digits so that distinct breaks remain so
241
+ if eltype (breaks) <: AbstractFloat
242
+ while true
243
+ local i
244
+ for outer i in 2 : lastindex (breaks)
245
+ b1 = breaks[i- 1 ]
246
+ b2 = breaks[i]
247
+ isequal (b1, b2) && continue
248
+
249
+ b1_str = Printf. format (CUT_FMT, sigdigits, b1)
250
+ b2_str = Printf. format (CUT_FMT, sigdigits, b2)
251
+ if b1_str == b2_str
252
+ sigdigits += 1
253
+ break
254
+ end
255
+ end
256
+ i == lastindex (breaks) && break
257
+ end
258
+ end
194
259
n = length (breaks)
195
260
n >= 2 || throw (ArgumentError (" at least two breaks must be provided when extend is not true" ))
196
261
if labels isa Function
197
262
from = breaks[1 : n- 1 ]
198
263
to = breaks[2 : n]
199
- firstlevel = labels (from[1 ], to[1 ], 1 ,
200
- leftclosed= ! isequal (breaks[1 ], breaks[2 ]), rightclosed= false )
264
+ local firstlevel
265
+ try
266
+ firstlevel = labels (from[1 ], to[1 ], 1 ,
267
+ leftclosed= ! isequal (breaks[1 ], breaks[2 ]), rightclosed= false ,
268
+ sigdigits= sigdigits)
269
+ catch
270
+ # Support functions defined before v1.0, where sigdigits did not exist
271
+ Base. depwarn (" `labels` function is now required to accept a `sigdigits` keyword argument" ,
272
+ :cut )
273
+ labels_orig = labels
274
+ labels = (from, to, i; leftclosed, rightclosed, sigdigits) ->
275
+ labels_orig (from, to, i; leftclosed, rightclosed)
276
+ firstlevel = labels_orig (from[1 ], to[1 ], 1 ,
277
+ leftclosed= ! isequal (breaks[1 ], breaks[2 ]), rightclosed= false )
278
+ end
201
279
levs = Vector {typeof(firstlevel)} (undef, n- 1 )
202
280
levs[1 ] = firstlevel
203
281
for i in 2 : n- 2
204
282
levs[i] = labels (from[i], to[i], i,
205
- leftclosed= ! isequal (breaks[i], breaks[i+ 1 ]), rightclosed= false )
283
+ leftclosed= ! isequal (breaks[i], breaks[i+ 1 ]), rightclosed= false ,
284
+ sigdigits= sigdigits)
206
285
end
207
286
levs[end ] = labels (from[end ], to[end ], n- 1 ,
208
- leftclosed= true , rightclosed= true )
287
+ leftclosed= true , rightclosed= true ,
288
+ sigdigits= sigdigits)
209
289
else
210
290
length (labels) == n- 1 ||
211
291
throw (ArgumentError (" labels must be of length $(n- 1 ) , but got length $(length (labels)) " ))
@@ -225,40 +305,37 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector,
225
305
CategoricalArray {S, N} (refs, pool)
226
306
end
227
307
228
- """
229
- quantile_formatter(from, to, i; leftclosed, rightclosed)
230
-
231
- Provide the default label format for the `cut(x, ngroups)` method.
232
- """
233
- quantile_formatter (from, to, i; leftclosed, rightclosed) =
234
- string (" Q" , i, " : " , leftclosed ? " [" : " (" , from, " , " , to, rightclosed ? " ]" : " )" )
235
-
236
308
"""
237
309
Find first value in (sorted) `v` which is greater than or equal to each quantile
238
310
in (sorted) `qs`.
239
311
"""
240
312
function find_breaks (v:: AbstractVector , qs:: AbstractVector )
241
313
n = length (qs)
242
314
breaks = similar (v, n)
243
- n == 0 && return breaks
315
+ breaks_prev = similar (v, n)
316
+ n == 0 && return (breaks, breaks_prev)
244
317
245
318
i = 1
246
319
q = qs[1 ]
247
- @inbounds for x in v
320
+ @inbounds for j in eachindex (v)
321
+ x = v[j]
248
322
# Use isless and isequal to differentiate -0.0 from 0.0
249
323
if isless (q, x) || isequal (q, x)
250
324
breaks[i] = x
325
+ # FIXME : handle duplicated breaks
326
+ breaks_prev[i] = v[clamp (j- 1 , firstindex (v), lastindex (v))]
251
327
i += 1
252
328
i > n && break
253
329
q = qs[i]
254
330
end
255
331
end
256
- return breaks
332
+ return ( breaks, breaks_prev)
257
333
end
258
334
259
335
"""
260
336
cut(x::AbstractArray, ngroups::Integer;
261
337
labels::Union{AbstractVector{<:AbstractString},Function},
338
+ sigdigits::Integer=3,
262
339
allowempty::Bool=false)
263
340
264
341
Cut a numeric array into `ngroups` quantiles.
@@ -271,17 +348,25 @@ quantiles.
271
348
272
349
# Keyword arguments
273
350
* `labels::Union{AbstractVector, Function}`: a vector of strings, characters
274
- or numbers giving the names to use for
275
- the intervals; or a function `f(from, to, i; leftclosed, rightclosed)` that generates
351
+ or numbers giving the names to use for the intervals; or a function
352
+ `f(from, to, i::Integer; leftclosed::Bool, rightclosed::Bool, sigdigits::Integer)` that generates
276
353
the labels from the left and right interval boundaries and the group index. Defaults to
277
- `"Qi: [from, to)"` (or `"Qi: [from, to]"` for the rightmost interval).
354
+ [`CategoricalArrays.default_formatter`](@ref), giving `"[from, to)"` (or `"[from, to]"`
355
+ for the rightmost interval) if `allowempty=false`, otherwise to
356
+ [`CategoricalArrays.numbered_formatter`](@ref), which prefixes the label with the quantile
357
+ number to ensure uniqueness.
358
+ * `sigdigits::Integer=3`: the minimum number of significant digits to use when rounding
359
+ breaks for inclusion in generated labels. This value is increased automatically if necessary
360
+ so that rounded breaks are unique. Only used for floating point types and when `labels` is a
361
+ function, in which case it is passed to it as a keyword argument.
278
362
* `allowempty::Bool=false`: when `false`, an error is raised if some quantile breakpoints
279
363
other than the last one are equal, generating empty intervals;
280
364
when `true`, duplicate breaks are allowed and the intervals they generate are kept as
281
365
unused levels (but duplicate labels are not allowed).
282
366
"""
283
367
function cut (x:: AbstractArray , ngroups:: Integer ;
284
- labels:: Union{AbstractVector{<:SupportedTypes},Function} = quantile_formatter,
368
+ labels:: Union{AbstractVector{<:SupportedTypes},Function,Nothing} = nothing ,
369
+ sigdigits:: Integer = 3 ,
285
370
allowempty:: Bool = false )
286
371
ngroups >= 1 || throw (ArgumentError (" ngroups must be strictly positive (got $ngroups )" ))
287
372
sorted_x = eltype (x) >: Missing ? sort! (collect (skipmissing (x))) : sort (x)
@@ -291,12 +376,48 @@ function cut(x::AbstractArray, ngroups::Integer;
291
376
throw (ArgumentError (" NaN values are not allowed in input vector" ))
292
377
end
293
378
qs = quantile! (sorted_x, (1 : (ngroups- 1 ))/ ngroups, sorted= true )
294
- breaks = [min_x; find_breaks (sorted_x, qs); max_x]
379
+ breaks, breaks_prev = find_breaks (sorted_x, qs)
380
+ breaks = [min_x; breaks; max_x]
295
381
if ! allowempty && ! allunique (@view breaks[1 : end - 1 ])
296
382
throw (ArgumentError (" cannot compute $ngroups quantiles due to " *
297
383
" too many duplicated values in `x`. " *
298
384
" Pass `allowempty=true` to allow empty quantiles or " *
299
385
" choose a lower value for `ngroups`." ))
300
386
end
301
- cut (x, breaks; labels= labels, allowempty= allowempty)
387
+ if labels === nothing
388
+ labels = allowempty ? numbered_formatter : default_formatter
389
+
390
+ if eltype (breaks) <: AbstractFloat
391
+ while true
392
+ local i
393
+ for outer i in 2 : lastindex (breaks)
394
+ b1 = breaks[i- 1 ]
395
+ b2 = breaks[i]
396
+ isequal (b1, b2) && continue
397
+
398
+ # Find minimal number of digits so that rounding a break does not
398
+ # return a value that is lower than or equal to the value immediately below that break
400
+ # We skip the first break, which is the minimum and has no equivalent
401
+ # in `breaks_prev`
402
+ b1_rounded = round (b1, sigdigits= sigdigits)
403
+ b2_rounded = round (b2, sigdigits= sigdigits)
404
+ if i < lastindex (breaks) &&
405
+ (isequal (b2_rounded, breaks_prev[i- 1 ]) || isless (b2_rounded, breaks_prev[i- 1 ]))
406
+ sigdigits += 1
407
+ break
408
+ end
409
+
410
+ # Find minimal number of digits so that breaks are unique
411
+ b1_str = Printf. format (CUT_FMT, sigdigits, b1)
412
+ b2_str = Printf. format (CUT_FMT, sigdigits, b2)
413
+ if b1_str == b2_str
414
+ sigdigits += 1
415
+ break
416
+ end
417
+ end
418
+ i == lastindex (breaks) && break
419
+ end
420
+ end
421
+ end
422
+ return cut (x, breaks; labels= labels, sigdigits= sigdigits, allowempty= allowempty)
302
423
end
0 commit comments