Skip to content

Commit dd0f381

Browse files
authored
Merge pull request #19 from mrkn/histogram
Add Array#histogram
2 parents 81749c8 + 6392f1d commit dd0f381

File tree

5 files changed

+410
-1
lines changed

5 files changed

+410
-1
lines changed

ext/enumerable/statistics/extension/statistics.c

Lines changed: 266 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include <ruby/util.h>
33
#include <ruby/version.h>
44
#include <assert.h>
5+
#include <math.h>
56

67
#if RUBY_API_VERSION_CODE >= 20400
78
/* for 2.4.0 or higher */
@@ -17,6 +18,12 @@
1718
# undef HAVE_RB_RATIONAL_PLUS
1819
#endif
1920

21+
#ifdef HAVE_RB_ARITHMETIC_SEQUENCE_EXTRACT
22+
# define HAVE_ARITHMETIC_SEQUENCE
23+
#else
24+
# undef HAVE_ARITHMETIC_SEQUENCE
25+
#endif
26+
2027
#ifndef RB_INTEGER_TYPE_P
2128
# define RB_INTEGER_TYPE_P(obj) enum_stat_integer_type_p(obj)
2229
static inline int
@@ -88,7 +95,11 @@ static VALUE half_in_rational;
8895

8996
static ID idPow, idPLUS, idMINUS, idSTAR, idDIV, idGE;
9097
static ID id_eqeq_p, id_idiv, id_negate, id_to_f, id_cmp, id_nan_p;
91-
static ID id_each, id_real_p, id_sum, id_population;
98+
static ID id_each, id_real_p, id_sum, id_population, id_closed, id_edge;
99+
100+
static VALUE sym_left, sym_right;
101+
102+
static VALUE cHistogram;
92103

93104
inline static VALUE
94105
f_add(VALUE x, VALUE y)
@@ -2051,9 +2062,253 @@ hash_value_counts(int argc, VALUE* argv, VALUE hash)
20512062
return any_value_counts(argc, argv, hash, hash_value_counts_without_sort);
20522063
}
20532064

2065+
static long
2066+
histogram_edge_bin_index(VALUE edge, VALUE rb_x, int left_p)
2067+
{
2068+
double x, y;
2069+
long lo, hi, mid;
2070+
2071+
x = NUM2DBL(rb_x);
2072+
2073+
lo = -1;
2074+
hi = RARRAY_LEN(edge);
2075+
2076+
if (left_p) {
2077+
while (hi - lo > 1) {
2078+
mid = lo + (hi - lo)/2;
2079+
y = NUM2DBL(RARRAY_AREF(edge, mid));
2080+
if (y <= x) {
2081+
lo = mid;
2082+
}
2083+
else {
2084+
hi = mid;
2085+
}
2086+
}
2087+
return lo;
2088+
}
2089+
else {
2090+
while (hi - lo > 1) {
2091+
mid = lo + (hi - lo)/2;
2092+
y = NUM2DBL(RARRAY_AREF(edge, mid));
2093+
if (y < x) {
2094+
lo = mid;
2095+
}
2096+
else {
2097+
hi = mid;
2098+
}
2099+
}
2100+
return hi - 1;
2101+
}
2102+
}
2103+
2104+
static void
2105+
histogram_weights_push_values(VALUE weights, VALUE edge, VALUE values, int left_p)
2106+
{
2107+
VALUE x, cur;
2108+
long i, n, bi;
2109+
2110+
n = RARRAY_LEN(values);
2111+
for (i = 0; i < n; ++i) {
2112+
x = RARRAY_AREF(values, i);
2113+
2114+
bi = histogram_edge_bin_index(edge, x, left_p);
2115+
2116+
cur = rb_ary_entry(weights, bi);
2117+
if (NIL_P(cur)) {
2118+
cur = INT2FIX(1);
2119+
}
2120+
else {
2121+
cur = rb_funcall(cur, idPLUS, 1, INT2FIX(1));
2122+
}
2123+
2124+
rb_ary_store(weights, bi, cur);
2125+
}
2126+
}
2127+
2128+
static int
2129+
opt_closed_left_p(VALUE opts)
2130+
{
2131+
int left_p = 1;
2132+
2133+
if (!NIL_P(opts)) {
2134+
VALUE closed;
2135+
#ifdef HAVE_RB_GET_KWARGS
2136+
ID kwargs = id_closed;
2137+
rb_get_kwargs(opts, &kwargs, 0, 1, &closed);
2138+
#else
2139+
closed = rb_hash_lookup2(opts, ID2SYM(id_closed), sym_left);
2140+
#endif
2141+
left_p = (closed != sym_right);
2142+
if (left_p && closed != sym_left) {
2143+
rb_raise(rb_eArgError, "invalid value for :closed keyword "
2144+
"(%"PRIsVALUE" for :left or :right)", closed);
2145+
}
2146+
}
2147+
2148+
return left_p;
2149+
}
2150+
2151+
static inline long
2152+
sturges(long n)
2153+
{
2154+
if (n == 0) return 1L;
2155+
return (long)(ceil(log2(n)) + 1);
2156+
}
2157+
2158+
static VALUE
2159+
ary_histogram_calculate_edge_lo_hi(const double lo, const double hi, const long nbins, const int left_p)
2160+
{
2161+
VALUE edge;
2162+
double bw, lbw, start, step, divisor, r;
2163+
long i, len;
2164+
2165+
if (hi == lo) {
2166+
start = hi;
2167+
step = 1;
2168+
divisor = 1;
2169+
len = 1;
2170+
}
2171+
else {
2172+
bw = (hi - lo) / nbins;
2173+
lbw = log10(bw);
2174+
if (lbw >= 0) {
2175+
step = pow(10, floor(lbw));
2176+
r = bw / step;
2177+
if (r <= 1.1) {
2178+
/* do nothing */
2179+
}
2180+
else if (r <= 2.2) {
2181+
step *= 2;
2182+
}
2183+
else if (r <= 5.5) {
2184+
step *= 5;
2185+
}
2186+
else {
2187+
step *= 10;
2188+
}
2189+
divisor = 1.0;
2190+
start = step * floor(lo / step);
2191+
len = (long)ceil((hi - start) / step);
2192+
}
2193+
else {
2194+
divisor = pow(10, -floor(lbw));
2195+
r = bw * divisor;
2196+
if (r <= 1.1) {
2197+
/* do nothing */
2198+
}
2199+
else if (r <= 2.2) {
2200+
divisor /= 2;
2201+
}
2202+
else if (r <= 5.5) {
2203+
divisor /= 5;
2204+
}
2205+
else {
2206+
divisor /= 10;
2207+
}
2208+
step = 1.0;
2209+
start = floor(lo * divisor);
2210+
len = (long)ceil(hi * divisor - start);
2211+
}
2212+
}
2213+
2214+
if (left_p) {
2215+
while (lo < start/divisor) {
2216+
start -= step;
2217+
}
2218+
while ((start + (len - 1)*step)/divisor <= hi) {
2219+
++len;
2220+
}
2221+
}
2222+
else {
2223+
while (lo <= start/divisor) {
2224+
start -= step;
2225+
}
2226+
while ((start + (len - 1)*step)/divisor < hi) {
2227+
++len;
2228+
}
2229+
}
2230+
2231+
edge = rb_ary_new_capa(len);
2232+
for (i = 0; i < len; ++i) {
2233+
rb_ary_push(edge, DBL2NUM(start/divisor));
2234+
start += step;
2235+
}
2236+
2237+
return edge;
2238+
}
2239+
2240+
static VALUE
2241+
ary_histogram_calculate_edge(VALUE ary, const long nbins, const int left_p)
2242+
{
2243+
long n;
2244+
VALUE minmax;
2245+
VALUE edge = Qnil;
2246+
double lo, hi;
2247+
2248+
Check_Type(ary, T_ARRAY);
2249+
n = RARRAY_LEN(ary);
2250+
2251+
if (n == 0 && nbins < 0) {
2252+
rb_raise(rb_eArgError, "nbins must be >= 0 for an empty array, got %ld", nbins);
2253+
}
2254+
else if (n > 0 && nbins < 1) {
2255+
rb_raise(rb_eArgError, "nbins must be >= 1 for a non-empty array, got %ld", nbins);
2256+
}
2257+
else if (n == 0) {
2258+
edge = rb_ary_new_capa(1);
2259+
rb_ary_push(edge, DBL2NUM(0.0));
2260+
return edge;
2261+
}
2262+
2263+
minmax = rb_funcall(ary, rb_intern("minmax"), 0);
2264+
lo = NUM2DBL(RARRAY_AREF(minmax, 0));
2265+
hi = NUM2DBL(RARRAY_AREF(minmax, 1));
2266+
2267+
edge = ary_histogram_calculate_edge_lo_hi(lo, hi, nbins, left_p);
2268+
2269+
return edge;
2270+
}
2271+
2272+
/* call-seq:
2273+
* ary.histogram(nbins=:auto, closed: :left)
2274+
*
2275+
* @param [Integer] nbins The approximate number of bins
2276+
* @param [:left, :right] closed
2277+
* If :left (the default), the bin interval are left-closed.
2278+
* If :right, the bin interval are right-closed.
2279+
*
2280+
* @return [EnumerableStatistics::Histogram] The histogram struct.
2281+
*/
2282+
static VALUE
2283+
ary_histogram(int argc, VALUE *argv, VALUE ary)
2284+
{
2285+
VALUE arg0, opts, edge, weights;
2286+
int left_p;
2287+
long nbins;
2288+
2289+
rb_scan_args(argc, argv, "01:", &arg0, &opts);
2290+
if (NIL_P(arg0)) {
2291+
nbins = sturges(RARRAY_LEN(ary));
2292+
}
2293+
else {
2294+
nbins = NUM2LONG(arg0);
2295+
}
2296+
left_p = opt_closed_left_p(opts);
2297+
2298+
edge = ary_histogram_calculate_edge(ary, nbins, left_p);
2299+
weights = rb_ary_new_capa(RARRAY_LEN(edge) - 1);
2300+
histogram_weights_push_values(weights, edge, ary, left_p);
2301+
2302+
return rb_struct_new(cHistogram, edge, weights,
2303+
left_p ? sym_left : sym_right,
2304+
Qfalse);
2305+
}
2306+
20542307
void
20552308
Init_extension(void)
20562309
{
2310+
VALUE mEnumerableStatistics;
2311+
20572312
#ifndef HAVE_ENUM_SUM
20582313
rb_define_method(rb_mEnumerable, "sum", enum_sum, -1);
20592314
#endif
@@ -2082,6 +2337,11 @@ Init_extension(void)
20822337
half_in_rational = nurat_s_new_internal(rb_cRational, INT2FIX(1), INT2FIX(2));
20832338
rb_gc_register_mark_object(half_in_rational);
20842339

2340+
mEnumerableStatistics = rb_const_get_at(rb_cObject, rb_intern("EnumerableStatistics"));
2341+
cHistogram = rb_const_get_at(mEnumerableStatistics, rb_intern("Histogram"));
2342+
2343+
rb_define_method(rb_cArray, "histogram", ary_histogram, -1);
2344+
20852345
idPLUS = '+';
20862346
idMINUS = '-';
20872347
idSTAR = '*';
@@ -2098,4 +2358,9 @@ Init_extension(void)
20982358
id_real_p = rb_intern("real?");
20992359
id_sum = rb_intern("sum");
21002360
id_population = rb_intern("population");
2361+
id_closed = rb_intern("closed");
2362+
id_edge = rb_intern("edge");
2363+
2364+
sym_left = ID2SYM(rb_intern("left"));
2365+
sym_right = ID2SYM(rb_intern("right"));
21012366
}

lib/enumerable/statistics.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1+
require "enumerable_statistics"
12
require "enumerable/statistics/extension"

lib/enumerable_statistics.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
require_relative "enumerable_statistics/version"
2+
require_relative "enumerable_statistics/histogram"
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
module EnumerableStatistics
2+
class Histogram < Struct.new(:edge, :weights, :closed, :isdensity)
3+
alias density? isdensity
4+
end
5+
end

0 commit comments

Comments
 (0)