Skip to content

Commit 7d647e2

Browse files
authored
GH-45733: [C++][Python] Add biased/unbiased toggle to skew and kurtosis functions (#45762)
### Rationale for this change Allow unbiased computations of skew and kurtosis. Toggle variable to compute biased/unbiased. ### What changes are included in this PR? Add `biased` option to compute biased/unbiased versions when computing Skew and kurtosis. ### Are these changes tested? Yes, tests added. ### Are there any user-facing changes? There is a new `biased` option on `SkewOptions` * GitHub Issue: #45733 Authored-by: Raúl Cumplido <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
1 parent 550a6b3 commit 7d647e2

File tree

10 files changed

+99
-34
lines changed

10 files changed

+99
-34
lines changed

cpp/src/arrow/acero/hash_aggregate_test.cc

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1504,11 +1504,15 @@ TEST_P(GroupBy, VarianceOptionsAndSkewOptions) {
15041504
/*ddof=*/0, /*skip_nulls=*/false, /*min_count=*/3);
15051505

15061506
auto skew_keep_nulls = std::make_shared<SkewOptions>(/*skip_nulls=*/false,
1507+
/*biased=*/true,
15071508
/*min_count=*/0);
1508-
auto skew_min_count =
1509-
std::make_shared<SkewOptions>(/*skip_nulls=*/true, /*min_count=*/3);
1509+
auto skew_min_count = std::make_shared<SkewOptions>(/*skip_nulls=*/true,
1510+
/*biased=*/true, /*min_count=*/3);
15101511
auto skew_keep_nulls_min_count = std::make_shared<SkewOptions>(
1511-
/*skip_nulls=*/false, /*min_count=*/3);
1512+
/*skip_nulls=*/false, /*biased=*/true, /*min_count=*/3);
1513+
1514+
auto skew_unbiased = std::make_shared<SkewOptions>(
1515+
/*skip_nulls=*/false, /*biased=*/false, /*min_count=*/0);
15121516

15131517
for (std::string value_column : {"argument", "argument1"}) {
15141518
for (bool use_threads : {false}) {
@@ -1553,26 +1557,30 @@ TEST_P(GroupBy, VarianceOptionsAndSkewOptions) {
15531557
{"hash_skew", skew_keep_nulls, value_column, "hash_skew"},
15541558
{"hash_skew", skew_min_count, value_column, "hash_skew"},
15551559
{"hash_skew", skew_keep_nulls_min_count, value_column, "hash_skew"},
1560+
{"hash_skew", skew_unbiased, value_column, "hash_skew"},
15561561
{"hash_kurtosis", skew_keep_nulls, value_column, "hash_kurtosis"},
15571562
{"hash_kurtosis", skew_min_count, value_column, "hash_kurtosis"},
15581563
{"hash_kurtosis", skew_keep_nulls_min_count, value_column,
15591564
"hash_kurtosis"},
1565+
{"hash_kurtosis", skew_unbiased, value_column, "hash_kurtosis"},
15601566
},
15611567
use_threads));
15621568
expected = ArrayFromJSON(struct_({
15631569
field("key", int64()),
15641570
field("hash_skew", float64()),
15651571
field("hash_skew", float64()),
15661572
field("hash_skew", float64()),
1573+
field("hash_skew", float64()),
1574+
field("hash_kurtosis", float64()),
15671575
field("hash_kurtosis", float64()),
15681576
field("hash_kurtosis", float64()),
15691577
field("hash_kurtosis", float64()),
15701578
}),
15711579
R"([
1572-
[1, null, 0.707106, null, null, -1.5, null ],
1573-
[2, 0.213833, 0.213833, 0.213833, -1.720164, -1.720164, -1.720164],
1574-
[3, 0.0, null, null, -2.0, null, null ],
1575-
[4, null, 0.707106, null, null, -1.5, null ]
1580+
[1, null, 0.707106, null, null, null, -1.5, null, null ],
1581+
[2, 0.213833, 0.213833, 0.213833, 0.37037, -1.720164, -1.720164, -1.720164, -3.90123],
1582+
[3, 0.0, null, null, null, -2.0, null, null, null ],
1583+
[4, null, 0.707106, null, null, null, -1.5, null, null ]
15761584
])");
15771585
ValidateOutput(actual);
15781586
AssertDatumsApproxEqual(expected, actual, /*verbose=*/true);

cpp/src/arrow/compute/api_aggregate.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ static auto kVarianceOptionsType = GetFunctionOptionsType<VarianceOptions>(
111111
DataMember("min_count", &VarianceOptions::min_count));
112112
static auto kSkewOptionsType = GetFunctionOptionsType<SkewOptions>(
113113
DataMember("skip_nulls", &SkewOptions::skip_nulls),
114+
DataMember("biased", &SkewOptions::biased),
114115
DataMember("min_count", &SkewOptions::min_count));
115116
static auto kQuantileOptionsType = GetFunctionOptionsType<QuantileOptions>(
116117
DataMember("q", &QuantileOptions::q),
@@ -154,9 +155,10 @@ VarianceOptions::VarianceOptions(int ddof, bool skip_nulls, uint32_t min_count)
154155
min_count(min_count) {}
155156
constexpr char VarianceOptions::kTypeName[];
156157

157-
SkewOptions::SkewOptions(bool skip_nulls, uint32_t min_count)
158+
SkewOptions::SkewOptions(bool skip_nulls, bool biased, uint32_t min_count)
158159
: FunctionOptions(internal::kSkewOptionsType),
159160
skip_nulls(skip_nulls),
161+
biased(biased),
160162
min_count(min_count) {}
161163

162164
QuantileOptions::QuantileOptions(double q, enum Interpolation interpolation,

cpp/src/arrow/compute/api_aggregate.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,13 +117,18 @@ class ARROW_EXPORT VarianceOptions : public FunctionOptions {
117117
/// \brief Control Skew and Kurtosis kernel behavior
118118
class ARROW_EXPORT SkewOptions : public FunctionOptions {
119119
public:
120-
explicit SkewOptions(bool skip_nulls = true, uint32_t min_count = 0);
120+
explicit SkewOptions(bool skip_nulls = true, bool biased = true,
121+
uint32_t min_count = 0);
121122
static constexpr char const kTypeName[] = "SkewOptions";
122123
static SkewOptions Defaults() { return SkewOptions{}; }
123124

124125
/// If true (the default), null values are ignored. Otherwise, if any value is null,
125126
/// emit null.
126127
bool skip_nulls;
128+
/// If true (the default), the calculated value is biased. If false, the calculated
129+
/// value includes a correction factor to reduce bias, making it more accurate for
130+
/// small sample sizes.
131+
bool biased;
127132
/// If less than this many non-null values are observed, emit null.
128133
uint32_t min_count;
129134
};

cpp/src/arrow/compute/kernels/aggregate_test.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3693,6 +3693,11 @@ TEST_F(TestSkewKurtosis, Options) {
36933693
AssertSkewKurtosisInvalid(type, {"[]", "[]", "[]"}, options);
36943694
AssertSkewKurtosisAre(type, "[0, 1, null, 2]", options, 0.0, -1.5);
36953695
AssertSkewKurtosisAre(type, {"[0, 1]", "[]", "[null, 2]"}, options, 0.0, -1.5);
3696+
options.biased = false;
3697+
AssertSkewKurtosisInvalid(type, "[0, 1]", options);
3698+
AssertSkewKurtosisAre(type, {"[1, 2, 3]", "[40, null]"}, options, 1.9889477403978211,
3699+
3.9631931024230695);
3700+
options.biased = true;
36963701
options.min_count = 3;
36973702
AssertSkewKurtosisAre(type, "[0, 1, null, 2]", options, 0.0, -1.5);
36983703
AssertSkewKurtosisAre(type, {"[0, 1]", "[]", "[null, 2]"}, options, 0.0, -1.5);

cpp/src/arrow/compute/kernels/aggregate_var_std.cc

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ struct StatisticImpl : public ScalarAggregator {
176176
: out_type(out_type),
177177
stat_type(stat_type),
178178
skip_nulls(options.skip_nulls),
179+
biased(options.biased),
179180
min_count(options.min_count),
180181
ddof(0),
181182
state(moments_level_for_statistic(stat_type), decimal_scale, skip_nulls) {}
@@ -197,7 +198,9 @@ struct StatisticImpl : public ScalarAggregator {
197198

198199
Status Finalize(KernelContext*, Datum* out) override {
199200
if (state.count() <= ddof || state.count() < min_count ||
200-
(!state.all_valid && !skip_nulls)) {
201+
(!state.all_valid && !skip_nulls) ||
202+
(stat_type == StatisticType::Skew && !biased && state.count() < 3) ||
203+
(stat_type == StatisticType::Kurtosis && !biased && state.count() < 4)) {
201204
out->value = std::make_shared<DoubleScalar>();
202205
} else {
203206
switch (stat_type) {
@@ -208,10 +211,10 @@ struct StatisticImpl : public ScalarAggregator {
208211
out->value = std::make_shared<DoubleScalar>(state.moments.Variance(ddof));
209212
break;
210213
case StatisticType::Skew:
211-
out->value = std::make_shared<DoubleScalar>(state.moments.Skew());
214+
out->value = std::make_shared<DoubleScalar>(state.moments.Skew(biased));
212215
break;
213216
case StatisticType::Kurtosis:
214-
out->value = std::make_shared<DoubleScalar>(state.moments.Kurtosis());
217+
out->value = std::make_shared<DoubleScalar>(state.moments.Kurtosis(biased));
215218
break;
216219
default:
217220
return Status::NotImplemented("Unsupported statistic type ",
@@ -224,6 +227,7 @@ struct StatisticImpl : public ScalarAggregator {
224227
std::shared_ptr<DataType> out_type;
225228
StatisticType stat_type;
226229
bool skip_nulls;
230+
bool biased;
227231
uint32_t min_count;
228232
int ddof = 0;
229233
MomentsState<ArrowType> state;

cpp/src/arrow/compute/kernels/aggregate_var_std_internal.h

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,14 +84,31 @@ struct Moments {
8484

8585
double Stddev(int ddof) const { return sqrt(Variance(ddof)); }
8686

87-
double Skew() const {
88-
// This may return NaN for m2 == 0 and m3 == 0, which is expected
89-
return sqrt(count) * m3 / sqrt(m2 * m2 * m2);
87+
double Skew(bool biased = true) const {
88+
double result;
89+
// This may return NaN for m2 == 0 and m3 == 0, which is expected.
90+
if (biased) {
91+
result = sqrt(count) * m3 / sqrt(m2 * m2 * m2);
92+
} else {
93+
auto m2_avg = m2 / count;
94+
result = sqrt(count * (count - 1)) / (count - 2) * (m3 / count) /
95+
sqrt(m2_avg * m2_avg * m2_avg);
96+
}
97+
return result;
9098
}
9199

92-
double Kurtosis() const {
93-
// This may return NaN for m2 == 0 and m4 == 0, which is expected
94-
return count * m4 / (m2 * m2) - 3;
100+
double Kurtosis(bool biased = true) const {
101+
double result;
102+
// This may return NaN for m2 == 0 and m4 == 0, which is expected.
103+
if (biased) {
104+
result = count * m4 / (m2 * m2) - 3;
105+
} else {
106+
auto m2_avg = m2 / count;
107+
result = 1.0 / ((count - 2) * (count - 3)) *
108+
(((count * count) - 1.0) * (m4 / count) / (m2_avg * m2_avg) -
109+
3 * ((count - 1) * (count - 1)));
110+
}
111+
return result;
95112
}
96113

97114
void MergeFrom(int level, const Moments& other) { *this = Merge(level, *this, other); }

cpp/src/arrow/compute/kernels/hash_aggregate_numeric.cc

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -452,35 +452,37 @@ struct GroupedStatisticImpl : public GroupedAggregator {
452452
Status InitInternal(ExecContext* ctx, const KernelInitArgs& args,
453453
StatisticType stat_type, const VarianceOptions& options) {
454454
return InitInternal(ctx, args, stat_type, options.ddof, options.skip_nulls,
455-
options.min_count);
455+
/*biased=*/false, options.min_count);
456456
}
457457

458458
// Init helper for hash_skew and hash_kurtosis
459459
Status InitInternal(ExecContext* ctx, const KernelInitArgs& args,
460460
StatisticType stat_type, const SkewOptions& options) {
461461
return InitInternal(ctx, args, stat_type, /*ddof=*/0, options.skip_nulls,
462-
options.min_count);
462+
options.biased, options.min_count);
463463
}
464464

465465
Status InitInternal(ExecContext* ctx, const KernelInitArgs& args,
466-
StatisticType stat_type, int ddof, bool skip_nulls,
466+
StatisticType stat_type, int ddof, bool skip_nulls, bool biased,
467467
uint32_t min_count) {
468468
if constexpr (is_decimal_type<Type>::value) {
469469
int32_t decimal_scale =
470470
checked_cast<const DecimalType&>(*args.inputs[0].type).scale();
471-
return InitInternal(ctx, stat_type, decimal_scale, ddof, skip_nulls, min_count);
471+
return InitInternal(ctx, stat_type, decimal_scale, ddof, skip_nulls, biased,
472+
min_count);
472473
} else {
473-
return InitInternal(ctx, stat_type, /*decimal_scale=*/0, ddof, skip_nulls,
474+
return InitInternal(ctx, stat_type, /*decimal_scale=*/0, ddof, skip_nulls, biased,
474475
min_count);
475476
}
476477
}
477478

478479
Status InitInternal(ExecContext* ctx, StatisticType stat_type, int32_t decimal_scale,
479-
int ddof, bool skip_nulls, uint32_t min_count) {
480+
int ddof, bool skip_nulls, bool biased, uint32_t min_count) {
480481
stat_type_ = stat_type;
481482
moments_level_ = moments_level_for_statistic(stat_type_);
482483
decimal_scale_ = decimal_scale;
483484
skip_nulls_ = skip_nulls;
485+
biased_ = biased;
484486
min_count_ = min_count;
485487
ddof_ = ddof;
486488
ctx_ = ctx;
@@ -539,7 +541,7 @@ struct GroupedStatisticImpl : public GroupedAggregator {
539541
Status ConsumeGeneric(const ExecSpan& batch) {
540542
GroupedStatisticImpl<Type> state;
541543
RETURN_NOT_OK(state.InitInternal(ctx_, stat_type_, decimal_scale_, ddof_, skip_nulls_,
542-
min_count_));
544+
biased_, min_count_));
543545
RETURN_NOT_OK(state.Resize(num_groups_));
544546
int64_t* counts = state.counts_.mutable_data();
545547
double* means = state.means_.mutable_data();
@@ -612,7 +614,7 @@ struct GroupedStatisticImpl : public GroupedAggregator {
612614
var_std.resize(num_groups_);
613615
GroupedStatisticImpl<Type> state;
614616
RETURN_NOT_OK(state.InitInternal(ctx_, stat_type_, decimal_scale_, ddof_,
615-
skip_nulls_, min_count_));
617+
skip_nulls_, biased_, min_count_));
616618
RETURN_NOT_OK(state.Resize(num_groups_));
617619
int64_t* other_counts = state.counts_.mutable_data();
618620
double* other_means = state.means_.mutable_data();
@@ -739,7 +741,9 @@ struct GroupedStatisticImpl : public GroupedAggregator {
739741
const double* m3s = m3s_data();
740742
const double* m4s = m4s_data();
741743
for (int64_t i = 0; i < num_groups_; ++i) {
742-
if (counts[i] > ddof_ && counts[i] >= min_count_) {
744+
if (counts[i] > ddof_ && counts[i] >= min_count_ &&
745+
(stat_type_ != StatisticType::Skew || biased_ || counts[i] > 2) &&
746+
(stat_type_ != StatisticType::Kurtosis || biased_ || counts[i] > 3)) {
743747
const auto moments = Moments(counts[i], means[i], m2s[i], m3s[i], m4s[i]);
744748
switch (stat_type_) {
745749
case StatisticType::Var:
@@ -749,10 +753,10 @@ struct GroupedStatisticImpl : public GroupedAggregator {
749753
results[i] = moments.Stddev(ddof_);
750754
break;
751755
case StatisticType::Skew:
752-
results[i] = moments.Skew();
756+
results[i] = moments.Skew(biased_);
753757
break;
754758
case StatisticType::Kurtosis:
755-
results[i] = moments.Kurtosis();
759+
results[i] = moments.Kurtosis(biased_);
756760
break;
757761
default:
758762
return Status::NotImplemented("Statistic type ",
@@ -809,6 +813,7 @@ struct GroupedStatisticImpl : public GroupedAggregator {
809813
int moments_level_;
810814
int32_t decimal_scale_;
811815
bool skip_nulls_;
816+
bool biased_;
812817
uint32_t min_count_;
813818
int ddof_;
814819
int64_t num_groups_ = 0;

python/pyarrow/_compute.pyx

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1909,8 +1909,8 @@ class VarianceOptions(_VarianceOptions):
19091909

19101910

19111911
cdef class _SkewOptions(FunctionOptions):
1912-
def _set_options(self, skip_nulls, min_count):
1913-
self.wrapped.reset(new CSkewOptions(skip_nulls, min_count))
1912+
def _set_options(self, skip_nulls, biased, min_count):
1913+
self.wrapped.reset(new CSkewOptions(skip_nulls, biased, min_count))
19141914

19151915

19161916
class SkewOptions(_SkewOptions):
@@ -1920,11 +1920,14 @@ class SkewOptions(_SkewOptions):
19201920
Parameters
19211921
----------
19221922
{_skip_nulls_doc()}
1923+
biased : bool, default True
1924+
Whether the calculated value is biased.
1925+
If False, the value computed includes a correction factor to reduce bias.
19231926
{_min_count_doc(default=0)}
19241927
"""
19251928

1926-
def __init__(self, *, skip_nulls=True, min_count=0):
1927-
self._set_options(skip_nulls, min_count)
1929+
def __init__(self, *, skip_nulls=True, biased=True, min_count=0):
1930+
self._set_options(skip_nulls, biased, min_count)
19281931

19291932

19301933
cdef class _SplitOptions(FunctionOptions):

python/pyarrow/includes/libarrow.pxd

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2628,8 +2628,9 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil:
26282628

26292629
cdef cppclass CSkewOptions \
26302630
"arrow::compute::SkewOptions"(CFunctionOptions):
2631-
CSkewOptions(c_bool skip_nulls, uint32_t min_count)
2631+
CSkewOptions(c_bool skip_nulls, c_bool biased, uint32_t min_count)
26322632
c_bool skip_nulls
2633+
c_bool biased
26332634
uint32_t min_count
26342635

26352636
cdef cppclass CScalarAggregateOptions \

python/pyarrow/tests/test_compute.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,21 @@ def test_kurtosis():
457457
assert pc.kurtosis(data, min_count=4).as_py() is None
458458

459459

460+
@pytest.mark.parametrize("input, expected", (
461+
(
462+
[1.0, 2.0, 3.0, 40.0, None],
463+
{'skew': 1.988947740397821, 'kurtosis': 3.9631931024230695}
464+
),
465+
([1, 2, 40], {'skew': 1.7281098503730385, 'kurtosis': None}),
466+
([1, 40], {'skew': None, 'kurtosis': None}),
467+
))
468+
def test_unbiased_skew_and_kurtosis(input, expected):
469+
arrow_skew = pc.skew(input, skip_nulls=True, biased=False)
470+
arrow_kurtosis = pc.kurtosis(input, skip_nulls=True, biased=False)
471+
assert arrow_skew == pa.scalar(expected['skew'], type=pa.float64())
472+
assert arrow_kurtosis == pa.scalar(expected['kurtosis'], type=pa.float64())
473+
474+
460475
def test_count_substring():
461476
for (ty, offset) in [(pa.string(), pa.int32()),
462477
(pa.large_string(), pa.int64())]:

0 commit comments

Comments
 (0)