Skip to content

Commit 01ee0c0

Browse files
author
Rafał Hibner
committed
Add tdigest_reduce and tdigest_quantile
1 parent 6a7f4ef commit 01ee0c0

File tree

6 files changed

+438
-155
lines changed

6 files changed

+438
-155
lines changed

cpp/src/arrow/compute/api_aggregate.cc

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,8 @@ static auto kTDigestReduceOptionsType = GetFunctionOptionsType<TDigestReduceOpti
150150
DataMember("scaler", &TDigestReduceOptions::scaler));
151151
static auto kTDigestQuantileOptionsType = GetFunctionOptionsType<TDigestQuantileOptions>(
152152
DataMember("q", &TDigestQuantileOptions::q),
153-
DataMember("min_count", &TDigestQuantileOptions::min_count));
153+
DataMember("min_count", &TDigestQuantileOptions::min_count),
154+
DataMember("scaler", &TDigestQuantileOptions::scaler));
154155
static auto kPivotOptionsType = GetFunctionOptionsType<PivotWiderOptions>(
155156
DataMember("key_names", &PivotWiderOptions::key_names),
156157
DataMember("unexpected_key_behavior", &PivotWiderOptions::unexpected_key_behavior));
@@ -239,15 +240,19 @@ TDigestReduceOptions::TDigestReduceOptions(Scaler scaler)
239240
: FunctionOptions(internal::kTDigestReduceOptionsType), scaler{scaler} {}
240241
constexpr char TDigestReduceOptions::kTypeName[];
241242

242-
TDigestQuantileOptions::TDigestQuantileOptions(double q, uint32_t min_count)
243+
// Single-quantile constructor: `q` is stored as a one-element vector in the
// `q` member; `min_count` and `scaler` are copied into the members of the same
// name. The base FunctionOptions is bound to kTDigestQuantileOptionsType.
TDigestQuantileOptions::TDigestQuantileOptions(double q, uint32_t min_count,
244+
Scaler scaler)
243245
: FunctionOptions(internal::kTDigestQuantileOptionsType),
244246
q{q},
245-
min_count{min_count} {}
247+
min_count{min_count},
248+
scaler{scaler} {}
246249

247-
TDigestQuantileOptions::TDigestQuantileOptions(std::vector<double> q, uint32_t min_count)
250+
// Multi-quantile constructor: takes the list of quantile levels by value and
// moves it into the `q` member; `min_count` and `scaler` are copied into the
// members of the same name. The base FunctionOptions is bound to
// kTDigestQuantileOptionsType.
TDigestQuantileOptions::TDigestQuantileOptions(std::vector<double> q, uint32_t min_count,
251+
Scaler scaler)
248252
: FunctionOptions(internal::kTDigestQuantileOptionsType),
249253
q{std::move(q)},
250-
min_count{min_count} {}
254+
min_count{min_count},
255+
scaler{scaler} {}
251256
constexpr char TDigestReduceOptions::kTypeName[];
252257

253258
PivotWiderOptions::PivotWiderOptions(std::vector<std::string> key_names,
@@ -363,6 +368,16 @@ Result<Datum> TDigestMap(const Datum& value, const TDigestMapOptions& options,
363368
return CallFunction("tdigest_map", {value}, &options, ctx);
364369
}
365370

371+
// Convenience wrapper: invokes the compute function registered under the name
// "tdigest_reduce" with `value` as its single argument, forwarding `options`
// and `ctx`, and returns the result of CallFunction unchanged.
Result<Datum> TDigestReduce(const Datum& value, const TDigestReduceOptions& options,
372+
ExecContext* ctx) {
373+
return CallFunction("tdigest_reduce", {value}, &options, ctx);
374+
}
375+
376+
// Convenience wrapper: invokes the compute function registered under the name
// "tdigest_quantile" with `value` as its single argument, forwarding `options`
// and `ctx`, and returns the result of CallFunction unchanged.
Result<Datum> TDigestQuantile(const Datum& value, const TDigestQuantileOptions& options,
377+
ExecContext* ctx) {
378+
return CallFunction("tdigest_quantile", {value}, &options, ctx);
379+
}
380+
366381
Result<Datum> Index(const Datum& value, const IndexOptions& options, ExecContext* ctx) {
367382
return CallFunction("index", {value}, &options, ctx);
368383
}

cpp/src/arrow/compute/api_aggregate.h

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -244,15 +244,21 @@ class ARROW_EXPORT TDigestReduceOptions : public FunctionOptions {
244244
/// By default, returns the median value.
245245
class ARROW_EXPORT TDigestQuantileOptions : public FunctionOptions {
246246
public:
247-
explicit TDigestQuantileOptions(double q = 0.5, uint32_t min_count = 0);
248-
explicit TDigestQuantileOptions(std::vector<double> q, uint32_t min_count = 0);
247+
using Scaler = TDigestOptions::Scaler;
248+
249+
explicit TDigestQuantileOptions(double q = 0.5, uint32_t min_count = 0,
250+
Scaler scaler = Scaler::K1);
251+
explicit TDigestQuantileOptions(std::vector<double> q, uint32_t min_count = 0,
252+
Scaler scaler = Scaler::K1);
249253
static constexpr char const kTypeName[] = "TDigestQuantileOptions";
250254
static TDigestQuantileOptions Defaults() { return TDigestQuantileOptions{}; }
251255

252256
/// probability level of quantile must be between 0 and 1 inclusive
253257
std::vector<double> q;
254258
/// If less than this many non-null values are observed, emit null.
255259
uint32_t min_count;
260+
/// select scaler implementation
261+
Scaler scaler;
256262
};
257263

258264
/// \brief Control Pivot kernel behavior
@@ -643,7 +649,7 @@ Result<Datum> TDigest(const Datum& value,
643649
/// \brief Calculate centroids of a numeric array with T-Digest algorithm
644650
///
645651
/// \param[in] value input datum, expecting Array or ChunkedArray
646-
/// \param[in] options see TDigestOptions for more information
652+
/// \param[in] options see TDigestMapOptions for more information
647653
/// \param[in] ctx the function execution context, optional
648654
/// \return resulting struct of mean and weight arrays
649655
///
@@ -654,6 +660,36 @@ Result<Datum> TDigestMap(const Datum& value,
654660
const TDigestMapOptions& options = TDigestMapOptions::Defaults(),
655661
ExecContext* ctx = NULLPTR);
656662

663+
/// \brief Merge multiple centroid sets into one
664+
///
665+
/// \param[in] value input centroid sets, expecting Scalar, Array or ChunkedArray of
666+
/// centroid structs
/// \param[in] options see TDigestReduceOptions for more information
667+
/// \param[in] ctx the function execution context, optional
668+
/// \return resulting struct of mean and weight arrays
669+
///
670+
/// \since 22.0.0
671+
/// \note API not yet finalized
672+
ARROW_EXPORT
673+
Result<Datum> TDigestReduce(
674+
const Datum& value,
675+
const TDigestReduceOptions& options = TDigestReduceOptions::Defaults(),
676+
ExecContext* ctx = NULLPTR);
677+
678+
/// \brief Calculate the approximate quantiles using centroids with T-Digest algorithm
679+
///
680+
/// \param[in] value input centroid sets, expecting Scalar, Array or ChunkedArray of
681+
/// centroid structs
/// \param[in] options see TDigestQuantileOptions for more information
682+
/// \param[in] ctx the function execution context, optional
683+
/// \return the approximate quantiles computed from the input centroids
684+
///
685+
/// \since 22.0.0
686+
/// \note API not yet finalized
687+
ARROW_EXPORT
688+
Result<Datum> TDigestQuantile(
689+
const Datum& value,
690+
const TDigestQuantileOptions& options = TDigestQuantileOptions::Defaults(),
691+
ExecContext* ctx = NULLPTR);
692+
657693
/// \brief Find the first index of a value in an array.
658694
///
659695
/// \param[in] value The array to search.

0 commit comments

Comments
 (0)