From 48042a7963056d3163cffcc34376f8b628cd341a Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 18 Dec 2024 12:31:31 +0100 Subject: [PATCH 1/8] GH-45190: [C++][Compute] Add rank_percentile function --- cpp/src/arrow/compute/api_vector.cc | 13 + cpp/src/arrow/compute/api_vector.h | 24 ++ cpp/src/arrow/compute/kernels/vector_rank.cc | 307 ++++++++++++------ .../compute/kernels/vector_sort_internal.h | 7 + .../arrow/compute/kernels/vector_sort_test.cc | 158 ++++++++- docs/source/cpp/compute.rst | 38 ++- 6 files changed, 428 insertions(+), 119 deletions(-) diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index 22ecf1cc878..54e04298b6a 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -48,6 +48,7 @@ using compute::DictionaryEncodeOptions; using compute::FilterOptions; using compute::NullPlacement; using compute::RankOptions; +using compute::RankPercentileOptions; template <> struct EnumTraits @@ -151,6 +152,10 @@ static auto kRankOptionsType = GetFunctionOptionsType( DataMember("sort_keys", &RankOptions::sort_keys), DataMember("null_placement", &RankOptions::null_placement), DataMember("tiebreaker", &RankOptions::tiebreaker)); +static auto kRankPercentileOptionsType = GetFunctionOptionsType( + DataMember("sort_keys", &RankPercentileOptions::sort_keys), + DataMember("null_placement", &RankPercentileOptions::null_placement), + DataMember("factor", &RankPercentileOptions::factor)); static auto kPairwiseOptionsType = GetFunctionOptionsType( DataMember("periods", &PairwiseOptions::periods)); static auto kListFlattenOptionsType = GetFunctionOptionsType( @@ -228,6 +233,14 @@ RankOptions::RankOptions(std::vector sort_keys, NullPlacement null_plac tiebreaker(tiebreaker) {} constexpr char RankOptions::kTypeName[]; +RankPercentileOptions::RankPercentileOptions(std::vector sort_keys, + NullPlacement null_placement, double factor) + : FunctionOptions(internal::kRankPercentileOptionsType), + 
sort_keys(std::move(sort_keys)), + null_placement(null_placement), + factor(factor) {} +constexpr char RankPercentileOptions::kTypeName[]; + PairwiseOptions::PairwiseOptions(int64_t periods) : FunctionOptions(internal::kPairwiseOptionsType), periods(periods) {} constexpr char PairwiseOptions::kTypeName[]; diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index ada1665b3ec..5fd8241e9bd 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -195,6 +195,30 @@ class ARROW_EXPORT RankOptions : public FunctionOptions { Tiebreaker tiebreaker; }; +/// \brief Percentile rank options +class ARROW_EXPORT RankPercentileOptions : public FunctionOptions { + public: + explicit RankPercentileOptions(std::vector sort_keys = {}, + NullPlacement null_placement = NullPlacement::AtEnd, + double factor = 1.0); + /// Convenience constructor for array inputs + explicit RankPercentileOptions(SortOrder order, + NullPlacement null_placement = NullPlacement::AtEnd, + double factor = 1.0) + : RankPercentileOptions({SortKey("", order)}, null_placement, factor) {} + + static constexpr char const kTypeName[] = "RankPercentileOptions"; + static RankPercentileOptions Defaults() { return RankPercentileOptions(); } + + /// Column key(s) to order by and how to order by these sort keys. + std::vector sort_keys; + /// Whether nulls and NaNs are placed at the start or at the end + NullPlacement null_placement; + /// Factor to apply to the output. + /// Use 1.0 for results in (0, 1), 100.0 for percentages, etc. 
+ double factor; +}; + /// \brief Partitioning options for NthToIndices class ARROW_EXPORT PartitionNthOptions : public FunctionOptions { public: diff --git a/cpp/src/arrow/compute/kernels/vector_rank.cc b/cpp/src/arrow/compute/kernels/vector_rank.cc index 4fdc83788c6..50af9c6d599 100644 --- a/cpp/src/arrow/compute/kernels/vector_rank.cc +++ b/cpp/src/arrow/compute/kernels/vector_rank.cc @@ -15,9 +15,13 @@ // specific language governing permissions and limitations // under the License. +#include +#include + #include "arrow/compute/function.h" #include "arrow/compute/kernels/vector_sort_internal.h" #include "arrow/compute/registry.h" +#include "arrow/util/logging.h" namespace arrow::compute::internal { @@ -32,10 +36,6 @@ namespace { // is the same as the value at the previous sort index. constexpr uint64_t kDuplicateMask = 1ULL << 63; -constexpr bool NeedsDuplicates(RankOptions::Tiebreaker tiebreaker) { - return tiebreaker != RankOptions::First; -} - template void MarkDuplicates(const NullPartitionResult& sorted, ValueSelector&& value_selector) { using T = decltype(value_selector(int64_t{})); @@ -63,81 +63,145 @@ void MarkDuplicates(const NullPartitionResult& sorted, ValueSelector&& value_sel } } -Result CreateRankings(ExecContext* ctx, const NullPartitionResult& sorted, - const NullPlacement null_placement, - const RankOptions::Tiebreaker tiebreaker) { - auto length = sorted.overall_end() - sorted.overall_begin(); - ARROW_ASSIGN_OR_RAISE(auto rankings, - MakeMutableUInt64Array(length, ctx->memory_pool())); - auto out_begin = rankings->GetMutableValues(1); - uint64_t rank; - - auto is_duplicate = [](uint64_t index) { return (index & kDuplicateMask) != 0; }; - auto original_index = [](uint64_t index) { return index & ~kDuplicateMask; }; - - switch (tiebreaker) { - case RankOptions::Dense: { - rank = 0; - for (auto it = sorted.overall_begin(); it < sorted.overall_end(); ++it) { - if (!is_duplicate(*it)) { - ++rank; - } - out_begin[original_index(*it)] = rank; +struct 
RankingsEmitter { + virtual ~RankingsEmitter() = default; + virtual bool NeedsDuplicates() = 0; + virtual Result CreateRankings(ExecContext* ctx, + const NullPartitionResult& sorted) = 0; +}; + +// A helper class that emits rankings for the "rank_percentile" function +struct PercentileRankingsEmitter : public RankingsEmitter { + explicit PercentileRankingsEmitter(double factor) : factor_(factor) {} + + bool NeedsDuplicates() override { return true; } + + Result CreateRankings(ExecContext* ctx, + const NullPartitionResult& sorted) override { + const int64_t length = sorted.overall_end() - sorted.overall_begin(); + ARROW_ASSIGN_OR_RAISE(auto rankings, + MakeMutableFloat64Array(length, ctx->memory_pool())); + auto out_begin = rankings->GetMutableValues(1); + + auto is_duplicate = [](uint64_t index) { return (index & kDuplicateMask) != 0; }; + auto original_index = [](uint64_t index) { return index & ~kDuplicateMask; }; + + // The count of values strictly less than the value being considered + int64_t cum_freq = 0; + auto it = sorted.overall_begin(); + + while (it < sorted.overall_end()) { + // Look for a run of duplicate values + DCHECK(!is_duplicate(*it)); + auto run_end = it; + while (++run_end < sorted.overall_end() && is_duplicate(*run_end)) { + } + // The run length, i.e. 
the frequency of the current value + int64_t freq = run_end - it; + double percentile = (cum_freq + 0.5 * freq) * factor_ / static_cast(length); + // Output percentile rank values + for (; it < run_end; ++it) { + out_begin[original_index(*it)] = percentile; } - break; + cum_freq += freq; } + DCHECK_EQ(cum_freq, length); + return Datum(rankings); + } + + private: + const double factor_; +}; + +// A helper class that emits rankings for the "rank" function +struct OrdinalRankingsEmitter : public RankingsEmitter { + explicit OrdinalRankingsEmitter(RankOptions::Tiebreaker tiebreaker) + : tiebreaker_(tiebreaker) {} + + bool NeedsDuplicates() override { return tiebreaker_ != RankOptions::First; } - case RankOptions::First: { - rank = 0; - for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) { - // No duplicate marks expected for RankOptions::First - DCHECK(!is_duplicate(*it)); - out_begin[*it] = ++rank; + Result CreateRankings(ExecContext* ctx, + const NullPartitionResult& sorted) override { + const int64_t length = sorted.overall_end() - sorted.overall_begin(); + ARROW_ASSIGN_OR_RAISE(auto rankings, + MakeMutableUInt64Array(length, ctx->memory_pool())); + auto out_begin = rankings->GetMutableValues(1); + uint64_t rank; + + auto is_duplicate = [](uint64_t index) { return (index & kDuplicateMask) != 0; }; + auto original_index = [](uint64_t index) { return index & ~kDuplicateMask; }; + + switch (tiebreaker_) { + case RankOptions::Dense: { + rank = 0; + for (auto it = sorted.overall_begin(); it < sorted.overall_end(); ++it) { + if (!is_duplicate(*it)) { + ++rank; + } + out_begin[original_index(*it)] = rank; + } + break; } - break; - } - case RankOptions::Min: { - rank = 0; - for (auto it = sorted.overall_begin(); it < sorted.overall_end(); ++it) { - if (!is_duplicate(*it)) { - rank = (it - sorted.overall_begin()) + 1; + case RankOptions::First: { + rank = 0; + for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) { + // No duplicate 
marks expected for RankOptions::First + DCHECK(!is_duplicate(*it)); + out_begin[*it] = ++rank; } - out_begin[original_index(*it)] = rank; + break; } - break; - } - case RankOptions::Max: { - rank = length; - for (auto it = sorted.overall_end() - 1; it >= sorted.overall_begin(); --it) { - out_begin[original_index(*it)] = rank; - // If the current index isn't marked as duplicate, then it's the last - // tie in a row (since we iterate in reverse order), so update rank - // for the next row of ties. - if (!is_duplicate(*it)) { - rank = it - sorted.overall_begin(); + case RankOptions::Min: { + rank = 0; + for (auto it = sorted.overall_begin(); it < sorted.overall_end(); ++it) { + if (!is_duplicate(*it)) { + rank = (it - sorted.overall_begin()) + 1; + } + out_begin[original_index(*it)] = rank; } + break; + } + + case RankOptions::Max: { + rank = length; + for (auto it = sorted.overall_end() - 1; it >= sorted.overall_begin(); --it) { + out_begin[original_index(*it)] = rank; + // If the current index isn't marked as duplicate, then it's the last + // tie in a row (since we iterate in reverse order), so update rank + // for the next row of ties. 
+ if (!is_duplicate(*it)) { + rank = it - sorted.overall_begin(); + } + } + break; } - break; } + + return Datum(rankings); } - return Datum(rankings); -} + private: + const RankOptions::Tiebreaker tiebreaker_; +}; const RankOptions* GetDefaultRankOptions() { static const auto kDefaultRankOptions = RankOptions::Defaults(); return &kDefaultRankOptions; } +const RankPercentileOptions* GetDefaultPercentileRankOptions() { + static const auto kDefaultPercentileRankOptions = RankPercentileOptions::Defaults(); + return &kDefaultPercentileRankOptions; +} + template class RankerMixin : public TypeVisitor { public: RankerMixin(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end, const InputType& input, const SortOrder order, - const NullPlacement null_placement, - const RankOptions::Tiebreaker tiebreaker, Datum* output) + const NullPlacement null_placement, RankingsEmitter* emitter) : TypeVisitor(), ctx_(ctx), indices_begin_(indices_begin), @@ -145,15 +209,17 @@ class RankerMixin : public TypeVisitor { input_(input), order_(order), null_placement_(null_placement), - tiebreaker_(tiebreaker), physical_type_(GetPhysicalType(input.type())), - output_(output) {} + emitter_(emitter) {} - Status Run() { return physical_type_->Accept(this); } + Result Run() { + RETURN_NOT_OK(physical_type_->Accept(this)); + return emitter_->CreateRankings(ctx_, sorted_); + } -#define VISIT(TYPE) \ - Status Visit(const TYPE& type) { \ - return static_cast(this)->template RankInternal(); \ +#define VISIT(TYPE) \ + Status Visit(const TYPE& type) { \ + return static_cast(this)->template SortAndMarkDuplicates(); \ } VISIT_SORTABLE_PHYSICAL_TYPES(VISIT) @@ -167,9 +233,9 @@ class RankerMixin : public TypeVisitor { const InputType& input_; const SortOrder order_; const NullPlacement null_placement_; - const RankOptions::Tiebreaker tiebreaker_; const std::shared_ptr physical_type_; - Datum* output_; + RankingsEmitter* emitter_; + NullPartitionResult sorted_{}; }; template @@ -181,26 +247,23 @@ 
class Ranker : public RankerMixin> { using RankerMixin::RankerMixin; template - Status RankInternal() { + Status SortAndMarkDuplicates() { using GetView = GetViewType; using ArrayType = typename TypeTraits::ArrayType; ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_)); ArrayType array(input_.data()); - ARROW_ASSIGN_OR_RAISE(NullPartitionResult sorted, + ARROW_ASSIGN_OR_RAISE(sorted_, array_sorter(indices_begin_, indices_end_, array, 0, ArraySortOptions(order_, null_placement_), ctx_)); - if (NeedsDuplicates(tiebreaker_)) { + if (emitter_->NeedsDuplicates()) { auto value_selector = [&array](int64_t index) { return GetView::LogicalValue(array.GetView(index)); }; - MarkDuplicates(sorted, value_selector); + MarkDuplicates(sorted_, value_selector); } - ARROW_ASSIGN_OR_RAISE(*output_, - CreateRankings(ctx_, sorted, null_placement_, tiebreaker_)); - return Status::OK(); } }; @@ -214,26 +277,21 @@ class Ranker : public RankerMixin - Status RankInternal() { + Status SortAndMarkDuplicates() { if (physical_chunks_.empty()) { return Status::OK(); } - ARROW_ASSIGN_OR_RAISE( - NullPartitionResult sorted, - SortChunkedArray(ctx_, indices_begin_, indices_end_, physical_type_, - physical_chunks_, order_, null_placement_)); - - if (NeedsDuplicates(tiebreaker_)) { + sorted_, SortChunkedArray(ctx_, indices_begin_, indices_end_, physical_type_, + physical_chunks_, order_, null_placement_)); + if (emitter_->NeedsDuplicates()) { const auto arrays = GetArrayPointers(physical_chunks_); auto value_selector = [resolver = ChunkedArrayResolver(span(arrays))](int64_t index) { return resolver.Resolve(index).Value(); }; - MarkDuplicates(sorted, value_selector); + MarkDuplicates(sorted_, value_selector); } - ARROW_ASSIGN_OR_RAISE(*output_, - CreateRankings(ctx_, sorted, null_placement_, tiebreaker_)); return Status::OK(); } @@ -242,7 +300,7 @@ class Ranker : public RankerMixin ExecuteImpl(const std::vector& args, const FunctionOptions* options, ExecContext* ctx) const 
override { - const auto& rank_options = checked_cast(*options); switch (args[0].kind()) { case Datum::ARRAY: { - return Rank(*args[0].make_array(), rank_options, ctx); + return Rank(*args[0].make_array(), *options, ctx); } break; case Datum::CHUNKED_ARRAY: { - return Rank(*args[0].chunked_array(), rank_options, ctx); + return Rank(*args[0].chunked_array(), *options, ctx); } break; default: break; @@ -278,14 +347,19 @@ class RankMetaFunction : public MetaFunction { args[0].ToString()); } - private: + protected: + struct UnpackedOptions { + SortOrder order{SortOrder::Ascending}; + NullPlacement null_placement; + std::unique_ptr emitter; + }; + + virtual UnpackedOptions UnpackOptions(const FunctionOptions&) const = 0; + template - static Result Rank(const T& input, const RankOptions& options, - ExecContext* ctx) { - SortOrder order = SortOrder::Ascending; - if (!options.sort_keys.empty()) { - order = options.sort_keys[0].order; - } + Result Rank(const T& input, const FunctionOptions& function_options, + ExecContext* ctx) const { + auto options = UnpackOptions(function_options); int64_t length = input.length(); ARROW_ASSIGN_OR_RAISE(auto indices, @@ -294,11 +368,45 @@ class RankMetaFunction : public MetaFunction { auto* indices_end = indices_begin + length; std::iota(indices_begin, indices_end, 0); - Datum output; - Ranker ranker(ctx, indices_begin, indices_end, input, order, - options.null_placement, options.tiebreaker, &output); - ARROW_RETURN_NOT_OK(ranker.Run()); - return output; + Ranker ranker(ctx, indices_begin, indices_end, input, options.order, + options.null_placement, options.emitter.get()); + return ranker.Run(); + } +}; + +class RankMetaFunction : public RankMetaFunctionBase { + public: + RankMetaFunction() + : RankMetaFunctionBase("rank", Arity::Unary(), rank_doc, GetDefaultRankOptions()) {} + + protected: + UnpackedOptions UnpackOptions(const FunctionOptions& function_options) const override { + const auto& options = checked_cast(function_options); + 
UnpackedOptions unpacked{ + SortOrder::Ascending, options.null_placement, + std::make_unique(options.tiebreaker)}; + if (!options.sort_keys.empty()) { + unpacked.order = options.sort_keys[0].order; + } + return unpacked; + } +}; + +class RankPercentileMetaFunction : public RankMetaFunctionBase { + public: + RankPercentileMetaFunction() + : RankMetaFunctionBase("rank_percentile", Arity::Unary(), rank_percentile_doc, + GetDefaultPercentileRankOptions()) {} + + protected: + UnpackedOptions UnpackOptions(const FunctionOptions& function_options) const override { + const auto& options = checked_cast(function_options); + UnpackedOptions unpacked{SortOrder::Ascending, options.null_placement, + std::make_unique(options.factor)}; + if (!options.sort_keys.empty()) { + unpacked.order = options.sort_keys[0].order; + } + return unpacked; } }; @@ -306,6 +414,7 @@ class RankMetaFunction : public MetaFunction { void RegisterVectorRank(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::make_shared())); + DCHECK_OK(registry->AddFunction(std::make_shared())); } } // namespace arrow::compute::internal diff --git a/cpp/src/arrow/compute/kernels/vector_sort_internal.h b/cpp/src/arrow/compute/kernels/vector_sort_internal.h index cc6b7834a30..6288aa26eaa 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_internal.h +++ b/cpp/src/arrow/compute/kernels/vector_sort_internal.h @@ -806,4 +806,11 @@ inline Result> MakeMutableUInt64Array( return ArrayData::Make(uint64(), length, {nullptr, std::move(data)}, /*null_count=*/0); } +inline Result> MakeMutableFloat64Array( + int64_t length, MemoryPool* memory_pool) { + auto buffer_size = length * sizeof(double); + ARROW_ASSIGN_OR_RAISE(auto data, AllocateBuffer(buffer_size, memory_pool)); + return ArrayData::Make(float64(), length, {nullptr, std::move(data)}, /*null_count=*/0); +} + } // namespace arrow::compute::internal diff --git a/cpp/src/arrow/compute/kernels/vector_sort_test.cc 
b/cpp/src/arrow/compute/kernels/vector_sort_test.cc index 7f0ef641f6c..15ee4c013f7 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort_test.cc @@ -2205,9 +2205,9 @@ TEST_F(TestNestedSortIndices, SortRecordBatch) { TestSort(GetRecordBatch()); } TEST_F(TestNestedSortIndices, SortTable) { TestSort(GetTable()); } // ---------------------------------------------------------------------- -// Tests for Rank +// Tests for Rank and Percentile Rank -class TestRank : public ::testing::Test { +class BaseTestRank : public ::testing::Test { protected: // Create several test datums from `array`. One of which is the unmodified Array // while the rest are chunked variants based on it. @@ -2236,6 +2236,11 @@ class TestRank : public ::testing::Test { datums_ = {chunked_array}; } + DatumVector datums_; +}; + +class TestRank : public BaseTestRank { + protected: static void AssertRank(const DatumVector& datums, SortOrder order, NullPlacement null_placement, RankOptions::Tiebreaker tiebreaker, const std::shared_ptr& expected) { @@ -2310,8 +2315,6 @@ class TestRank : public ::testing::Test { AssertRank(SortOrder::Descending, NullPlacement::AtStart, RankOptions::Dense, ArrayFromJSON(uint64(), "[3, 4, 2, 1, 2, 1, 4]")); } - - DatumVector datums_; }; TEST_F(TestRank, Real) { @@ -2466,5 +2469,152 @@ TEST_F(TestRank, EmptyChunks) { } } +class TestRankPercentile : public BaseTestRank { + public: + void AssertRankPercentile(const DatumVector& datums, SortOrder order, + NullPlacement null_placement, double factor, + const std::shared_ptr& expected) { + const std::vector sort_keys{SortKey("foo", order)}; + RankPercentileOptions options(sort_keys, null_placement, factor); + ARROW_SCOPED_TRACE("options = ", options.ToString()); + for (const auto& datum : datums) { + ASSERT_OK_AND_ASSIGN(auto actual, + CallFunction("rank_percentile", {datum}, &options)); + ValidateOutput(actual); + AssertDatumsEqual(expected, actual, /*verbose=*/true); + } + 
} + + void AssertRankPercentile(const DatumVector& datums, SortOrder order, + NullPlacement null_placement, double factor, + const std::string& expected) { + AssertRankPercentile(datums, order, null_placement, factor, + ArrayFromJSON(float64(), expected)); + } + + void AssertRankPercentile(SortOrder order, NullPlacement null_placement, double factor, + const std::shared_ptr& expected) { + AssertRankPercentile(datums_, order, null_placement, factor, expected); + } + + void AssertRankPercentile(SortOrder order, NullPlacement null_placement, double factor, + const std::string& expected) { + AssertRankPercentile(datums_, order, null_placement, factor, + ArrayFromJSON(float64(), expected)); + } + + void AssertRankPercentileEmpty(std::shared_ptr type) { + for (auto null_placement : AllNullPlacements()) { + for (auto order : AllOrders()) { + AssertRankPercentile({ArrayFromJSON(type, "[]")}, order, null_placement, + /*factor=*/1.0, "[]"); + AssertRankPercentile({ArrayFromJSON(type, "[null]")}, order, null_placement, + /*factor=*/1.0, "[0.5]"); + AssertRankPercentile({ArrayFromJSON(type, "[null]")}, order, null_placement, + /*factor=*/10.0, "[5]"); + AssertRankPercentile({ArrayFromJSON(type, "[null, null, null]")}, order, + null_placement, /*factor=*/1.0, "[0.5, 0.5, 0.5]"); + AssertRankPercentile({ArrayFromJSON(type, "[null, null, null]")}, order, + null_placement, /*factor=*/100.0, "[50, 50, 50]"); + } + } + } + + // Expecting an input ordered like [1, 2, 1, 2, 1] + void AssertRankPercentile_12121() { + for (auto null_placement : AllNullPlacements()) { + AssertRankPercentile(SortOrder::Ascending, null_placement, 100.0, + "[30.0, 80.0, 30.0, 80.0, 30.0]"); + AssertRankPercentile(SortOrder::Descending, null_placement, 100.0, + "[70.0, 20.0, 70.0, 20.0, 70.0]"); + } + } + + // Expecting an input ordered like [null, 1, null, 2, null] + void AssertRankPercentile_N1N2N() { + AssertRankPercentile(SortOrder::Ascending, NullPlacement::AtStart, 1.0, + "[0.3, 0.7, 0.3, 0.9, 0.3]"); 
+ AssertRankPercentile(SortOrder::Ascending, NullPlacement::AtEnd, 1.0, + "[0.7, 0.1, 0.7, 0.3, 0.7]"); + AssertRankPercentile(SortOrder::Descending, NullPlacement::AtStart, 1.0, + "[0.3, 0.9, 0.3, 0.7, 0.3]"); + AssertRankPercentile(SortOrder::Descending, NullPlacement::AtEnd, 1.0, + "[0.7, 0.3, 0.7, 0.1, 0.7]"); + } + + void AssertRankPercentileNumeric(std::shared_ptr type) { + ARROW_SCOPED_TRACE("type = ", type->ToString()); + AssertRankPercentileEmpty(type); + + // Reproduce the example from https://en.wikipedia.org/wiki/Percentile_rank + SetInput(ArrayFromJSON(type, "[7, 5, 5, 4, 4, 3, 3, 3, 2, 1]")); + for (auto null_placement : AllNullPlacements()) { + AssertRankPercentile(SortOrder::Ascending, null_placement, 10.0, + "[9.5, 8.0, 8.0, 6.0, 6.0, 3.5, 3.5, 3.5, 1.5, 0.5]"); + AssertRankPercentile(SortOrder::Ascending, null_placement, 100.0, + "[95, 80, 80, 60, 60, 35, 35, 35, 15, 5]"); + AssertRankPercentile(SortOrder::Descending, null_placement, 10.0, + "[0.5, 2.0, 2.0, 4.0, 4.0, 6.5, 6.5, 6.5, 8.5, 9.5]"); + AssertRankPercentile(SortOrder::Descending, null_placement, 100.0, + "[5, 20, 20, 40, 40, 65, 65, 65, 85, 95]"); + } + + // With nulls + SetInput(ArrayFromJSON(type, "[null, 1, null, 2, null]")); + AssertRankPercentile_N1N2N(); + } + + void AssertRankPercentileBinaryLike(std::shared_ptr type) { + ARROW_SCOPED_TRACE("type = ", type->ToString()); + AssertRankPercentileEmpty(type); + + SetInput(ArrayFromJSON(type, R"(["", "ab", "", "ab", ""])")); + AssertRankPercentile_12121(); + // With nulls + SetInput(ArrayFromJSON(type, R"([null, "", null, "ab", null])")); + AssertRankPercentile_N1N2N(); + } +}; + +TEST_F(TestRankPercentile, Real) { + for (auto type : ::arrow::FloatingPointTypes()) { + AssertRankPercentileNumeric(type); + } +} + +TEST_F(TestRankPercentile, Integral) { + for (auto type : ::arrow::IntTypes()) { + AssertRankPercentileNumeric(type); + } +} + +TEST_F(TestRankPercentile, Boolean) { + auto type = boolean(); + AssertRankPercentileEmpty(type); + 
+ SetInput(ArrayFromJSON(type, "[false, true, false, true, false]")); + AssertRankPercentile_12121(); + // With nulls + SetInput(ArrayFromJSON(type, "[null, false, null, true, null]")); + AssertRankPercentile_N1N2N(); +} + +TEST_F(TestRankPercentile, BinaryLike) { + for (auto type : BaseBinaryTypes()) { + AssertRankPercentileBinaryLike(type); + } +} + +TEST_F(TestRankPercentile, FixedSizeBinary) { + auto type = fixed_size_binary(3); + AssertRankPercentileEmpty(type); + + SetInput(ArrayFromJSON(type, R"(["abc", "def", "abc", "def", "abc"])")); + AssertRankPercentile_12121(); + // With nulls + SetInput(ArrayFromJSON(type, R"([null, "abc", null, "def", null])")); + AssertRankPercentile_N1N2N(); +} + } // namespace compute } // namespace arrow diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 92f3e440391..64fda25851d 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -1796,19 +1796,21 @@ in the respective option classes. Binary- and String-like inputs are ordered lexicographically as bytestrings, even for String types. 
-+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ -| Function name | Arity | Input types | Output type | Options class | Notes | -+=======================+============+=========================================================+===================+================================+================+ -| array_sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`ArraySortOptions` | \(1) \(2) | -+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ -| partition_nth_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`PartitionNthOptions` | \(3) | -+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ -| rank | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`RankOptions` | \(4) | -+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ -| select_k_unstable | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`SelectKOptions` | \(5) \(6) | -+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ -| sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`SortOptions` | \(1) \(5) | -+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ 
++-----------------------+------------+---------------------------------------------------------+-------------------+---------------------------------+----------------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++=======================+============+=========================================================+===================+=================================+================+ +| array_sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`ArraySortOptions` | \(1) \(2) | ++-----------------------+------------+---------------------------------------------------------+-------------------+---------------------------------+----------------+ +| partition_nth_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`PartitionNthOptions` | \(3) | ++-----------------------+------------+---------------------------------------------------------+-------------------+---------------------------------+----------------+ +| rank | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`RankOptions` | \(4) | ++-----------------------+------------+---------------------------------------------------------+-------------------+---------------------------------+----------------+ +| rank_percentile | Unary | Boolean, Numeric, Temporal, Binary- and String-like | Float64 | :struct:`RankPercentileOptions` | \(5) | ++-----------------------+------------+---------------------------------------------------------+-------------------+---------------------------------+----------------+ +| select_k_unstable | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`SelectKOptions` | \(6) \(7) | ++-----------------------+------------+---------------------------------------------------------+-------------------+---------------------------------+----------------+ +| sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | 
UInt64 | :struct:`SortOptions` | \(1) \(6) | ++-----------------------+------------+---------------------------------------------------------+-------------------+---------------------------------+----------------+ * \(1) The output is an array of indices into the input, that define a @@ -1823,13 +1825,17 @@ in the respective option classes. :func:`std::nth_element`). *N* is given in :member:`PartitionNthOptions::pivot`. -* \(4) The output is a one-based numerical array of ranks +* \(4) The output is a one-based numerical array of ranks. -* \(5) The input can be an array, chunked array, record batch or +* \(5) The output is an array of quantiles between 0 and a constant *factor*. + The *factor* can be configured in :class:`RankPercentileOptions` + (use 100.0 for a percentile rank). + +* \(6) The input can be an array, chunked array, record batch or table. If the input is a record batch or table, one or more sort keys must be specified. -* \(6) The output is an array of indices into the input, that define a +* \(7) The output is an array of indices into the input, that define a non-stable sort of the input. .. 
_cpp-compute-vector-structural-transforms: From d7260bca1be08ab94600492eea5fe595ab0973f0 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Fri, 17 Jan 2025 18:00:20 +0800 Subject: [PATCH 2/8] Refine the structure WIP --- cpp/src/arrow/compute/kernels/vector_rank.cc | 200 ++++++++++++++++++- 1 file changed, 194 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_rank.cc b/cpp/src/arrow/compute/kernels/vector_rank.cc index 50af9c6d599..6e79da6423b 100644 --- a/cpp/src/arrow/compute/kernels/vector_rank.cc +++ b/cpp/src/arrow/compute/kernels/vector_rank.cc @@ -299,6 +299,161 @@ class Ranker : public RankerMixin +Result SortAndMarkDup(const Array& input, uint64_t* indices_begin, + uint64_t* indices_end, SortOrder order, + NullPlacement null_placement, + bool needs_duplicates, ExecContext* ctx) { + using GetView = GetViewType; + using ArrayType = typename TypeTraits::ArrayType; + + ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*input.type())); + + ArrayType array(input.data()); + ARROW_ASSIGN_OR_RAISE(auto sorted, + array_sorter(indices_begin, indices_end, array, 0, + ArraySortOptions(order, null_placement), ctx)); + + if (needs_duplicates) { + auto value_selector = [&array](int64_t index) { + return GetView::LogicalValue(array.GetView(index)); + }; + MarkDuplicates(sorted, value_selector); + } + return sorted; +} + +template +Result SortAndMarkDup(const ChunkedArray& input, + uint64_t* indices_begin, uint64_t* indices_end, + SortOrder order, NullPlacement null_placement, + bool needs_duplicates, ExecContext* ctx) { + auto physical_type = GetPhysicalType(input.type()); + auto physical_chunks = GetPhysicalChunks(input, physical_type); + if (physical_chunks.empty()) { + return NullPartitionResult{}; + } + ARROW_ASSIGN_OR_RAISE(auto sorted, + SortChunkedArray(ctx, indices_begin, indices_end, physical_type, + physical_chunks, order, null_placement)); + if (needs_duplicates) { + const auto arrays = GetArrayPointers(physical_chunks); + auto 
value_selector = [resolver = ChunkedArrayResolver(span(arrays))](int64_t index) { + return resolver.Resolve(index).Value(); + }; + MarkDuplicates(sorted, value_selector); + } + return sorted; +} + +struct PercentileRanker { + explicit PercentileRanker(double factor) : factor_(factor) {} + + Result CreateRankings(ExecContext* ctx, const NullPartitionResult& sorted) { + const int64_t length = sorted.overall_end() - sorted.overall_begin(); + ARROW_ASSIGN_OR_RAISE(auto rankings, + MakeMutableFloat64Array(length, ctx->memory_pool())); + auto out_begin = rankings->GetMutableValues(1); + + auto is_duplicate = [](uint64_t index) { return (index & kDuplicateMask) != 0; }; + auto original_index = [](uint64_t index) { return index & ~kDuplicateMask; }; + + // The count of values strictly less than the value being considered + int64_t cum_freq = 0; + auto it = sorted.overall_begin(); + + while (it < sorted.overall_end()) { + // Look for a run of duplicate values + DCHECK(!is_duplicate(*it)); + auto run_end = it; + while (++run_end < sorted.overall_end() && is_duplicate(*run_end)) { + } + // The run length, i.e. 
the frequency of the current value + int64_t freq = run_end - it; + double percentile = (cum_freq + 0.5 * freq) * factor_ / static_cast(length); + // Output percentile rank values + for (; it < run_end; ++it) { + out_begin[original_index(*it)] = percentile; + } + cum_freq += freq; + } + DCHECK_EQ(cum_freq, length); + return Datum(rankings); + } + + private: + const double factor_; +}; + +// A helper class that emits rankings for the "rank" function +struct OrdinalRanker { + explicit OrdinalRanker(RankOptions::Tiebreaker tiebreaker) : tiebreaker_(tiebreaker) {} + + Result CreateRankings(ExecContext* ctx, const NullPartitionResult& sorted) { + const int64_t length = sorted.overall_end() - sorted.overall_begin(); + ARROW_ASSIGN_OR_RAISE(auto rankings, + MakeMutableUInt64Array(length, ctx->memory_pool())); + auto out_begin = rankings->GetMutableValues(1); + uint64_t rank; + + auto is_duplicate = [](uint64_t index) { return (index & kDuplicateMask) != 0; }; + auto original_index = [](uint64_t index) { return index & ~kDuplicateMask; }; + + switch (tiebreaker_) { + case RankOptions::Dense: { + rank = 0; + for (auto it = sorted.overall_begin(); it < sorted.overall_end(); ++it) { + if (!is_duplicate(*it)) { + ++rank; + } + out_begin[original_index(*it)] = rank; + } + break; + } + + case RankOptions::First: { + rank = 0; + for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) { + // No duplicate marks expected for RankOptions::First + DCHECK(!is_duplicate(*it)); + out_begin[*it] = ++rank; + } + break; + } + + case RankOptions::Min: { + rank = 0; + for (auto it = sorted.overall_begin(); it < sorted.overall_end(); ++it) { + if (!is_duplicate(*it)) { + rank = (it - sorted.overall_begin()) + 1; + } + out_begin[original_index(*it)] = rank; + } + break; + } + + case RankOptions::Max: { + rank = length; + for (auto it = sorted.overall_end() - 1; it >= sorted.overall_begin(); --it) { + out_begin[original_index(*it)] = rank; + // If the current index isn't marked 
as duplicate, then it's the last + // tie in a row (since we iterate in reverse order), so update rank + // for the next row of ties. + if (!is_duplicate(*it)) { + rank = it - sorted.overall_begin(); + } + } + break; + } + } + + return Datum(rankings); + } + + private: + const RankOptions::Tiebreaker tiebreaker_; +}; + const FunctionDoc rank_doc( "Compute ordinal ranks of an array (1-based)", ("This function computes a rank of the input array.\n" @@ -324,6 +479,7 @@ const FunctionDoc rank_percentile_doc( "in RankPercentileOptions."), {"input"}, "RankPercentileOptions"); +template class RankMetaFunctionBase : public MetaFunction { public: using MetaFunction::MetaFunction; @@ -359,7 +515,13 @@ class RankMetaFunctionBase : public MetaFunction { template Result Rank(const T& input, const FunctionOptions& function_options, ExecContext* ctx) const { - auto options = UnpackOptions(function_options); + const auto& options = + checked_cast(function_options); + + // SortOrder order = SortOrder::Ascending; + // if (!options.sort_keys.empty()) { + // order = options.sort_keys[0].order; + // } int64_t length = input.length(); ARROW_ASSIGN_OR_RAISE(auto indices, @@ -368,17 +530,34 @@ class RankMetaFunctionBase : public MetaFunction { auto* indices_end = indices_begin + length; std::iota(indices_begin, indices_end, 0); - Ranker ranker(ctx, indices_begin, indices_end, input, options.order, - options.null_placement, options.emitter.get()); - return ranker.Run(); + // auto needs_duplicates = static_cast(this)->NeedsDuplicates(options); + // ARROW_ASSIGN_OR_RAISE(auto sorted, + // SortAndMarkDup(input, indices_begin, indices_end, order, + // options.null_placement, needs_duplicates, + // ctx)); + NullPartitionResult sorted; + auto ranker = static_cast(this)->GetRanker(options); + + return ranker.CreateRankings(ctx, sorted); } }; -class RankMetaFunction : public RankMetaFunctionBase { +class RankMetaFunction : public RankMetaFunctionBase { public: + using FunctionOptionsType = 
RankOptions; + using RankerType = OrdinalRanker; + RankMetaFunction() : RankMetaFunctionBase("rank", Arity::Unary(), rank_doc, GetDefaultRankOptions()) {} + bool NeedsDuplicates(const RankOptions& options) const { + return options.tiebreaker != RankOptions::First; + } + + RankerType GetRanker(const RankOptions& options) const { + return RankerType(options.tiebreaker); + } + protected: UnpackedOptions UnpackOptions(const FunctionOptions& function_options) const override { const auto& options = checked_cast(function_options); @@ -392,12 +571,21 @@ class RankMetaFunction : public RankMetaFunctionBase { } }; -class RankPercentileMetaFunction : public RankMetaFunctionBase { +class RankPercentileMetaFunction : public RankMetaFunctionBase { public: + using FunctionOptionsType = RankPercentileOptions; + using RankerType = PercentileRanker; + RankPercentileMetaFunction() : RankMetaFunctionBase("rank_percentile", Arity::Unary(), rank_percentile_doc, GetDefaultPercentileRankOptions()) {} + bool NeedsDuplicates(const RankPercentileOptions&) const { return true; } + + RankerType GetRanker(const RankPercentileOptions& options) const { + return RankerType(options.factor); + } + protected: UnpackedOptions UnpackOptions(const FunctionOptions& function_options) const override { const auto& options = checked_cast(function_options); From d9eb3e4b73283c0ac505b27b019a41742eaf0618 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Fri, 17 Jan 2025 21:36:50 +0800 Subject: [PATCH 3/8] Refinement done --- cpp/src/arrow/compute/kernels/vector_rank.cc | 375 ++++--------------- 1 file changed, 81 insertions(+), 294 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_rank.cc b/cpp/src/arrow/compute/kernels/vector_rank.cc index 6e79da6423b..6af7420626c 100644 --- a/cpp/src/arrow/compute/kernels/vector_rank.cc +++ b/cpp/src/arrow/compute/kernels/vector_rank.cc @@ -63,129 +63,6 @@ void MarkDuplicates(const NullPartitionResult& sorted, ValueSelector&& value_sel } } -struct RankingsEmitter { 
- virtual ~RankingsEmitter() = default; - virtual bool NeedsDuplicates() = 0; - virtual Result CreateRankings(ExecContext* ctx, - const NullPartitionResult& sorted) = 0; -}; - -// A helper class that emits rankings for the "rank_percentile" function -struct PercentileRankingsEmitter : public RankingsEmitter { - explicit PercentileRankingsEmitter(double factor) : factor_(factor) {} - - bool NeedsDuplicates() override { return true; } - - Result CreateRankings(ExecContext* ctx, - const NullPartitionResult& sorted) override { - const int64_t length = sorted.overall_end() - sorted.overall_begin(); - ARROW_ASSIGN_OR_RAISE(auto rankings, - MakeMutableFloat64Array(length, ctx->memory_pool())); - auto out_begin = rankings->GetMutableValues(1); - - auto is_duplicate = [](uint64_t index) { return (index & kDuplicateMask) != 0; }; - auto original_index = [](uint64_t index) { return index & ~kDuplicateMask; }; - - // The count of values strictly less than the value being considered - int64_t cum_freq = 0; - auto it = sorted.overall_begin(); - - while (it < sorted.overall_end()) { - // Look for a run of duplicate values - DCHECK(!is_duplicate(*it)); - auto run_end = it; - while (++run_end < sorted.overall_end() && is_duplicate(*run_end)) { - } - // The run length, i.e. 
the frequency of the current value - int64_t freq = run_end - it; - double percentile = (cum_freq + 0.5 * freq) * factor_ / static_cast(length); - // Output percentile rank values - for (; it < run_end; ++it) { - out_begin[original_index(*it)] = percentile; - } - cum_freq += freq; - } - DCHECK_EQ(cum_freq, length); - return Datum(rankings); - } - - private: - const double factor_; -}; - -// A helper class that emits rankings for the "rank" function -struct OrdinalRankingsEmitter : public RankingsEmitter { - explicit OrdinalRankingsEmitter(RankOptions::Tiebreaker tiebreaker) - : tiebreaker_(tiebreaker) {} - - bool NeedsDuplicates() override { return tiebreaker_ != RankOptions::First; } - - Result CreateRankings(ExecContext* ctx, - const NullPartitionResult& sorted) override { - const int64_t length = sorted.overall_end() - sorted.overall_begin(); - ARROW_ASSIGN_OR_RAISE(auto rankings, - MakeMutableUInt64Array(length, ctx->memory_pool())); - auto out_begin = rankings->GetMutableValues(1); - uint64_t rank; - - auto is_duplicate = [](uint64_t index) { return (index & kDuplicateMask) != 0; }; - auto original_index = [](uint64_t index) { return index & ~kDuplicateMask; }; - - switch (tiebreaker_) { - case RankOptions::Dense: { - rank = 0; - for (auto it = sorted.overall_begin(); it < sorted.overall_end(); ++it) { - if (!is_duplicate(*it)) { - ++rank; - } - out_begin[original_index(*it)] = rank; - } - break; - } - - case RankOptions::First: { - rank = 0; - for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) { - // No duplicate marks expected for RankOptions::First - DCHECK(!is_duplicate(*it)); - out_begin[*it] = ++rank; - } - break; - } - - case RankOptions::Min: { - rank = 0; - for (auto it = sorted.overall_begin(); it < sorted.overall_end(); ++it) { - if (!is_duplicate(*it)) { - rank = (it - sorted.overall_begin()) + 1; - } - out_begin[original_index(*it)] = rank; - } - break; - } - - case RankOptions::Max: { - rank = length; - for (auto it = 
sorted.overall_end() - 1; it >= sorted.overall_begin(); --it) { - out_begin[original_index(*it)] = rank; - // If the current index isn't marked as duplicate, then it's the last - // tie in a row (since we iterate in reverse order), so update rank - // for the next row of ties. - if (!is_duplicate(*it)) { - rank = it - sorted.overall_begin(); - } - } - break; - } - } - - return Datum(rankings); - } - - private: - const RankOptions::Tiebreaker tiebreaker_; -}; - const RankOptions* GetDefaultRankOptions() { static const auto kDefaultRankOptions = RankOptions::Defaults(); return &kDefaultRankOptions; @@ -196,118 +73,15 @@ const RankPercentileOptions* GetDefaultPercentileRankOptions() { return &kDefaultPercentileRankOptions; } -template -class RankerMixin : public TypeVisitor { - public: - RankerMixin(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end, - const InputType& input, const SortOrder order, - const NullPlacement null_placement, RankingsEmitter* emitter) - : TypeVisitor(), - ctx_(ctx), - indices_begin_(indices_begin), - indices_end_(indices_end), - input_(input), - order_(order), - null_placement_(null_placement), - physical_type_(GetPhysicalType(input.type())), - emitter_(emitter) {} - - Result Run() { - RETURN_NOT_OK(physical_type_->Accept(this)); - return emitter_->CreateRankings(ctx_, sorted_); - } - -#define VISIT(TYPE) \ - Status Visit(const TYPE& type) { \ - return static_cast(this)->template SortAndMarkDuplicates(); \ - } - - VISIT_SORTABLE_PHYSICAL_TYPES(VISIT) - -#undef VISIT - - protected: - ExecContext* ctx_; - uint64_t* indices_begin_; - uint64_t* indices_end_; - const InputType& input_; - const SortOrder order_; - const NullPlacement null_placement_; - const std::shared_ptr physical_type_; - RankingsEmitter* emitter_; - NullPartitionResult sorted_{}; -}; - -template -class Ranker; - -template <> -class Ranker : public RankerMixin> { - public: - using RankerMixin::RankerMixin; - - template - Status SortAndMarkDuplicates() { - using 
GetView = GetViewType; - using ArrayType = typename TypeTraits::ArrayType; - - ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_)); - - ArrayType array(input_.data()); - ARROW_ASSIGN_OR_RAISE(sorted_, - array_sorter(indices_begin_, indices_end_, array, 0, - ArraySortOptions(order_, null_placement_), ctx_)); - - if (emitter_->NeedsDuplicates()) { - auto value_selector = [&array](int64_t index) { - return GetView::LogicalValue(array.GetView(index)); - }; - MarkDuplicates(sorted_, value_selector); - } - return Status::OK(); - } -}; - -template <> -class Ranker : public RankerMixin> { - public: - template - explicit Ranker(Args&&... args) - : RankerMixin(std::forward(args)...), - physical_chunks_(GetPhysicalChunks(input_, physical_type_)) {} - - template - Status SortAndMarkDuplicates() { - if (physical_chunks_.empty()) { - return Status::OK(); - } - ARROW_ASSIGN_OR_RAISE( - sorted_, SortChunkedArray(ctx_, indices_begin_, indices_end_, physical_type_, - physical_chunks_, order_, null_placement_)); - if (emitter_->NeedsDuplicates()) { - const auto arrays = GetArrayPointers(physical_chunks_); - auto value_selector = [resolver = - ChunkedArrayResolver(span(arrays))](int64_t index) { - return resolver.Resolve(index).Value(); - }; - MarkDuplicates(sorted_, value_selector); - } - return Status::OK(); - } +template +Result DoSortAndMarkDuplicate( + ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end, const Array& input, + const std::shared_ptr& physical_type, const SortOrder order, + const NullPlacement null_placement, bool needs_duplicates) { + using GetView = GetViewType; + using ArrayType = typename TypeTraits::ArrayType; - private: - const ArrayVector physical_chunks_; -}; - -template -Result SortAndMarkDup(const Array& input, uint64_t* indices_begin, - uint64_t* indices_end, SortOrder order, - NullPlacement null_placement, - bool needs_duplicates, ExecContext* ctx) { - using GetView = GetViewType; - using ArrayType = typename 
TypeTraits::ArrayType; - - ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*input.type())); + ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type)); ArrayType array(input.data()); ARROW_ASSIGN_OR_RAISE(auto sorted, @@ -323,12 +97,11 @@ Result SortAndMarkDup(const Array& input, uint64_t* indices return sorted; } -template -Result SortAndMarkDup(const ChunkedArray& input, - uint64_t* indices_begin, uint64_t* indices_end, - SortOrder order, NullPlacement null_placement, - bool needs_duplicates, ExecContext* ctx) { - auto physical_type = GetPhysicalType(input.type()); +template +Result DoSortAndMarkDuplicate( + ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end, + const ChunkedArray& input, const std::shared_ptr& physical_type, + const SortOrder order, const NullPlacement null_placement, bool needs_duplicates) { auto physical_chunks = GetPhysicalChunks(input, physical_type); if (physical_chunks.empty()) { return NullPartitionResult{}; @@ -339,13 +112,59 @@ Result SortAndMarkDup(const ChunkedArray& input, if (needs_duplicates) { const auto arrays = GetArrayPointers(physical_chunks); auto value_selector = [resolver = ChunkedArrayResolver(span(arrays))](int64_t index) { - return resolver.Resolve(index).Value(); + return resolver.Resolve(index).Value(); }; MarkDuplicates(sorted, value_selector); } return sorted; } +template +class SortAndMarkDuplicate : public TypeVisitor { + public: + SortAndMarkDuplicate(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end, + const InputType& input, const SortOrder order, + const NullPlacement null_placement, const bool needs_duplicate) + : TypeVisitor(), + ctx_(ctx), + indices_begin_(indices_begin), + indices_end_(indices_end), + input_(input), + order_(order), + null_placement_(null_placement), + needs_duplicates_(needs_duplicate), + physical_type_(GetPhysicalType(input.type())) {} + + Result Run() { + RETURN_NOT_OK(physical_type_->Accept(this)); + return std::move(sorted_); + } + 
+#define VISIT(TYPE) \ + Status Visit(const TYPE& type) { \ + ARROW_ASSIGN_OR_RAISE( \ + sorted_, DoSortAndMarkDuplicate(ctx_, indices_begin_, indices_end_, \ + input_, physical_type_, order_, \ + null_placement_, needs_duplicates_)); \ + return Status::OK(); \ + } + + VISIT_SORTABLE_PHYSICAL_TYPES(VISIT) + +#undef VISIT + + private: + ExecContext* ctx_; + uint64_t* indices_begin_; + uint64_t* indices_end_; + const InputType& input_; + const SortOrder order_; + const NullPlacement null_placement_; + const bool needs_duplicates_; + const std::shared_ptr physical_type_; + NullPartitionResult sorted_{}; +}; + struct PercentileRanker { explicit PercentileRanker(double factor) : factor_(factor) {} @@ -504,24 +323,16 @@ class RankMetaFunctionBase : public MetaFunction { } protected: - struct UnpackedOptions { - SortOrder order{SortOrder::Ascending}; - NullPlacement null_placement; - std::unique_ptr emitter; - }; - - virtual UnpackedOptions UnpackOptions(const FunctionOptions&) const = 0; - template Result Rank(const T& input, const FunctionOptions& function_options, ExecContext* ctx) const { const auto& options = checked_cast(function_options); - // SortOrder order = SortOrder::Ascending; - // if (!options.sort_keys.empty()) { - // order = options.sort_keys[0].order; - // } + SortOrder order = SortOrder::Ascending; + if (!options.sort_keys.empty()) { + order = options.sort_keys[0].order; + } int64_t length = input.length(); ARROW_ASSIGN_OR_RAISE(auto indices, @@ -529,15 +340,13 @@ class RankMetaFunctionBase : public MetaFunction { auto* indices_begin = indices->GetMutableValues(1); auto* indices_end = indices_begin + length; std::iota(indices_begin, indices_end, 0); + auto needs_duplicates = Impl::NeedsDuplicates(options); + ARROW_ASSIGN_OR_RAISE( + auto sorted, SortAndMarkDuplicate(ctx, indices_begin, indices_end, input, order, + options.null_placement, needs_duplicates) + .Run()); - // auto needs_duplicates = static_cast(this)->NeedsDuplicates(options); - // 
ARROW_ASSIGN_OR_RAISE(auto sorted, - // SortAndMarkDup(input, indices_begin, indices_end, order, - // options.null_placement, needs_duplicates, - // ctx)); - NullPartitionResult sorted; - auto ranker = static_cast(this)->GetRanker(options); - + auto ranker = Impl::GetRanker(options); return ranker.CreateRankings(ctx, sorted); } }; @@ -547,55 +356,33 @@ class RankMetaFunction : public RankMetaFunctionBase { using FunctionOptionsType = RankOptions; using RankerType = OrdinalRanker; - RankMetaFunction() - : RankMetaFunctionBase("rank", Arity::Unary(), rank_doc, GetDefaultRankOptions()) {} - - bool NeedsDuplicates(const RankOptions& options) const { + static bool NeedsDuplicates(const RankOptions& options) { return options.tiebreaker != RankOptions::First; } - RankerType GetRanker(const RankOptions& options) const { + static RankerType GetRanker(const RankOptions& options) { return RankerType(options.tiebreaker); } - protected: - UnpackedOptions UnpackOptions(const FunctionOptions& function_options) const override { - const auto& options = checked_cast(function_options); - UnpackedOptions unpacked{ - SortOrder::Ascending, options.null_placement, - std::make_unique(options.tiebreaker)}; - if (!options.sort_keys.empty()) { - unpacked.order = options.sort_keys[0].order; - } - return unpacked; - } + RankMetaFunction() + : RankMetaFunctionBase("rank", Arity::Unary(), rank_doc, GetDefaultRankOptions()) {} }; -class RankPercentileMetaFunction : public RankMetaFunctionBase { +class RankPercentileMetaFunction + : public RankMetaFunctionBase { public: using FunctionOptionsType = RankPercentileOptions; using RankerType = PercentileRanker; - RankPercentileMetaFunction() - : RankMetaFunctionBase("rank_percentile", Arity::Unary(), rank_percentile_doc, - GetDefaultPercentileRankOptions()) {} - - bool NeedsDuplicates(const RankPercentileOptions&) const { return true; } + static bool NeedsDuplicates(const RankPercentileOptions&) { return true; } - RankerType GetRanker(const 
RankPercentileOptions& options) const { + static RankerType GetRanker(const RankPercentileOptions& options) { return RankerType(options.factor); } - protected: - UnpackedOptions UnpackOptions(const FunctionOptions& function_options) const override { - const auto& options = checked_cast(function_options); - UnpackedOptions unpacked{SortOrder::Ascending, options.null_placement, - std::make_unique(options.factor)}; - if (!options.sort_keys.empty()) { - unpacked.order = options.sort_keys[0].order; - } - return unpacked; - } + RankPercentileMetaFunction() + : RankMetaFunctionBase("rank_percentile", Arity::Unary(), rank_percentile_doc, + GetDefaultPercentileRankOptions()) {} }; } // namespace From 628efe5b1a876282c84adce1cbb47fe7279904d2 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Mon, 20 Jan 2025 17:30:03 +0800 Subject: [PATCH 4/8] Address review comments --- cpp/src/arrow/compute/kernels/vector_rank.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_rank.cc b/cpp/src/arrow/compute/kernels/vector_rank.cc index 6af7420626c..e0069a1f2c4 100644 --- a/cpp/src/arrow/compute/kernels/vector_rank.cc +++ b/cpp/src/arrow/compute/kernels/vector_rank.cc @@ -137,7 +137,7 @@ class SortAndMarkDuplicate : public TypeVisitor { Result Run() { RETURN_NOT_OK(physical_type_->Accept(this)); - return std::move(sorted_); + return sorted_; } #define VISIT(TYPE) \ @@ -165,6 +165,7 @@ class SortAndMarkDuplicate : public TypeVisitor { NullPartitionResult sorted_{}; }; +// A helper class that emits rankings for the "rank_percentile" function struct PercentileRanker { explicit PercentileRanker(double factor) : factor_(factor) {} @@ -298,7 +299,7 @@ const FunctionDoc rank_percentile_doc( "in RankPercentileOptions."), {"input"}, "RankPercentileOptions"); -template +template class RankMetaFunctionBase : public MetaFunction { public: using MetaFunction::MetaFunction; @@ -327,7 +328,7 @@ class RankMetaFunctionBase : public MetaFunction 
{ Result Rank(const T& input, const FunctionOptions& function_options, ExecContext* ctx) const { const auto& options = - checked_cast(function_options); + checked_cast(function_options); SortOrder order = SortOrder::Ascending; if (!options.sort_keys.empty()) { @@ -340,13 +341,13 @@ class RankMetaFunctionBase : public MetaFunction { auto* indices_begin = indices->GetMutableValues(1); auto* indices_end = indices_begin + length; std::iota(indices_begin, indices_end, 0); - auto needs_duplicates = Impl::NeedsDuplicates(options); + auto needs_duplicates = Derived::NeedsDuplicates(options); ARROW_ASSIGN_OR_RAISE( auto sorted, SortAndMarkDuplicate(ctx, indices_begin, indices_end, input, order, options.null_placement, needs_duplicates) .Run()); - auto ranker = Impl::GetRanker(options); + auto ranker = Derived::GetRanker(options); return ranker.CreateRankings(ctx, sorted); } }; From 1408bfa4025f00b4277a614af449a4a2ea9a08d3 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 20 Jan 2025 16:39:28 +0100 Subject: [PATCH 5/8] Rename to rank_quantile --- cpp/src/arrow/compute/api_vector.cc | 18 +-- cpp/src/arrow/compute/api_vector.h | 22 +-- cpp/src/arrow/compute/kernels/vector_rank.cc | 50 +++--- .../arrow/compute/kernels/vector_sort_test.cc | 153 +++++++++--------- docs/source/cpp/compute.rst | 32 ++-- 5 files changed, 136 insertions(+), 139 deletions(-) diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index 54e04298b6a..75d1bc3799f 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -48,7 +48,7 @@ using compute::DictionaryEncodeOptions; using compute::FilterOptions; using compute::NullPlacement; using compute::RankOptions; -using compute::RankPercentileOptions; +using compute::RankQuantileOptions; template <> struct EnumTraits @@ -152,10 +152,10 @@ static auto kRankOptionsType = GetFunctionOptionsType( DataMember("sort_keys", &RankOptions::sort_keys), DataMember("null_placement", 
&RankOptions::null_placement), DataMember("tiebreaker", &RankOptions::tiebreaker)); -static auto kRankPercentileOptionsType = GetFunctionOptionsType( - DataMember("sort_keys", &RankPercentileOptions::sort_keys), - DataMember("null_placement", &RankPercentileOptions::null_placement), - DataMember("factor", &RankPercentileOptions::factor)); +static auto kRankQuantileOptionsType = GetFunctionOptionsType( + DataMember("sort_keys", &RankQuantileOptions::sort_keys), + DataMember("null_placement", &RankQuantileOptions::null_placement), + DataMember("factor", &RankQuantileOptions::factor)); static auto kPairwiseOptionsType = GetFunctionOptionsType( DataMember("periods", &PairwiseOptions::periods)); static auto kListFlattenOptionsType = GetFunctionOptionsType( @@ -233,13 +233,13 @@ RankOptions::RankOptions(std::vector sort_keys, NullPlacement null_plac tiebreaker(tiebreaker) {} constexpr char RankOptions::kTypeName[]; -RankPercentileOptions::RankPercentileOptions(std::vector sort_keys, - NullPlacement null_placement, double factor) - : FunctionOptions(internal::kRankPercentileOptionsType), +RankQuantileOptions::RankQuantileOptions(std::vector sort_keys, + NullPlacement null_placement, double factor) + : FunctionOptions(internal::kRankQuantileOptionsType), sort_keys(std::move(sort_keys)), null_placement(null_placement), factor(factor) {} -constexpr char RankPercentileOptions::kTypeName[]; +constexpr char RankQuantileOptions::kTypeName[]; PairwiseOptions::PairwiseOptions(int64_t periods) : FunctionOptions(internal::kPairwiseOptionsType), periods(periods) {} diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 5fd8241e9bd..99a1603db29 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -195,20 +195,20 @@ class ARROW_EXPORT RankOptions : public FunctionOptions { Tiebreaker tiebreaker; }; -/// \brief Percentile rank options -class ARROW_EXPORT RankPercentileOptions : public FunctionOptions { +/// 
\brief Quantile rank options +class ARROW_EXPORT RankQuantileOptions : public FunctionOptions { public: - explicit RankPercentileOptions(std::vector sort_keys = {}, - NullPlacement null_placement = NullPlacement::AtEnd, - double factor = 1.0); + explicit RankQuantileOptions(std::vector sort_keys = {}, + NullPlacement null_placement = NullPlacement::AtEnd, + double factor = 1.0); /// Convenience constructor for array inputs - explicit RankPercentileOptions(SortOrder order, - NullPlacement null_placement = NullPlacement::AtEnd, - double factor = 1.0) - : RankPercentileOptions({SortKey("", order)}, null_placement, factor) {} + explicit RankQuantileOptions(SortOrder order, + NullPlacement null_placement = NullPlacement::AtEnd, + double factor = 1.0) + : RankQuantileOptions({SortKey("", order)}, null_placement, factor) {} - static constexpr char const kTypeName[] = "RankPercentileOptions"; - static RankPercentileOptions Defaults() { return RankPercentileOptions(); } + static constexpr char const kTypeName[] = "RankQuantileOptions"; + static RankQuantileOptions Defaults() { return RankQuantileOptions(); } /// Column key(s) to order by and how to order by these sort keys. 
std::vector sort_keys; diff --git a/cpp/src/arrow/compute/kernels/vector_rank.cc b/cpp/src/arrow/compute/kernels/vector_rank.cc index e0069a1f2c4..da785eaf2dd 100644 --- a/cpp/src/arrow/compute/kernels/vector_rank.cc +++ b/cpp/src/arrow/compute/kernels/vector_rank.cc @@ -21,7 +21,6 @@ #include "arrow/compute/function.h" #include "arrow/compute/kernels/vector_sort_internal.h" #include "arrow/compute/registry.h" -#include "arrow/util/logging.h" namespace arrow::compute::internal { @@ -68,9 +67,9 @@ const RankOptions* GetDefaultRankOptions() { return &kDefaultRankOptions; } -const RankPercentileOptions* GetDefaultPercentileRankOptions() { - static const auto kDefaultPercentileRankOptions = RankPercentileOptions::Defaults(); - return &kDefaultPercentileRankOptions; +const RankQuantileOptions* GetDefaultQuantileRankOptions() { + static const auto kDefaultQuantileRankOptions = RankQuantileOptions::Defaults(); + return &kDefaultQuantileRankOptions; } template @@ -165,9 +164,9 @@ class SortAndMarkDuplicate : public TypeVisitor { NullPartitionResult sorted_{}; }; -// A helper class that emits rankings for the "rank_percentile" function -struct PercentileRanker { - explicit PercentileRanker(double factor) : factor_(factor) {} +// A helper class that emits rankings for the "rank_quantile" function +struct QuantileRanker { + explicit QuantileRanker(double factor) : factor_(factor) {} Result CreateRankings(ExecContext* ctx, const NullPartitionResult& sorted) { const int64_t length = sorted.overall_end() - sorted.overall_begin(); @@ -190,10 +189,10 @@ struct PercentileRanker { } // The run length, i.e. 
the frequency of the current value int64_t freq = run_end - it; - double percentile = (cum_freq + 0.5 * freq) * factor_ / static_cast(length); - // Output percentile rank values + double quantile = (cum_freq + 0.5 * freq) * factor_ / static_cast(length); + // Output quantile rank values for (; it < run_end; ++it) { - out_begin[original_index(*it)] = percentile; + out_begin[original_index(*it)] = quantile; } cum_freq += freq; } @@ -286,18 +285,18 @@ const FunctionDoc rank_doc( "The handling of nulls, NaNs and tiebreakers can be changed in RankOptions."), {"input"}, "RankOptions"); -const FunctionDoc rank_percentile_doc( - "Compute percentile ranks of an array", - ("This function computes a percentile rank of the input array.\n" +const FunctionDoc rank_quantile_doc( + "Compute quantile ranks of an array", + ("This function computes a quantile rank of the input array.\n" "By default, null values are considered greater than any other value and\n" "are therefore sorted at the end of the input. 
For floating-point types,\n" "NaNs are considered greater than any other non-null value, but smaller\n" "than null values.\n" - "Results are computed as in https://en.wikipedia.org/wiki/Percentile_rank\n" + "Results are computed as in https://en.wikipedia.org/wiki/Quantile_rank\n" "\n" "The handling of nulls and NaNs, and the constant factor can be changed\n" - "in RankPercentileOptions."), - {"input"}, "RankPercentileOptions"); + "in RankQuantileOptions."), + {"input"}, "RankQuantileOptions"); template class RankMetaFunctionBase : public MetaFunction { @@ -369,28 +368,27 @@ class RankMetaFunction : public RankMetaFunctionBase { : RankMetaFunctionBase("rank", Arity::Unary(), rank_doc, GetDefaultRankOptions()) {} }; -class RankPercentileMetaFunction - : public RankMetaFunctionBase { +class RankQuantileMetaFunction : public RankMetaFunctionBase { public: - using FunctionOptionsType = RankPercentileOptions; - using RankerType = PercentileRanker; + using FunctionOptionsType = RankQuantileOptions; + using RankerType = QuantileRanker; - static bool NeedsDuplicates(const RankPercentileOptions&) { return true; } + static bool NeedsDuplicates(const RankQuantileOptions&) { return true; } - static RankerType GetRanker(const RankPercentileOptions& options) { + static RankerType GetRanker(const RankQuantileOptions& options) { return RankerType(options.factor); } - RankPercentileMetaFunction() - : RankMetaFunctionBase("rank_percentile", Arity::Unary(), rank_percentile_doc, - GetDefaultPercentileRankOptions()) {} + RankQuantileMetaFunction() + : RankMetaFunctionBase("rank_quantile", Arity::Unary(), rank_quantile_doc, + GetDefaultQuantileRankOptions()) {} }; } // namespace void RegisterVectorRank(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::make_shared())); - DCHECK_OK(registry->AddFunction(std::make_shared())); + DCHECK_OK(registry->AddFunction(std::make_shared())); } } // namespace arrow::compute::internal diff --git 
a/cpp/src/arrow/compute/kernels/vector_sort_test.cc b/cpp/src/arrow/compute/kernels/vector_sort_test.cc index 15ee4c013f7..c59fb42d5f9 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort_test.cc @@ -2205,7 +2205,7 @@ TEST_F(TestNestedSortIndices, SortRecordBatch) { TestSort(GetRecordBatch()); } TEST_F(TestNestedSortIndices, SortTable) { TestSort(GetTable()); } // ---------------------------------------------------------------------- -// Tests for Rank and Percentile Rank +// Tests for Rank and Quantile Rank class BaseTestRank : public ::testing::Test { protected: @@ -2469,151 +2469,150 @@ TEST_F(TestRank, EmptyChunks) { } } -class TestRankPercentile : public BaseTestRank { +class TestRankQuantile : public BaseTestRank { public: - void AssertRankPercentile(const DatumVector& datums, SortOrder order, - NullPlacement null_placement, double factor, - const std::shared_ptr& expected) { + void AssertRankQuantile(const DatumVector& datums, SortOrder order, + NullPlacement null_placement, double factor, + const std::shared_ptr& expected) { const std::vector sort_keys{SortKey("foo", order)}; - RankPercentileOptions options(sort_keys, null_placement, factor); + RankQuantileOptions options(sort_keys, null_placement, factor); ARROW_SCOPED_TRACE("options = ", options.ToString()); for (const auto& datum : datums) { - ASSERT_OK_AND_ASSIGN(auto actual, - CallFunction("rank_percentile", {datum}, &options)); + ASSERT_OK_AND_ASSIGN(auto actual, CallFunction("rank_quantile", {datum}, &options)); ValidateOutput(actual); AssertDatumsEqual(expected, actual, /*verbose=*/true); } } - void AssertRankPercentile(const DatumVector& datums, SortOrder order, - NullPlacement null_placement, double factor, - const std::string& expected) { - AssertRankPercentile(datums, order, null_placement, factor, - ArrayFromJSON(float64(), expected)); + void AssertRankQuantile(const DatumVector& datums, SortOrder order, + NullPlacement null_placement, 
double factor, + const std::string& expected) { + AssertRankQuantile(datums, order, null_placement, factor, + ArrayFromJSON(float64(), expected)); } - void AssertRankPercentile(SortOrder order, NullPlacement null_placement, double factor, - const std::shared_ptr& expected) { - AssertRankPercentile(datums_, order, null_placement, factor, expected); + void AssertRankQuantile(SortOrder order, NullPlacement null_placement, double factor, + const std::shared_ptr& expected) { + AssertRankQuantile(datums_, order, null_placement, factor, expected); } - void AssertRankPercentile(SortOrder order, NullPlacement null_placement, double factor, - const std::string& expected) { - AssertRankPercentile(datums_, order, null_placement, factor, - ArrayFromJSON(float64(), expected)); + void AssertRankQuantile(SortOrder order, NullPlacement null_placement, double factor, + const std::string& expected) { + AssertRankQuantile(datums_, order, null_placement, factor, + ArrayFromJSON(float64(), expected)); } - void AssertRankPercentileEmpty(std::shared_ptr type) { + void AssertRankQuantileEmpty(std::shared_ptr type) { for (auto null_placement : AllNullPlacements()) { for (auto order : AllOrders()) { - AssertRankPercentile({ArrayFromJSON(type, "[]")}, order, null_placement, - /*factor=*/1.0, "[]"); - AssertRankPercentile({ArrayFromJSON(type, "[null]")}, order, null_placement, - /*factor=*/1.0, "[0.5]"); - AssertRankPercentile({ArrayFromJSON(type, "[null]")}, order, null_placement, - /*factor=*/10.0, "[5]"); - AssertRankPercentile({ArrayFromJSON(type, "[null, null, null]")}, order, - null_placement, /*factor=*/1.0, "[0.5, 0.5, 0.5]"); - AssertRankPercentile({ArrayFromJSON(type, "[null, null, null]")}, order, - null_placement, /*factor=*/100.0, "[50, 50, 50]"); + AssertRankQuantile({ArrayFromJSON(type, "[]")}, order, null_placement, + /*factor=*/1.0, "[]"); + AssertRankQuantile({ArrayFromJSON(type, "[null]")}, order, null_placement, + /*factor=*/1.0, "[0.5]"); + 
AssertRankQuantile({ArrayFromJSON(type, "[null]")}, order, null_placement, + /*factor=*/10.0, "[5]"); + AssertRankQuantile({ArrayFromJSON(type, "[null, null, null]")}, order, + null_placement, /*factor=*/1.0, "[0.5, 0.5, 0.5]"); + AssertRankQuantile({ArrayFromJSON(type, "[null, null, null]")}, order, + null_placement, /*factor=*/100.0, "[50, 50, 50]"); } } } // Expecting an input ordered like [1, 2, 1, 2, 1] - void AssertRankPercentile_12121() { + void AssertRankQuantile_12121() { for (auto null_placement : AllNullPlacements()) { - AssertRankPercentile(SortOrder::Ascending, null_placement, 100.0, - "[30.0, 80.0, 30.0, 80.0, 30.0]"); - AssertRankPercentile(SortOrder::Descending, null_placement, 100.0, - "[70.0, 20.0, 70.0, 20.0, 70.0]"); + AssertRankQuantile(SortOrder::Ascending, null_placement, 100.0, + "[30.0, 80.0, 30.0, 80.0, 30.0]"); + AssertRankQuantile(SortOrder::Descending, null_placement, 100.0, + "[70.0, 20.0, 70.0, 20.0, 70.0]"); } } // Expecting an input ordered like [null, 1, null, 2, null] - void AssertRankPercentile_N1N2N() { - AssertRankPercentile(SortOrder::Ascending, NullPlacement::AtStart, 1.0, - "[0.3, 0.7, 0.3, 0.9, 0.3]"); - AssertRankPercentile(SortOrder::Ascending, NullPlacement::AtEnd, 1.0, - "[0.7, 0.1, 0.7, 0.3, 0.7]"); - AssertRankPercentile(SortOrder::Descending, NullPlacement::AtStart, 1.0, - "[0.3, 0.9, 0.3, 0.7, 0.3]"); - AssertRankPercentile(SortOrder::Descending, NullPlacement::AtEnd, 1.0, - "[0.7, 0.3, 0.7, 0.1, 0.7]"); - } - - void AssertRankPercentileNumeric(std::shared_ptr type) { + void AssertRankQuantile_N1N2N() { + AssertRankQuantile(SortOrder::Ascending, NullPlacement::AtStart, 1.0, + "[0.3, 0.7, 0.3, 0.9, 0.3]"); + AssertRankQuantile(SortOrder::Ascending, NullPlacement::AtEnd, 1.0, + "[0.7, 0.1, 0.7, 0.3, 0.7]"); + AssertRankQuantile(SortOrder::Descending, NullPlacement::AtStart, 1.0, + "[0.3, 0.9, 0.3, 0.7, 0.3]"); + AssertRankQuantile(SortOrder::Descending, NullPlacement::AtEnd, 1.0, + "[0.7, 0.3, 0.7, 0.1, 0.7]"); + } + 
+ void AssertRankQuantileNumeric(std::shared_ptr type) { ARROW_SCOPED_TRACE("type = ", type->ToString()); - AssertRankPercentileEmpty(type); + AssertRankQuantileEmpty(type); - // Reproduce the example from https://en.wikipedia.org/wiki/Percentile_rank + // Reproduce the example from https://en.wikipedia.org/wiki/Quantile_rank SetInput(ArrayFromJSON(type, "[7, 5, 5, 4, 4, 3, 3, 3, 2, 1]")); for (auto null_placement : AllNullPlacements()) { - AssertRankPercentile(SortOrder::Ascending, null_placement, 10.0, - "[9.5, 8.0, 8.0, 6.0, 6.0, 3.5, 3.5, 3.5, 1.5, 0.5]"); - AssertRankPercentile(SortOrder::Ascending, null_placement, 100.0, - "[95, 80, 80, 60, 60, 35, 35, 35, 15, 5]"); - AssertRankPercentile(SortOrder::Descending, null_placement, 10.0, - "[0.5, 2.0, 2.0, 4.0, 4.0, 6.5, 6.5, 6.5, 8.5, 9.5]"); - AssertRankPercentile(SortOrder::Descending, null_placement, 100.0, - "[5, 20, 20, 40, 40, 65, 65, 65, 85, 95]"); + AssertRankQuantile(SortOrder::Ascending, null_placement, 10.0, + "[9.5, 8.0, 8.0, 6.0, 6.0, 3.5, 3.5, 3.5, 1.5, 0.5]"); + AssertRankQuantile(SortOrder::Ascending, null_placement, 100.0, + "[95, 80, 80, 60, 60, 35, 35, 35, 15, 5]"); + AssertRankQuantile(SortOrder::Descending, null_placement, 10.0, + "[0.5, 2.0, 2.0, 4.0, 4.0, 6.5, 6.5, 6.5, 8.5, 9.5]"); + AssertRankQuantile(SortOrder::Descending, null_placement, 100.0, + "[5, 20, 20, 40, 40, 65, 65, 65, 85, 95]"); } // With nulls SetInput(ArrayFromJSON(type, "[null, 1, null, 2, null]")); - AssertRankPercentile_N1N2N(); + AssertRankQuantile_N1N2N(); } - void AssertRankPercentileBinaryLike(std::shared_ptr type) { + void AssertRankQuantileBinaryLike(std::shared_ptr type) { ARROW_SCOPED_TRACE("type = ", type->ToString()); - AssertRankPercentileEmpty(type); + AssertRankQuantileEmpty(type); SetInput(ArrayFromJSON(type, R"(["", "ab", "", "ab", ""])")); - AssertRankPercentile_12121(); + AssertRankQuantile_12121(); // With nulls SetInput(ArrayFromJSON(type, R"([null, "", null, "ab", null])")); - 
AssertRankPercentile_N1N2N(); + AssertRankQuantile_N1N2N(); } }; -TEST_F(TestRankPercentile, Real) { +TEST_F(TestRankQuantile, Real) { for (auto type : ::arrow::FloatingPointTypes()) { - AssertRankPercentileNumeric(type); + AssertRankQuantileNumeric(type); } } -TEST_F(TestRankPercentile, Integral) { +TEST_F(TestRankQuantile, Integral) { for (auto type : ::arrow::IntTypes()) { - AssertRankPercentileNumeric(type); + AssertRankQuantileNumeric(type); } } -TEST_F(TestRankPercentile, Boolean) { +TEST_F(TestRankQuantile, Boolean) { auto type = boolean(); - AssertRankPercentileEmpty(type); + AssertRankQuantileEmpty(type); SetInput(ArrayFromJSON(type, "[false, true, false, true, false]")); - AssertRankPercentile_12121(); + AssertRankQuantile_12121(); // With nulls SetInput(ArrayFromJSON(type, "[null, false, null, true, null]")); - AssertRankPercentile_N1N2N(); + AssertRankQuantile_N1N2N(); } -TEST_F(TestRankPercentile, BinaryLike) { +TEST_F(TestRankQuantile, BinaryLike) { for (auto type : BaseBinaryTypes()) { - AssertRankPercentileBinaryLike(type); + AssertRankQuantileBinaryLike(type); } } -TEST_F(TestRankPercentile, FixedSizeBinary) { +TEST_F(TestRankQuantile, FixedSizeBinary) { auto type = fixed_size_binary(3); - AssertRankPercentileEmpty(type); + AssertRankQuantileEmpty(type); SetInput(ArrayFromJSON(type, R"(["abc", "def", "abc", "def", "abc"])")); - AssertRankPercentile_12121(); + AssertRankQuantile_12121(); // With nulls SetInput(ArrayFromJSON(type, R"([null, "abc", null, "def", null])")); - AssertRankPercentile_N1N2N(); + AssertRankQuantile_N1N2N(); } } // namespace compute diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 64fda25851d..838ebede2a6 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -1796,21 +1796,21 @@ in the respective option classes. Binary- and String-like inputs are ordered lexicographically as bytestrings, even for String types. 
-+-----------------------+------------+---------------------------------------------------------+-------------------+---------------------------------+----------------+ -| Function name | Arity | Input types | Output type | Options class | Notes | -+=======================+============+=========================================================+===================+=================================+================+ -| array_sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`ArraySortOptions` | \(1) \(2) | -+-----------------------+------------+---------------------------------------------------------+-------------------+---------------------------------+----------------+ -| partition_nth_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`PartitionNthOptions` | \(3) | -+-----------------------+------------+---------------------------------------------------------+-------------------+---------------------------------+----------------+ -| rank | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`RankOptions` | \(4) | -+-----------------------+------------+---------------------------------------------------------+-------------------+---------------------------------+----------------+ -| rank_percentile | Unary | Boolean, Numeric, Temporal, Binary- and String-like | Float64 | :struct:`RankPercentileOptions` | \(5) | -+-----------------------+------------+---------------------------------------------------------+-------------------+---------------------------------+----------------+ -| select_k_unstable | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`SelectKOptions` | \(6) \(7) | -+-----------------------+------------+---------------------------------------------------------+-------------------+---------------------------------+----------------+ -| sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | 
UInt64 | :struct:`SortOptions` | \(1) \(6) | -+-----------------------+------------+---------------------------------------------------------+-------------------+---------------------------------+----------------+ ++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++=======================+============+=========================================================+===================+===============================+================+ +| array_sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`ArraySortOptions` | \(1) \(2) | ++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+ +| partition_nth_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`PartitionNthOptions` | \(3) | ++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+ +| rank | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`RankOptions` | \(4) | ++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+ +| rank_quantile | Unary | Boolean, Numeric, Temporal, Binary- and String-like | Float64 | :struct:`RankQuantileOptions` | \(5) | ++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+ +| select_k_unstable | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`SelectKOptions` | \(6) \(7) | 
++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+ +| sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`SortOptions` | \(1) \(6) | ++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+ * \(1) The output is an array of indices into the input, that define a @@ -1828,7 +1828,7 @@ in the respective option classes. * \(4) The output is a one-based numerical array of ranks. * \(5) The output is an array of quantiles between 0 and a constant *factor*. - The *factor* can be configured in :class:`RankPercentileOptions` + The *factor* can be configured in :class:`RankQuantileOptions` (use 100.0 for a percentile rank). * \(6) The input can be an array, chunked array, record batch or From 30db71ad8cce5ff9db50412c6c5cc4955e1b03aa Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 21 Jan 2025 10:10:17 +0100 Subject: [PATCH 6/8] Fix bad search/replace Co-authored-by: Rossi Sun --- cpp/src/arrow/compute/kernels/vector_sort_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/vector_sort_test.cc b/cpp/src/arrow/compute/kernels/vector_sort_test.cc index c59fb42d5f9..1292e13ea67 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort_test.cc @@ -2545,7 +2545,7 @@ class TestRankQuantile : public BaseTestRank { ARROW_SCOPED_TRACE("type = ", type->ToString()); AssertRankQuantileEmpty(type); - // Reproduce the example from https://en.wikipedia.org/wiki/Quantile_rank + // Reproduce the example from https://en.wikipedia.org/wiki/Percentile_rank SetInput(ArrayFromJSON(type, "[7, 5, 5, 4, 4, 3, 3, 3, 2, 1]")); for (auto null_placement : AllNullPlacements()) { AssertRankQuantile(SortOrder::Ascending, 
null_placement, 10.0, From 0ad0ff007608567b1710c5456984763de8dd87b4 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 21 Jan 2025 10:17:38 +0100 Subject: [PATCH 7/8] Pass factor explicitly --- .../arrow/compute/kernels/vector_sort_test.cc | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_sort_test.cc b/cpp/src/arrow/compute/kernels/vector_sort_test.cc index 1292e13ea67..a0f86b49d1c 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort_test.cc @@ -2522,22 +2522,22 @@ class TestRankQuantile : public BaseTestRank { // Expecting an input ordered like [1, 2, 1, 2, 1] void AssertRankQuantile_12121() { for (auto null_placement : AllNullPlacements()) { - AssertRankQuantile(SortOrder::Ascending, null_placement, 100.0, + AssertRankQuantile(SortOrder::Ascending, null_placement, /*factor=*/100.0, "[30.0, 80.0, 30.0, 80.0, 30.0]"); - AssertRankQuantile(SortOrder::Descending, null_placement, 100.0, + AssertRankQuantile(SortOrder::Descending, null_placement, /*factor=*/100.0, "[70.0, 20.0, 70.0, 20.0, 70.0]"); } } // Expecting an input ordered like [null, 1, null, 2, null] void AssertRankQuantile_N1N2N() { - AssertRankQuantile(SortOrder::Ascending, NullPlacement::AtStart, 1.0, + AssertRankQuantile(SortOrder::Ascending, NullPlacement::AtStart, /*factor=*/1.0, "[0.3, 0.7, 0.3, 0.9, 0.3]"); - AssertRankQuantile(SortOrder::Ascending, NullPlacement::AtEnd, 1.0, + AssertRankQuantile(SortOrder::Ascending, NullPlacement::AtEnd, /*factor=*/1.0, "[0.7, 0.1, 0.7, 0.3, 0.7]"); - AssertRankQuantile(SortOrder::Descending, NullPlacement::AtStart, 1.0, + AssertRankQuantile(SortOrder::Descending, NullPlacement::AtStart, /*factor=*/1.0, "[0.3, 0.9, 0.3, 0.7, 0.3]"); - AssertRankQuantile(SortOrder::Descending, NullPlacement::AtEnd, 1.0, + AssertRankQuantile(SortOrder::Descending, NullPlacement::AtEnd, /*factor=*/1.0, "[0.7, 0.3, 0.7, 0.1, 0.7]"); } @@ -2548,13 
+2548,13 @@ class TestRankQuantile : public BaseTestRank { // Reproduce the example from https://en.wikipedia.org/wiki/Percentile_rank SetInput(ArrayFromJSON(type, "[7, 5, 5, 4, 4, 3, 3, 3, 2, 1]")); for (auto null_placement : AllNullPlacements()) { - AssertRankQuantile(SortOrder::Ascending, null_placement, 10.0, + AssertRankQuantile(SortOrder::Ascending, null_placement, /*factor=*/10.0, "[9.5, 8.0, 8.0, 6.0, 6.0, 3.5, 3.5, 3.5, 1.5, 0.5]"); - AssertRankQuantile(SortOrder::Ascending, null_placement, 100.0, + AssertRankQuantile(SortOrder::Ascending, null_placement, /*factor=*/100.0, "[95, 80, 80, 60, 60, 35, 35, 35, 15, 5]"); - AssertRankQuantile(SortOrder::Descending, null_placement, 10.0, + AssertRankQuantile(SortOrder::Descending, null_placement, /*factor=*/10.0, "[0.5, 2.0, 2.0, 4.0, 4.0, 6.5, 6.5, 6.5, 8.5, 9.5]"); - AssertRankQuantile(SortOrder::Descending, null_placement, 100.0, + AssertRankQuantile(SortOrder::Descending, null_placement, /*factor=*/100.0, "[5, 20, 20, 40, 40, 65, 65, 65, 85, 95]"); } From d8c7858b99319092a48f915d0606a0ba7ede76d3 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 23 Jan 2025 18:16:32 +0100 Subject: [PATCH 8/8] Remove factor option --- cpp/src/arrow/compute/api_vector.cc | 8 +-- cpp/src/arrow/compute/api_vector.h | 11 +--- cpp/src/arrow/compute/kernels/vector_rank.cc | 18 ++---- .../arrow/compute/kernels/vector_sort_test.cc | 57 ++++++++----------- docs/source/cpp/compute.rst | 4 +- 5 files changed, 36 insertions(+), 62 deletions(-) diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index 75d1bc3799f..61335de6ac0 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -154,8 +154,7 @@ static auto kRankOptionsType = GetFunctionOptionsType( DataMember("tiebreaker", &RankOptions::tiebreaker)); static auto kRankQuantileOptionsType = GetFunctionOptionsType( DataMember("sort_keys", &RankQuantileOptions::sort_keys), - DataMember("null_placement", 
&RankQuantileOptions::null_placement), - DataMember("factor", &RankQuantileOptions::factor)); + DataMember("null_placement", &RankQuantileOptions::null_placement)); static auto kPairwiseOptionsType = GetFunctionOptionsType( DataMember("periods", &PairwiseOptions::periods)); static auto kListFlattenOptionsType = GetFunctionOptionsType( @@ -234,11 +233,10 @@ RankOptions::RankOptions(std::vector sort_keys, NullPlacement null_plac constexpr char RankOptions::kTypeName[]; RankQuantileOptions::RankQuantileOptions(std::vector sort_keys, - NullPlacement null_placement, double factor) + NullPlacement null_placement) : FunctionOptions(internal::kRankQuantileOptionsType), sort_keys(std::move(sort_keys)), - null_placement(null_placement), - factor(factor) {} + null_placement(null_placement) {} constexpr char RankQuantileOptions::kTypeName[]; PairwiseOptions::PairwiseOptions(int64_t periods) diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 99a1603db29..22bb1647197 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -199,13 +199,11 @@ class ARROW_EXPORT RankOptions : public FunctionOptions { class ARROW_EXPORT RankQuantileOptions : public FunctionOptions { public: explicit RankQuantileOptions(std::vector sort_keys = {}, - NullPlacement null_placement = NullPlacement::AtEnd, - double factor = 1.0); + NullPlacement null_placement = NullPlacement::AtEnd); /// Convenience constructor for array inputs explicit RankQuantileOptions(SortOrder order, - NullPlacement null_placement = NullPlacement::AtEnd, - double factor = 1.0) - : RankQuantileOptions({SortKey("", order)}, null_placement, factor) {} + NullPlacement null_placement = NullPlacement::AtEnd) + : RankQuantileOptions({SortKey("", order)}, null_placement) {} static constexpr char const kTypeName[] = "RankQuantileOptions"; static RankQuantileOptions Defaults() { return RankQuantileOptions(); } @@ -214,9 +212,6 @@ class ARROW_EXPORT 
RankQuantileOptions : public FunctionOptions { std::vector sort_keys; /// Whether nulls and NaNs are placed at the start or at the end NullPlacement null_placement; - /// Factor to apply to the output. - /// Use 1.0 for results in (0, 1), 100.0 for percentages, etc. - double factor; }; /// \brief Partitioning options for NthToIndices diff --git a/cpp/src/arrow/compute/kernels/vector_rank.cc b/cpp/src/arrow/compute/kernels/vector_rank.cc index da785eaf2dd..2efc61c2e6c 100644 --- a/cpp/src/arrow/compute/kernels/vector_rank.cc +++ b/cpp/src/arrow/compute/kernels/vector_rank.cc @@ -166,8 +166,6 @@ class SortAndMarkDuplicate : public TypeVisitor { // A helper class that emits rankings for the "rank_quantile" function struct QuantileRanker { - explicit QuantileRanker(double factor) : factor_(factor) {} - Result CreateRankings(ExecContext* ctx, const NullPartitionResult& sorted) { const int64_t length = sorted.overall_end() - sorted.overall_begin(); ARROW_ASSIGN_OR_RAISE(auto rankings, @@ -189,7 +187,7 @@ struct QuantileRanker { } // The run length, i.e. the frequency of the current value int64_t freq = run_end - it; - double quantile = (cum_freq + 0.5 * freq) * factor_ / static_cast(length); + double quantile = (cum_freq + 0.5 * freq) / static_cast(length); // Output quantile rank values for (; it < run_end; ++it) { out_begin[original_index(*it)] = quantile; @@ -199,9 +197,6 @@ struct QuantileRanker { DCHECK_EQ(cum_freq, length); return Datum(rankings); } - - private: - const double factor_; }; // A helper class that emits rankings for the "rank" function @@ -292,10 +287,11 @@ const FunctionDoc rank_quantile_doc( "are therefore sorted at the end of the input. For floating-point types,\n" "NaNs are considered greater than any other non-null value, but smaller\n" "than null values.\n" - "Results are computed as in https://en.wikipedia.org/wiki/Quantile_rank\n" + "The results are real values strictly between 0 and 1. 
They are\n" + "computed as in https://en.wikipedia.org/wiki/Percentile_rank\n" + "but without multiplying by 100.\n" "\n" - "The handling of nulls and NaNs, and the constant factor can be changed\n" - "in RankQuantileOptions."), + "The handling of nulls and NaNs can be changed in RankQuantileOptions."), {"input"}, "RankQuantileOptions"); template @@ -375,9 +371,7 @@ class RankQuantileMetaFunction : public RankMetaFunctionBase& expected) { const std::vector sort_keys{SortKey("foo", order)}; - RankQuantileOptions options(sort_keys, null_placement, factor); + RankQuantileOptions options(sort_keys, null_placement); ARROW_SCOPED_TRACE("options = ", options.ToString()); for (const auto& datum : datums) { ASSERT_OK_AND_ASSIGN(auto actual, CallFunction("rank_quantile", {datum}, &options)); @@ -2485,36 +2485,29 @@ class TestRankQuantile : public BaseTestRank { } void AssertRankQuantile(const DatumVector& datums, SortOrder order, - NullPlacement null_placement, double factor, - const std::string& expected) { - AssertRankQuantile(datums, order, null_placement, factor, - ArrayFromJSON(float64(), expected)); + NullPlacement null_placement, const std::string& expected) { + AssertRankQuantile(datums, order, null_placement, ArrayFromJSON(float64(), expected)); } - void AssertRankQuantile(SortOrder order, NullPlacement null_placement, double factor, + void AssertRankQuantile(SortOrder order, NullPlacement null_placement, const std::shared_ptr& expected) { - AssertRankQuantile(datums_, order, null_placement, factor, expected); + AssertRankQuantile(datums_, order, null_placement, expected); } - void AssertRankQuantile(SortOrder order, NullPlacement null_placement, double factor, + void AssertRankQuantile(SortOrder order, NullPlacement null_placement, const std::string& expected) { - AssertRankQuantile(datums_, order, null_placement, factor, + AssertRankQuantile(datums_, order, null_placement, ArrayFromJSON(float64(), expected)); } void AssertRankQuantileEmpty(std::shared_ptr type) { 
for (auto null_placement : AllNullPlacements()) { for (auto order : AllOrders()) { - AssertRankQuantile({ArrayFromJSON(type, "[]")}, order, null_placement, - /*factor=*/1.0, "[]"); + AssertRankQuantile({ArrayFromJSON(type, "[]")}, order, null_placement, "[]"); AssertRankQuantile({ArrayFromJSON(type, "[null]")}, order, null_placement, - /*factor=*/1.0, "[0.5]"); - AssertRankQuantile({ArrayFromJSON(type, "[null]")}, order, null_placement, - /*factor=*/10.0, "[5]"); - AssertRankQuantile({ArrayFromJSON(type, "[null, null, null]")}, order, - null_placement, /*factor=*/1.0, "[0.5, 0.5, 0.5]"); + "[0.5]"); AssertRankQuantile({ArrayFromJSON(type, "[null, null, null]")}, order, - null_placement, /*factor=*/100.0, "[50, 50, 50]"); + null_placement, "[0.5, 0.5, 0.5]"); } } } @@ -2522,22 +2515,22 @@ class TestRankQuantile : public BaseTestRank { // Expecting an input ordered like [1, 2, 1, 2, 1] void AssertRankQuantile_12121() { for (auto null_placement : AllNullPlacements()) { - AssertRankQuantile(SortOrder::Ascending, null_placement, /*factor=*/100.0, - "[30.0, 80.0, 30.0, 80.0, 30.0]"); - AssertRankQuantile(SortOrder::Descending, null_placement, /*factor=*/100.0, - "[70.0, 20.0, 70.0, 20.0, 70.0]"); + AssertRankQuantile(SortOrder::Ascending, null_placement, + "[0.3, 0.8, 0.3, 0.8, 0.3]"); + AssertRankQuantile(SortOrder::Descending, null_placement, + "[0.7, 0.2, 0.7, 0.2, 0.7]"); } } // Expecting an input ordered like [null, 1, null, 2, null] void AssertRankQuantile_N1N2N() { - AssertRankQuantile(SortOrder::Ascending, NullPlacement::AtStart, /*factor=*/1.0, + AssertRankQuantile(SortOrder::Ascending, NullPlacement::AtStart, "[0.3, 0.7, 0.3, 0.9, 0.3]"); - AssertRankQuantile(SortOrder::Ascending, NullPlacement::AtEnd, /*factor=*/1.0, + AssertRankQuantile(SortOrder::Ascending, NullPlacement::AtEnd, "[0.7, 0.1, 0.7, 0.3, 0.7]"); - AssertRankQuantile(SortOrder::Descending, NullPlacement::AtStart, /*factor=*/1.0, + AssertRankQuantile(SortOrder::Descending, NullPlacement::AtStart, 
"[0.3, 0.9, 0.3, 0.7, 0.3]"); - AssertRankQuantile(SortOrder::Descending, NullPlacement::AtEnd, /*factor=*/1.0, + AssertRankQuantile(SortOrder::Descending, NullPlacement::AtEnd, "[0.7, 0.3, 0.7, 0.1, 0.7]"); } @@ -2548,14 +2541,10 @@ class TestRankQuantile : public BaseTestRank { // Reproduce the example from https://en.wikipedia.org/wiki/Percentile_rank SetInput(ArrayFromJSON(type, "[7, 5, 5, 4, 4, 3, 3, 3, 2, 1]")); for (auto null_placement : AllNullPlacements()) { - AssertRankQuantile(SortOrder::Ascending, null_placement, /*factor=*/10.0, - "[9.5, 8.0, 8.0, 6.0, 6.0, 3.5, 3.5, 3.5, 1.5, 0.5]"); - AssertRankQuantile(SortOrder::Ascending, null_placement, /*factor=*/100.0, - "[95, 80, 80, 60, 60, 35, 35, 35, 15, 5]"); - AssertRankQuantile(SortOrder::Descending, null_placement, /*factor=*/10.0, - "[0.5, 2.0, 2.0, 4.0, 4.0, 6.5, 6.5, 6.5, 8.5, 9.5]"); - AssertRankQuantile(SortOrder::Descending, null_placement, /*factor=*/100.0, - "[5, 20, 20, 40, 40, 65, 65, 65, 85, 95]"); + AssertRankQuantile(SortOrder::Ascending, null_placement, + "[0.95, 0.8, 0.8, 0.6, 0.6, 0.35, 0.35, 0.35, 0.15, 0.05]"); + AssertRankQuantile(SortOrder::Descending, null_placement, + "[0.05, 0.2, 0.2, 0.4, 0.4, 0.65, 0.65, 0.65, 0.85, 0.95]"); } // With nulls diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 838ebede2a6..6acc9e31a5f 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -1827,9 +1827,7 @@ in the respective option classes. * \(4) The output is a one-based numerical array of ranks. -* \(5) The output is an array of quantiles between 0 and a constant *factor*. - The *factor* can be configured in :class:`RankQuantileOptions` - (use 100.0 for a percentile rank). +* \(5) The output is an array of quantiles strictly between 0 and 1. * \(6) The input can be an array, chunked array, record batch or table. If the input is a record batch or table, one or more sort