diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index 22ecf1cc878..61335de6ac0 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -48,6 +48,7 @@ using compute::DictionaryEncodeOptions; using compute::FilterOptions; using compute::NullPlacement; using compute::RankOptions; +using compute::RankQuantileOptions; template <> struct EnumTraits @@ -151,6 +152,9 @@ static auto kRankOptionsType = GetFunctionOptionsType( DataMember("sort_keys", &RankOptions::sort_keys), DataMember("null_placement", &RankOptions::null_placement), DataMember("tiebreaker", &RankOptions::tiebreaker)); +static auto kRankQuantileOptionsType = GetFunctionOptionsType( + DataMember("sort_keys", &RankQuantileOptions::sort_keys), + DataMember("null_placement", &RankQuantileOptions::null_placement)); static auto kPairwiseOptionsType = GetFunctionOptionsType( DataMember("periods", &PairwiseOptions::periods)); static auto kListFlattenOptionsType = GetFunctionOptionsType( @@ -228,6 +232,13 @@ RankOptions::RankOptions(std::vector sort_keys, NullPlacement null_plac tiebreaker(tiebreaker) {} constexpr char RankOptions::kTypeName[]; +RankQuantileOptions::RankQuantileOptions(std::vector sort_keys, + NullPlacement null_placement) + : FunctionOptions(internal::kRankQuantileOptionsType), + sort_keys(std::move(sort_keys)), + null_placement(null_placement) {} +constexpr char RankQuantileOptions::kTypeName[]; + PairwiseOptions::PairwiseOptions(int64_t periods) : FunctionOptions(internal::kPairwiseOptionsType), periods(periods) {} constexpr char PairwiseOptions::kTypeName[]; diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index ada1665b3ec..22bb1647197 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -195,6 +195,25 @@ class ARROW_EXPORT RankOptions : public FunctionOptions { Tiebreaker tiebreaker; }; +/// \brief Quantile rank options +class ARROW_EXPORT RankQuantileOptions : public FunctionOptions { + public: + explicit RankQuantileOptions(std::vector sort_keys = {}, + NullPlacement null_placement = NullPlacement::AtEnd); + /// Convenience constructor for array inputs + explicit RankQuantileOptions(SortOrder order, + NullPlacement null_placement = NullPlacement::AtEnd) + : RankQuantileOptions({SortKey("", order)}, null_placement) {} + + static constexpr char const kTypeName[] = "RankQuantileOptions"; + static RankQuantileOptions Defaults() { return RankQuantileOptions(); } + + /// Column key(s) to order by and how to order by these sort keys. + std::vector sort_keys; + /// Whether nulls and NaNs are placed at the start or at the end + NullPlacement null_placement; +}; + /// \brief Partitioning options for NthToIndices class ARROW_EXPORT PartitionNthOptions : public FunctionOptions { public: diff --git a/cpp/src/arrow/compute/kernels/vector_rank.cc b/cpp/src/arrow/compute/kernels/vector_rank.cc index 4fdc83788c6..2efc61c2e6c 100644 --- a/cpp/src/arrow/compute/kernels/vector_rank.cc +++ b/cpp/src/arrow/compute/kernels/vector_rank.cc @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +#include +#include + #include "arrow/compute/function.h" #include "arrow/compute/kernels/vector_sort_internal.h" #include "arrow/compute/registry.h" @@ -32,10 +35,6 @@ namespace { // is the same as the value at the previous sort index. constexpr uint64_t kDuplicateMask = 1ULL << 63; -constexpr bool NeedsDuplicates(RankOptions::Tiebreaker tiebreaker) { - return tiebreaker != RankOptions::First; -} - template void MarkDuplicates(const NullPartitionResult& sorted, ValueSelector&& value_selector) { using T = decltype(value_selector(int64_t{})); @@ -63,81 +62,68 @@ void MarkDuplicates(const NullPartitionResult& sorted, ValueSelector&& value_sel } } -Result CreateRankings(ExecContext* ctx, const NullPartitionResult& sorted, - const NullPlacement null_placement, - const RankOptions::Tiebreaker tiebreaker) { - auto length = sorted.overall_end() - sorted.overall_begin(); - ARROW_ASSIGN_OR_RAISE(auto rankings, - MakeMutableUInt64Array(length, ctx->memory_pool())); - auto out_begin = rankings->GetMutableValues(1); - uint64_t rank; - - auto is_duplicate = [](uint64_t index) { return (index & kDuplicateMask) != 0; }; - auto original_index = [](uint64_t index) { return index & ~kDuplicateMask; }; - - switch (tiebreaker) { - case RankOptions::Dense: { - rank = 0; - for (auto it = sorted.overall_begin(); it < sorted.overall_end(); ++it) { - if (!is_duplicate(*it)) { - ++rank; - } - out_begin[original_index(*it)] = rank; - } - break; - } - - case RankOptions::First: { - rank = 0; - for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) { - // No duplicate marks expected for RankOptions::First - DCHECK(!is_duplicate(*it)); - out_begin[*it] = ++rank; - } - break; - } +const RankOptions* GetDefaultRankOptions() { + static const auto kDefaultRankOptions = RankOptions::Defaults(); + return &kDefaultRankOptions; +} - case RankOptions::Min: { - rank = 0; - for (auto it = sorted.overall_begin(); it < sorted.overall_end(); ++it) { - if (!is_duplicate(*it)) { - rank = (it - sorted.overall_begin()) + 1; - } - out_begin[original_index(*it)] = rank; - } - break; - } +const RankQuantileOptions* GetDefaultQuantileRankOptions() { + static const auto kDefaultQuantileRankOptions = RankQuantileOptions::Defaults(); + return &kDefaultQuantileRankOptions; +} - case RankOptions::Max: { - rank = length; - for (auto it = sorted.overall_end() - 1; it >= sorted.overall_begin(); --it) { - out_begin[original_index(*it)] = rank; - // If the current index isn't marked as duplicate, then it's the last - // tie in a row (since we iterate in reverse order), so update rank - // for the next row of ties. - if (!is_duplicate(*it)) { - rank = it - sorted.overall_begin(); - } - } - break; - } +template +Result DoSortAndMarkDuplicate( + ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end, const Array& input, + const std::shared_ptr& physical_type, const SortOrder order, + const NullPlacement null_placement, bool needs_duplicates) { + using GetView = GetViewType; + using ArrayType = typename TypeTraits::ArrayType; + + ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type)); + + ArrayType array(input.data()); + ARROW_ASSIGN_OR_RAISE(auto sorted, + array_sorter(indices_begin, indices_end, array, 0, + ArraySortOptions(order, null_placement), ctx)); + + if (needs_duplicates) { + auto value_selector = [&array](int64_t index) { + return GetView::LogicalValue(array.GetView(index)); + }; + MarkDuplicates(sorted, value_selector); } - - return Datum(rankings); + return sorted; } -const RankOptions* GetDefaultRankOptions() { - static const auto kDefaultRankOptions = RankOptions::Defaults(); - return &kDefaultRankOptions; +template +Result DoSortAndMarkDuplicate( + ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end, + const ChunkedArray& input, const std::shared_ptr& physical_type, + const SortOrder order, const NullPlacement null_placement, bool needs_duplicates) { + auto physical_chunks = GetPhysicalChunks(input, physical_type); + if (physical_chunks.empty()) { + return NullPartitionResult{}; + } + ARROW_ASSIGN_OR_RAISE(auto sorted, + SortChunkedArray(ctx, indices_begin, indices_end, physical_type, + physical_chunks, order, null_placement)); + if (needs_duplicates) { + const auto arrays = GetArrayPointers(physical_chunks); + auto value_selector = [resolver = ChunkedArrayResolver(span(arrays))](int64_t index) { + return resolver.Resolve(index).Value(); + }; + MarkDuplicates(sorted, value_selector); + } + return sorted; } -template -class RankerMixin : public TypeVisitor { +template +class SortAndMarkDuplicate : public TypeVisitor { public: - RankerMixin(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end, - const InputType& input, const SortOrder order, - const NullPlacement null_placement, - const RankOptions::Tiebreaker tiebreaker, Datum* output) + SortAndMarkDuplicate(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end, + const InputType& input, const SortOrder order, + const NullPlacement null_placement, const bool needs_duplicate) : TypeVisitor(), ctx_(ctx), indices_begin_(indices_begin), @@ -145,104 +131,145 @@ class RankerMixin : public TypeVisitor { input_(input), order_(order), null_placement_(null_placement), - tiebreaker_(tiebreaker), - physical_type_(GetPhysicalType(input.type())), - output_(output) {} + needs_duplicates_(needs_duplicate), + physical_type_(GetPhysicalType(input.type())) {} - Status Run() { return physical_type_->Accept(this); } + Result Run() { + RETURN_NOT_OK(physical_type_->Accept(this)); + return sorted_; + } -#define VISIT(TYPE) \ - Status Visit(const TYPE& type) { \ - return static_cast(this)->template RankInternal(); \ +#define VISIT(TYPE) \ + Status Visit(const TYPE& type) { \ + ARROW_ASSIGN_OR_RAISE( \ + sorted_, DoSortAndMarkDuplicate(ctx_, indices_begin_, indices_end_, \ + input_, physical_type_, order_, \ + null_placement_, needs_duplicates_)); \ + return Status::OK(); \ } VISIT_SORTABLE_PHYSICAL_TYPES(VISIT) #undef VISIT - protected: + private: ExecContext* ctx_; uint64_t* indices_begin_; uint64_t* indices_end_; const InputType& input_; const SortOrder order_; const NullPlacement null_placement_; - const RankOptions::Tiebreaker tiebreaker_; + const bool needs_duplicates_; const std::shared_ptr physical_type_; - Datum* output_; + NullPartitionResult sorted_{}; }; -template -class Ranker; - -template <> -class Ranker : public RankerMixin> { - public: - using RankerMixin::RankerMixin; - - template - Status RankInternal() { - using GetView = GetViewType; - using ArrayType = typename TypeTraits::ArrayType; +// A helper class that emits rankings for the "rank_quantile" function +struct QuantileRanker { + Result CreateRankings(ExecContext* ctx, const NullPartitionResult& sorted) { + const int64_t length = sorted.overall_end() - sorted.overall_begin(); + ARROW_ASSIGN_OR_RAISE(auto rankings, + MakeMutableFloat64Array(length, ctx->memory_pool())); + auto out_begin = rankings->GetMutableValues(1); + + auto is_duplicate = [](uint64_t index) { return (index & kDuplicateMask) != 0; }; + auto original_index = [](uint64_t index) { return index & ~kDuplicateMask; }; + + // The count of values strictly less than the value being considered + int64_t cum_freq = 0; + auto it = sorted.overall_begin(); + + while (it < sorted.overall_end()) { + // Look for a run of duplicate values + DCHECK(!is_duplicate(*it)); + auto run_end = it; + while (++run_end < sorted.overall_end() && is_duplicate(*run_end)) { + } + // The run length, i.e. the frequency of the current value + int64_t freq = run_end - it; + double quantile = (cum_freq + 0.5 * freq) / static_cast(length); + // Output quantile rank values + for (; it < run_end; ++it) { + out_begin[original_index(*it)] = quantile; + } + cum_freq += freq; + } + DCHECK_EQ(cum_freq, length); + return Datum(rankings); + } +}; - ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_)); +// A helper class that emits rankings for the "rank" function +struct OrdinalRanker { + explicit OrdinalRanker(RankOptions::Tiebreaker tiebreaker) : tiebreaker_(tiebreaker) {} - ArrayType array(input_.data()); - ARROW_ASSIGN_OR_RAISE(NullPartitionResult sorted, - array_sorter(indices_begin_, indices_end_, array, 0, - ArraySortOptions(order_, null_placement_), ctx_)); + Result CreateRankings(ExecContext* ctx, const NullPartitionResult& sorted) { + const int64_t length = sorted.overall_end() - sorted.overall_begin(); + ARROW_ASSIGN_OR_RAISE(auto rankings, + MakeMutableUInt64Array(length, ctx->memory_pool())); + auto out_begin = rankings->GetMutableValues(1); + uint64_t rank; + + auto is_duplicate = [](uint64_t index) { return (index & kDuplicateMask) != 0; }; + auto original_index = [](uint64_t index) { return index & ~kDuplicateMask; }; + + switch (tiebreaker_) { + case RankOptions::Dense: { + rank = 0; + for (auto it = sorted.overall_begin(); it < sorted.overall_end(); ++it) { + if (!is_duplicate(*it)) { + ++rank; + } + out_begin[original_index(*it)] = rank; + } + break; + } - if (NeedsDuplicates(tiebreaker_)) { - auto value_selector = [&array](int64_t index) { - return GetView::LogicalValue(array.GetView(index)); - }; - MarkDuplicates(sorted, value_selector); - } - ARROW_ASSIGN_OR_RAISE(*output_, - CreateRankings(ctx_, sorted, null_placement_, tiebreaker_)); + case RankOptions::First: { + rank = 0; + for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) { + // No duplicate marks expected for RankOptions::First + DCHECK(!is_duplicate(*it)); + out_begin[*it] = ++rank; + } + break; + } - return Status::OK(); - } -}; + case RankOptions::Min: { + rank = 0; + for (auto it = sorted.overall_begin(); it < sorted.overall_end(); ++it) { + if (!is_duplicate(*it)) { + rank = (it - sorted.overall_begin()) + 1; + } + out_begin[original_index(*it)] = rank; + } + break; + } -template <> -class Ranker : public RankerMixin> { - public: - template - explicit Ranker(Args&&... args) - : RankerMixin(std::forward(args)...), - physical_chunks_(GetPhysicalChunks(input_, physical_type_)) {} - - template - Status RankInternal() { - if (physical_chunks_.empty()) { - return Status::OK(); + case RankOptions::Max: { + rank = length; + for (auto it = sorted.overall_end() - 1; it >= sorted.overall_begin(); --it) { + out_begin[original_index(*it)] = rank; + // If the current index isn't marked as duplicate, then it's the last + // tie in a row (since we iterate in reverse order), so update rank + // for the next row of ties. + if (!is_duplicate(*it)) { + rank = it - sorted.overall_begin(); + } + } + break; + } } - ARROW_ASSIGN_OR_RAISE( - NullPartitionResult sorted, - SortChunkedArray(ctx_, indices_begin_, indices_end_, physical_type_, - physical_chunks_, order_, null_placement_)); - - if (NeedsDuplicates(tiebreaker_)) { - const auto arrays = GetArrayPointers(physical_chunks_); - auto value_selector = [resolver = - ChunkedArrayResolver(span(arrays))](int64_t index) { - return resolver.Resolve(index).Value(); - }; - MarkDuplicates(sorted, value_selector); - } - ARROW_ASSIGN_OR_RAISE(*output_, - CreateRankings(ctx_, sorted, null_placement_, tiebreaker_)); - return Status::OK(); + return Datum(rankings); } private: - const ArrayVector physical_chunks_; + const RankOptions::Tiebreaker tiebreaker_; }; const FunctionDoc rank_doc( - "Compute numerical ranks of an array (1-based)", + "Compute ordinal ranks of an array (1-based)", ("This function computes a rank of the input array.\n" "By default, null values are considered greater than any other value and\n" "are therefore sorted at the end of the input. For floating-point types,\n" @@ -253,21 +280,34 @@ const FunctionDoc rank_doc( "The handling of nulls, NaNs and tiebreakers can be changed in RankOptions."), {"input"}, "RankOptions"); -class RankMetaFunction : public MetaFunction { +const FunctionDoc rank_quantile_doc( + "Compute quantile ranks of an array", + ("This function computes a quantile rank of the input array.\n" + "By default, null values are considered greater than any other value and\n" + "are therefore sorted at the end of the input. For floating-point types,\n" + "NaNs are considered greater than any other non-null value, but smaller\n" + "than null values.\n" + "The results are real values strictly between 0 and 1. They are\n" + "computed as in https://en.wikipedia.org/wiki/Quantile_rank\n" + "but without multiplying by 100.\n" + "\n" + "The handling of nulls and NaNs can be changed in RankQuantileOptions."), + {"input"}, "RankQuantileOptions"); + +template +class RankMetaFunctionBase : public MetaFunction { public: - RankMetaFunction() - : MetaFunction("rank", Arity::Unary(), rank_doc, GetDefaultRankOptions()) {} + using MetaFunction::MetaFunction; Result ExecuteImpl(const std::vector& args, const FunctionOptions* options, ExecContext* ctx) const override { - const auto& rank_options = checked_cast(*options); switch (args[0].kind()) { case Datum::ARRAY: { - return Rank(*args[0].make_array(), rank_options, ctx); + return Rank(*args[0].make_array(), *options, ctx); } break; case Datum::CHUNKED_ARRAY: { - return Rank(*args[0].chunked_array(), rank_options, ctx); + return Rank(*args[0].chunked_array(), *options, ctx); } break; default: break; @@ -278,10 +318,13 @@ class RankMetaFunction : public MetaFunction { args[0].ToString()); } - private: + protected: template - static Result Rank(const T& input, const RankOptions& options, - ExecContext* ctx) { + Result Rank(const T& input, const FunctionOptions& function_options, + ExecContext* ctx) const { + const auto& options = + checked_cast(function_options); + SortOrder order = SortOrder::Ascending; if (!options.sort_keys.empty()) { order = options.sort_keys[0].order; @@ -293,19 +336,53 @@ class RankMetaFunction : public MetaFunction { auto* indices_begin = indices->GetMutableValues(1); auto* indices_end = indices_begin + length; std::iota(indices_begin, indices_end, 0); + auto needs_duplicates = Derived::NeedsDuplicates(options); + ARROW_ASSIGN_OR_RAISE( + auto sorted, SortAndMarkDuplicate(ctx, indices_begin, indices_end, input, order, + options.null_placement, needs_duplicates) + .Run()); + + auto ranker = Derived::GetRanker(options); + return ranker.CreateRankings(ctx, sorted); + } +}; + +class RankMetaFunction : public RankMetaFunctionBase { + public: + using FunctionOptionsType = RankOptions; + using RankerType = OrdinalRanker; - Datum output; - Ranker ranker(ctx, indices_begin, indices_end, input, order, - options.null_placement, options.tiebreaker, &output); - ARROW_RETURN_NOT_OK(ranker.Run()); - return output; + static bool NeedsDuplicates(const RankOptions& options) { + return options.tiebreaker != RankOptions::First; } + + static RankerType GetRanker(const RankOptions& options) { + return RankerType(options.tiebreaker); + } + + RankMetaFunction() + : RankMetaFunctionBase("rank", Arity::Unary(), rank_doc, GetDefaultRankOptions()) {} +}; + +class RankQuantileMetaFunction : public RankMetaFunctionBase { + public: + using FunctionOptionsType = RankQuantileOptions; + using RankerType = QuantileRanker; + + static bool NeedsDuplicates(const RankQuantileOptions&) { return true; } + + static RankerType GetRanker(const RankQuantileOptions& options) { return RankerType(); } + + RankQuantileMetaFunction() + : RankMetaFunctionBase("rank_quantile", Arity::Unary(), rank_quantile_doc, + GetDefaultQuantileRankOptions()) {} }; } // namespace void RegisterVectorRank(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::make_shared())); + DCHECK_OK(registry->AddFunction(std::make_shared())); } } // namespace arrow::compute::internal diff --git a/cpp/src/arrow/compute/kernels/vector_sort_internal.h b/cpp/src/arrow/compute/kernels/vector_sort_internal.h index cc6b7834a30..6288aa26eaa 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_internal.h +++ b/cpp/src/arrow/compute/kernels/vector_sort_internal.h @@ -806,4 +806,11 @@ inline Result> MakeMutableUInt64Array( return ArrayData::Make(uint64(), length, {nullptr, std::move(data)}, /*null_count=*/0); } +inline Result> MakeMutableFloat64Array( + int64_t length, MemoryPool* memory_pool) { + auto buffer_size = length * sizeof(double); + ARROW_ASSIGN_OR_RAISE(auto data, AllocateBuffer(buffer_size, memory_pool)); + return ArrayData::Make(float64(), length, {nullptr, std::move(data)}, /*null_count=*/0); +} + } // namespace arrow::compute::internal diff --git a/cpp/src/arrow/compute/kernels/vector_sort_test.cc b/cpp/src/arrow/compute/kernels/vector_sort_test.cc index 7f0ef641f6c..dc1c055705d 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort_test.cc @@ -2205,9 +2205,9 @@ TEST_F(TestNestedSortIndices, SortRecordBatch) { TestSort(GetRecordBatch()); } TEST_F(TestNestedSortIndices, SortTable) { TestSort(GetTable()); } // ---------------------------------------------------------------------- -// Tests for Rank +// Tests for Rank and Quantile Rank -class TestRank : public ::testing::Test { +class BaseTestRank : public ::testing::Test { protected: // Create several test datums from `array`. One of which is the unmodified Array // while the rest are chunked variants based on it. @@ -2236,6 +2236,11 @@ class TestRank : public ::testing::Test { datums_ = {chunked_array}; } + DatumVector datums_; +}; + +class TestRank : public BaseTestRank { + protected: static void AssertRank(const DatumVector& datums, SortOrder order, NullPlacement null_placement, RankOptions::Tiebreaker tiebreaker, const std::shared_ptr& expected) { @@ -2310,8 +2315,6 @@ class TestRank : public ::testing::Test { AssertRank(SortOrder::Descending, NullPlacement::AtStart, RankOptions::Dense, ArrayFromJSON(uint64(), "[3, 4, 2, 1, 2, 1, 4]")); } - - DatumVector datums_; }; TEST_F(TestRank, Real) { @@ -2466,5 +2469,140 @@ TEST_F(TestRank, EmptyChunks) { } } +class TestRankQuantile : public BaseTestRank { + public: + void AssertRankQuantile(const DatumVector& datums, SortOrder order, + NullPlacement null_placement, + const std::shared_ptr& expected) { + const std::vector sort_keys{SortKey("foo", order)}; + RankQuantileOptions options(sort_keys, null_placement); + ARROW_SCOPED_TRACE("options = ", options.ToString()); + for (const auto& datum : datums) { + ASSERT_OK_AND_ASSIGN(auto actual, CallFunction("rank_quantile", {datum}, &options)); + ValidateOutput(actual); + AssertDatumsEqual(expected, actual, /*verbose=*/true); + } + } + + void AssertRankQuantile(const DatumVector& datums, SortOrder order, + NullPlacement null_placement, const std::string& expected) { + AssertRankQuantile(datums, order, null_placement, ArrayFromJSON(float64(), expected)); + } + + void AssertRankQuantile(SortOrder order, NullPlacement null_placement, + const std::shared_ptr& expected) { + AssertRankQuantile(datums_, order, null_placement, expected); + } + + void AssertRankQuantile(SortOrder order, NullPlacement null_placement, + const std::string& expected) { + AssertRankQuantile(datums_, order, null_placement, + ArrayFromJSON(float64(), expected)); + } + + void AssertRankQuantileEmpty(std::shared_ptr type) { + for (auto null_placement : AllNullPlacements()) { + for (auto order : AllOrders()) { + AssertRankQuantile({ArrayFromJSON(type, "[]")}, order, null_placement, "[]"); + AssertRankQuantile({ArrayFromJSON(type, "[null]")}, order, null_placement, + "[0.5]"); + AssertRankQuantile({ArrayFromJSON(type, "[null, null, null]")}, order, + null_placement, "[0.5, 0.5, 0.5]"); + } + } + } + + // Expecting an input ordered like [1, 2, 1, 2, 1] + void AssertRankQuantile_12121() { + for (auto null_placement : AllNullPlacements()) { + AssertRankQuantile(SortOrder::Ascending, null_placement, + "[0.3, 0.8, 0.3, 0.8, 0.3]"); + AssertRankQuantile(SortOrder::Descending, null_placement, + "[0.7, 0.2, 0.7, 0.2, 0.7]"); + } + } + + // Expecting an input ordered like [null, 1, null, 2, null] + void AssertRankQuantile_N1N2N() { + AssertRankQuantile(SortOrder::Ascending, NullPlacement::AtStart, + "[0.3, 0.7, 0.3, 0.9, 0.3]"); + AssertRankQuantile(SortOrder::Ascending, NullPlacement::AtEnd, + "[0.7, 0.1, 0.7, 0.3, 0.7]"); + AssertRankQuantile(SortOrder::Descending, NullPlacement::AtStart, + "[0.3, 0.9, 0.3, 0.7, 0.3]"); + AssertRankQuantile(SortOrder::Descending, NullPlacement::AtEnd, + "[0.7, 0.3, 0.7, 0.1, 0.7]"); + } + + void AssertRankQuantileNumeric(std::shared_ptr type) { + ARROW_SCOPED_TRACE("type = ", type->ToString()); + AssertRankQuantileEmpty(type); + + // Reproduce the example from https://en.wikipedia.org/wiki/Percentile_rank + SetInput(ArrayFromJSON(type, "[7, 5, 5, 4, 4, 3, 3, 3, 2, 1]")); + for (auto null_placement : AllNullPlacements()) { + AssertRankQuantile(SortOrder::Ascending, null_placement, + "[0.95, 0.8, 0.8, 0.6, 0.6, 0.35, 0.35, 0.35, 0.15, 0.05]"); + AssertRankQuantile(SortOrder::Descending, null_placement, + "[0.05, 0.2, 0.2, 0.4, 0.4, 0.65, 0.65, 0.65, 0.85, 0.95]"); + } + + // With nulls + SetInput(ArrayFromJSON(type, "[null, 1, null, 2, null]")); + AssertRankQuantile_N1N2N(); + } + + void AssertRankQuantileBinaryLike(std::shared_ptr type) { + ARROW_SCOPED_TRACE("type = ", type->ToString()); + AssertRankQuantileEmpty(type); + + SetInput(ArrayFromJSON(type, R"(["", "ab", "", "ab", ""])")); + AssertRankQuantile_12121(); + // With nulls + SetInput(ArrayFromJSON(type, R"([null, "", null, "ab", null])")); + AssertRankQuantile_N1N2N(); + } +}; + +TEST_F(TestRankQuantile, Real) { + for (auto type : ::arrow::FloatingPointTypes()) { + AssertRankQuantileNumeric(type); + } +} + +TEST_F(TestRankQuantile, Integral) { + for (auto type : ::arrow::IntTypes()) { + AssertRankQuantileNumeric(type); + } +} + +TEST_F(TestRankQuantile, Boolean) { + auto type = boolean(); + AssertRankQuantileEmpty(type); + + SetInput(ArrayFromJSON(type, "[false, true, false, true, false]")); + AssertRankQuantile_12121(); + // With nulls + SetInput(ArrayFromJSON(type, "[null, false, null, true, null]")); + AssertRankQuantile_N1N2N(); +} + +TEST_F(TestRankQuantile, BinaryLike) { + for (auto type : BaseBinaryTypes()) { + AssertRankQuantileBinaryLike(type); + } +} + +TEST_F(TestRankQuantile, FixedSizeBinary) { + auto type = fixed_size_binary(3); + AssertRankQuantileEmpty(type); + + SetInput(ArrayFromJSON(type, R"(["abc", "def", "abc", "def", "abc"])")); + AssertRankQuantile_12121(); + // With nulls + SetInput(ArrayFromJSON(type, R"([null, "abc", null, "def", null])")); + AssertRankQuantile_N1N2N(); +} + } // namespace compute } // namespace arrow diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 92f3e440391..6acc9e31a5f 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -1796,19 +1796,21 @@ in the respective option classes. Binary- and String-like inputs are ordered lexicographically as bytestrings, even for String types. -+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ -| Function name | Arity | Input types | Output type | Options class | Notes | -+=======================+============+=========================================================+===================+================================+================+ -| array_sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`ArraySortOptions` | \(1) \(2) | -+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ -| partition_nth_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`PartitionNthOptions` | \(3) | -+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ -| rank | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`RankOptions` | \(4) | -+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ -| select_k_unstable | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`SelectKOptions` | \(5) \(6) | -+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ -| sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`SortOptions` | \(1) \(5) | -+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ ++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++=======================+============+=========================================================+===================+===============================+================+ +| array_sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`ArraySortOptions` | \(1) \(2) | ++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+ +| partition_nth_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`PartitionNthOptions` | \(3) | ++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+ +| rank | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`RankOptions` | \(4) | ++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+ +| rank_quantile | Unary | Boolean, Numeric, Temporal, Binary- and String-like | Float64 | :struct:`RankQuantileOptions` | \(5) | ++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+ +| select_k_unstable | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`SelectKOptions` | \(6) \(7) | ++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+ +| sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`SortOptions` | \(1) \(6) | ++-----------------------+------------+---------------------------------------------------------+-------------------+-------------------------------+----------------+ * \(1) The output is an array of indices into the input, that define a @@ -1823,13 +1825,15 @@ in the respective option classes. :func:`std::nth_element`). *N* is given in :member:`PartitionNthOptions::pivot`. -* \(4) The output is a one-based numerical array of ranks +* \(4) The output is a one-based numerical array of ranks. -* \(5) The input can be an array, chunked array, record batch or +* \(5) The output is an array of quantiles strictly between 0 and 1. + +* \(6) The input can be an array, chunked array, record batch or table. If the input is a record batch or table, one or more sort keys must be specified. -* \(6) The output is an array of indices into the input, that define a +* \(7) The output is an array of indices into the input, that define a non-stable sort of the input. .. _cpp-compute-vector-structural-transforms: