Merge branch 'dmlc:release_3.0.0' into release_3.0.0

razdoburdin · web-flow · commit 0b280dda6b1c · 2025-03-11T13:16:43.000+01:00
diff --git a/doc/parameter.rst b/doc/parameter.rst
@@ -540,6 +540,10 @@ These are parameters specific to learning to rank task. See :doc:`Learning to Ra
 
   Whether to normalize the leaf value by lambda gradient. This can sometimes stagnate the training progress.
 
+  .. versionchanged:: 3.0.0
+
+  When the ``mean`` pair method is used, the normalization is done by ``lambdarank_num_pair_per_sample`` instead of by the lambda gradient.
+
 * ``lambdarank_score_normalization`` [default = ``true``]
 
   .. versionadded:: 3.0.0
diff --git a/doc/tutorials/learning_to_rank.rst b/doc/tutorials/learning_to_rank.rst
@@ -198,8 +198,6 @@ The learning to rank implementation has been significantly updated in 2.0 with a
         # 1.7 only supports sampling, while 2.0 and later use top-k as the default.
 	# See above sections for the trade-off.
         "lambdarank_pair_method": "mean",
-        # Normalization was added in 2.0
-        "lambdarank_normalization": False,
         # 1.7 uses the ranknet loss while later versions use the NDCG weighted loss
         "objective": "rank:pairwise",
 	# 1.7 doesn't have this normalization.
diff --git a/include/xgboost/base.h b/include/xgboost/base.h
@@ -105,9 +105,13 @@ using bst_bin_t = std::int32_t;  // NOLINT
  * @brief Type for data row index (sample).
  */
 using bst_idx_t = std::uint64_t;  // NOLINT
-/*! \brief Type for tree node index. */
+/**
+ * @brief Type for tree node index.
+ */
 using bst_node_t = std::int32_t;      // NOLINT
-/*! \brief Type for ranking group index. */
+/**
+ * @brief Type for ranking group index.
+ */
 using bst_group_t = std::uint32_t;  // NOLINT
 /**
  * @brief Type for indexing into output targets.
diff --git a/python-package/xgboost/testing/ranking.py b/python-package/xgboost/testing/ranking.py
@@ -105,6 +105,7 @@ def run_ranking_categorical(device: str) -> None:
 def run_normalization(device: str) -> None:
     """Test normalization."""
     X, y, qid, _ = tm.make_ltr(2048, 4, 64, 3)
+    # top-k
     ltr = xgb.XGBRanker(objective="rank:pairwise", n_estimators=4, device=device)
     ltr.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid])
     e0 = ltr.evals_result()
@@ -119,6 +120,53 @@ def run_normalization(device: str) -> None:
     e1 = ltr.evals_result()
     assert e1["validation_0"]["ndcg@32"][-1] > e0["validation_0"]["ndcg@32"][-1]
 
+    # mean
+    ltr = xgb.XGBRanker(
+        objective="rank:pairwise",
+        n_estimators=4,
+        device=device,
+        lambdarank_pair_method="mean",
+        lambdarank_normalization=True,
+    )
+    ltr.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid])
+    e0 = ltr.evals_result()
+
+    ltr = xgb.XGBRanker(
+        objective="rank:pairwise",
+        n_estimators=4,
+        device=device,
+        lambdarank_pair_method="mean",
+        lambdarank_normalization=False,
+    )
+    ltr.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid])
+    e1 = ltr.evals_result()
+    # no normalization since the number of pairs is 1.
+    assert e1["validation_0"]["ndcg"][-1] == e0["validation_0"]["ndcg"][-1]
+
+    # mean
+    ltr = xgb.XGBRanker(
+        objective="rank:pairwise",
+        n_estimators=4,
+        device=device,
+        lambdarank_pair_method="mean",
+        lambdarank_normalization=True,
+        lambdarank_num_pair_per_sample=4,
+    )
+    ltr.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid])
+    e0 = ltr.evals_result()
+
+    ltr = xgb.XGBRanker(
+        objective="rank:pairwise",
+        n_estimators=4,
+        device=device,
+        lambdarank_pair_method="mean",
+        lambdarank_normalization=False,
+        lambdarank_num_pair_per_sample=4,
+    )
+    ltr.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid])
+    e1 = ltr.evals_result()
+    assert e1["validation_0"]["ndcg"][-1] != e0["validation_0"]["ndcg"][-1]
+
 
 def run_score_normalization(device: str, objective: str) -> None:
     """Test normalization by score differences."""
diff --git a/src/common/ranking_utils.cuh b/src/common/ranking_utils.cuh
@@ -30,6 +30,8 @@ XGBOOST_DEVICE __forceinline__ std::size_t ThreadsForMean(std::size_t group_size
                                                           std::size_t n_pairs) {
   return group_size * n_pairs;
 }
+// Divides the number of threads in a group by the number of samples in that group, which
+// gives the number of pairs per sample for pair-wise LTR with sampling.
 XGBOOST_DEVICE __forceinline__ std::size_t PairsForGroup(std::size_t n_threads,
                                                          std::size_t group_size) {
   return n_threads / group_size;
diff --git a/src/common/ranking_utils.h b/src/common/ranking_utils.h
@@ -115,6 +115,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
   }
 
   [[nodiscard]] bool HasTruncation() const { return lambdarank_pair_method == PairMethod::kTopK; }
+  [[nodiscard]] bool IsMean() const { return lambdarank_pair_method == PairMethod::kMean; }
 
   // Used for evaluation metric and cache initialization, iterate through top-k or the whole list
   [[nodiscard]] auto TopK() const {
@@ -180,7 +181,8 @@ class RankingCache {
   HostDeviceVector<std::size_t> y_sorted_idx_cache_;
   // Cached labels sorted by the model
   HostDeviceVector<float> y_ranked_by_model_;
-  // store rounding factor for objective for each group
+  // Rounding factor for CUDA deterministic floating point summation. One rounding factor
+  // for each ranking group.
   linalg::Vector<GradientPair> roundings_;
   // rounding factor for cost
   HostDeviceVector<double> cost_rounding_;
@@ -215,6 +217,9 @@ class RankingCache {
     if (!info.weights_.Empty()) {
       CHECK_EQ(Groups(), info.weights_.Size()) << error::GroupWeight();
     }
+    if (param_.HasTruncation()) {
+      CHECK_GE(param_.NumPair(), 1);
+    }
   }
   [[nodiscard]] std::size_t MaxPositionSize() const {
     // Use truncation level as bound.
@@ -267,21 +272,21 @@ class RankingCache {
   }
 
   // CUDA cache getters, the cache is shared between metric and objective, some of these
-  // fields are lazy initialized to avoid unnecessary allocation.
+  // fields are initialized lazily to avoid unnecessary allocation.
   [[nodiscard]] common::Span<std::size_t const> CUDAThreadsGroupPtr() const {
     CHECK(!threads_group_ptr_.Empty());
     return threads_group_ptr_.ConstDeviceSpan();
   }
   [[nodiscard]] std::size_t CUDAThreads() const { return n_cuda_threads_; }
 
-  linalg::VectorView<GradientPair> CUDARounding(Context const* ctx) {
+  [[nodiscard]] linalg::VectorView<GradientPair> CUDARounding(Context const* ctx) {
     if (roundings_.Size() == 0) {
       roundings_.SetDevice(ctx->Device());
       roundings_.Reshape(Groups());
     }
     return roundings_.View(ctx->Device());
   }
-  common::Span<double> CUDACostRounding(Context const* ctx) {
+  [[nodiscard]] common::Span<double> CUDACostRounding(Context const* ctx) {
     if (cost_rounding_.Size() == 0) {
       cost_rounding_.SetDevice(ctx->Device());
       cost_rounding_.Resize(1);
diff --git a/src/objective/lambdarank_obj.cc b/src/objective/lambdarank_obj.cc
@@ -225,10 +225,23 @@ class LambdaRankObj : public FitIntercept {
     };
 
     MakePairs(ctx_, iter, p_cache_, g, g_label, g_rank, loop);
-    if (sum_lambda > 0.0 && param_.lambdarank_normalization) {
-      double norm = std::log2(1.0 + sum_lambda) / sum_lambda;
-      std::transform(g_gpair.Values().data(), g_gpair.Values().data() + g_gpair.Size(),
-                     g_gpair.Values().data(), [norm](GradientPair const& g) { return g * norm; });
+    if (param_.lambdarank_normalization) {
+      double norm = 1.0;
+      if (param_.IsMean()) {
+        // Normalize using the number of pairs for mean.
+        auto n_pairs = this->p_cache_->Param().NumPair();
+        auto scale = 1.0 / static_cast<double>(n_pairs);
+        norm = scale;
+      } else {
+        // Normalize using gradient for top-k.
+        if (sum_lambda > 0.0) {
+          norm = std::log2(1.0 + sum_lambda) / sum_lambda;
+        }
+      }
+      if (norm != 1.0) {
+        std::transform(linalg::begin(g_gpair), linalg::end(g_gpair), linalg::begin(g_gpair),
+                       [norm](GradientPair const& g) { return g * norm; });
+      }
     }
 
     auto w_norm = p_cache_->WeightNorm();
diff --git a/src/objective/lambdarank_obj.cu b/src/objective/lambdarank_obj.cu
@@ -3,18 +3,19 @@
  *
  * \brief CUDA implementation of lambdarank.
  */
+#include <dmlc/registry.h>                      // for DMLC_REGISTRY_FILE_TAG
 #include <thrust/fill.h>                        // for fill_n
 #include <thrust/for_each.h>                    // for for_each_n
 #include <thrust/iterator/counting_iterator.h>  // for make_counting_iterator
 #include <thrust/iterator/zip_iterator.h>       // for make_zip_iterator
 #include <thrust/tuple.h>                       // for make_tuple, tuple, tie, get
 
-#include <algorithm>                            // for min
-#include <cassert>                              // for assert
-#include <cmath>                                // for abs, log2, isinf
-#include <cstddef>                              // for size_t
-#include <cstdint>                              // for int32_t
-#include <memory>                               // for shared_ptr
+#include <algorithm>  // for min
+#include <cassert>    // for assert
+#include <cmath>      // for abs, log2, isinf
+#include <cstddef>    // for size_t
+#include <cstdint>    // for int32_t
+#include <memory>     // for shared_ptr
 #include <utility>
 
 #include "../common/algorithm.cuh"       // for SegmentedArgSort
@@ -31,7 +32,7 @@
 #include "xgboost/host_device_vector.h"  // for HostDeviceVector
 #include "xgboost/linalg.h"              // for VectorView, Range, Vector
 #include "xgboost/logging.h"
-#include "xgboost/span.h"                // for Span
+#include "xgboost/span.h"  // for Span
 
 namespace xgboost::obj {
 DMLC_REGISTRY_FILE_TAG(lambdarank_obj_cu);
@@ -82,7 +83,7 @@ struct GetGradOp {
   MakePairsOp<has_truncation> make_pair;
   Delta delta;
 
-  bool need_update;
+  bool const need_update;
 
   auto __device__ operator()(std::size_t idx) -> GradCostNorm {
     auto const& args = make_pair.args;
@@ -95,6 +96,7 @@ struct GetGradOp {
     auto g_predt = args.predts.subspan(data_group_begin, n_data);
     auto g_gpair = args.gpairs.Slice(linalg::Range(data_group_begin, data_group_begin + n_data));
     auto g_rank = args.d_sorted_idx.subspan(data_group_begin, n_data);
+    auto n_pairs = args.n_pairs;
 
     auto [i, j] = make_pair(idx, g);
 
@@ -108,7 +110,9 @@ struct GetGradOp {
 
     double cost{0};
 
-    auto delta_op = [&](auto const&... args) { return delta(args..., g); };
+    auto delta_op = [&](auto const&... args) {
+      return delta(args..., g);
+    };
     GradientPair pg =
         LambdaGrad<unbiased, norm_by_diff>(g_label, g_predt, g_rank, rank_high, rank_low, delta_op,
                                            args.ti_plus, args.tj_minus, &cost);
@@ -118,7 +122,6 @@ struct GetGradOp {
 
     if (need_update) {
       // second run, update the gradient
-
       auto ng = Repulse(pg);
 
       auto gr = args.d_roundings(g);
@@ -153,6 +156,7 @@ struct GetGradOp {
         }
       }
     }
+
     return thrust::make_tuple(GradientPair{std::abs(pg.GetGrad()), std::abs(pg.GetHess())},
                               std::abs(cost), -2.0 * static_cast<double>(pg.GetGrad()));
   }
@@ -215,12 +219,12 @@ void CalcGrad(Context const* ctx, MetaInfo const& info, std::shared_ptr<ltr::Ran
     auto hess = std::max(lg.GetHess(), rg.GetHess());
     auto cost = std::max(thrust::get<1>(l), thrust::get<1>(r));
     double sum_lambda = thrust::get<2>(l) + thrust::get<2>(r);
-    return thrust::make_tuple(GradientPair{std::abs(grad), std::abs(hess)}, cost, sum_lambda);
+    return thrust::make_tuple(GradientPair{grad, hess}, cost, sum_lambda);
   };
   auto init = thrust::make_tuple(GradientPair{0.0f, 0.0f}, 0.0, 0.0);
   common::Span<GradCostNorm> d_max_lambdas = p_cache->MaxLambdas<GradCostNorm>(ctx, n_groups);
   CHECK_EQ(n_groups * sizeof(GradCostNorm), d_max_lambdas.size_bytes());
-
+  // Reduce by group.
   std::size_t bytes;
   cub::DeviceSegmentedReduce::Reduce(nullptr, bytes, val_it, d_max_lambdas.data(), n_groups,
                                      d_threads_group_ptr.data(), d_threads_group_ptr.data() + 1,
@@ -267,22 +271,35 @@ void CalcGrad(Context const* ctx, MetaInfo const& info, std::shared_ptr<ltr::Ran
    */
   auto d_weights = common::MakeOptionalWeights(ctx, info.weights_);
   auto w_norm = p_cache->WeightNorm();
-  auto norm = p_cache->Param().lambdarank_normalization;
+  auto need_norm = p_cache->Param().lambdarank_normalization;
+  auto n_pairs = p_cache->Param().NumPair();
+  bool is_mean = p_cache->Param().IsMean();
+  CHECK_EQ(is_mean, !has_truncation);
   thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul), d_gpair.Size(),
                      [=] XGBOOST_DEVICE(std::size_t i) mutable {
                        auto g = dh::SegmentId(d_gptr, i);
-                       auto sum_lambda = thrust::get<2>(d_max_lambdas[g]);
-                       // Normalization
-                       if (sum_lambda > 0.0 && norm) {
-                         double norm = std::log2(1.0 + sum_lambda) / sum_lambda;
+                       if (need_norm) {
+                         double norm = 1.0;
+                         if (has_truncation) {
+                           // Normalize using gradient for top-k.
+                           auto sum_lambda = thrust::get<2>(d_max_lambdas[g]);
+                           if (sum_lambda > 0.0) {
+                             norm = std::log2(1.0 + sum_lambda) / sum_lambda;
+                           }
+                         } else {
+                           // Normalize using the number of pairs for mean.
+                           double scale = 1.0 / static_cast<double>(n_pairs);
+                           norm = scale;
+                         }
                          d_gpair(i, 0) *= norm;
                        }
+
                        d_gpair(i, 0) *= (d_weights[g] * w_norm);
                      });
 }
 
 /**
- * \brief Handles boilerplate code like getting device span.
+ * @brief Handles boilerplate code like getting device spans.
  */
 template <bool norm_by_diff, typename Delta>
 void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector<float> const& preds,
@@ -302,7 +319,6 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector<float> const
   out_gpair->Reshape(preds.Size(), 1);
 
   CHECK(p_cache);
-
   auto d_rounding = p_cache->CUDARounding(ctx);
   auto d_cost_rounding = p_cache->CUDACostRounding(ctx);
 
@@ -325,9 +341,10 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector<float> const
     d_y_sorted_idx = SortY(ctx, info, rank_idx, p_cache);
   }
 
-  KernelInputs args{ti_plus,        tj_minus, li,     lj,     d_gptr,     d_threads_group_ptr,
-                    rank_idx,       label,    predts, gpairs, d_rounding, d_cost_rounding.data(),
-                    d_y_sorted_idx, iter};
+  auto n_pairs = p_cache->Param().NumPair();
+  KernelInputs args{ti_plus,  tj_minus,       li,     lj,     d_gptr,     d_threads_group_ptr,
+                    rank_idx, label,          predts, gpairs, d_rounding, d_cost_rounding.data(),
+                    n_pairs,  d_y_sorted_idx, iter};
 
   // dispatch based on unbiased and truncation
   if (p_cache->Param().HasTruncation()) {
diff --git a/src/objective/lambdarank_obj.cuh b/src/objective/lambdarank_obj.cuh
@@ -66,6 +66,7 @@ struct KernelInputs {
   linalg::VectorView<GradientPair const> d_roundings;
   double const *d_cost_rounding;
 
+  ltr::position_t const n_pairs;
   common::Span<std::size_t const> d_y_sorted_idx;
 
   std::int32_t iter;
@@ -136,9 +137,10 @@ struct MakePairsOp {
     // The index pointing to the first element of the next bucket
     std::size_t right_bound = n_data - n_rights;
 
-    thrust::minstd_rand rng(args.iter);
+    std::uint32_t seed = args.iter * (static_cast<std::uint32_t>(args.d_group_ptr.size()) - 1) + g;
+    thrust::minstd_rand rng(seed);
     auto pair_idx = i;
-    rng.discard(sample_pair_idx * n_data + g + pair_idx);  // fixme
+    rng.discard(idx - args.d_threads_group_ptr[g]);  // idx within group
     thrust::uniform_int_distribution<std::size_t> dist(0, n_lefts + n_rights - 1);
     auto ridx = dist(rng);
     SPAN_CHECK(ridx < n_lefts + n_rights);
diff --git a/src/objective/lambdarank_obj.h b/src/objective/lambdarank_obj.h
@@ -227,15 +227,16 @@ void MakePairs(Context const* ctx, std::int32_t iter,
   ltr::position_t cnt = group_ptr[g + 1] - group_ptr[g];
 
   if (cache->Param().HasTruncation()) {
-    for (std::size_t i = 0; i < std::min(cnt, cache->Param().NumPair()); ++i) {
+    for (std::size_t i = 0, n = std::min(cnt, cache->Param().NumPair()); i < n; ++i) {
       for (std::size_t j = i + 1; j < cnt; ++j) {
         op(i, j);
       }
     }
   } else {
     CHECK_EQ(g_rank.size(), g_label.Size());
-    std::minstd_rand rnd(iter);
-    rnd.discard(g);  // fixme(jiamingy): honor the global seed
+
+    std::uint32_t seed = iter * (static_cast<std::uint32_t>(group_ptr.size()) - 1) + g;
+    std::minstd_rand rnd(seed);
     // sort label according to the rank list
     auto it = common::MakeIndexTransformIter(
         [&g_rank, &g_label](std::size_t idx) { return g_label(g_rank[idx]); });
@@ -244,7 +245,6 @@ void MakePairs(Context const* ctx, std::int32_t iter,
     // permutation iterator to get the original label
     auto rev_it = common::MakeIndexTransformIter(
         [&](std::size_t idx) { return g_label(g_rank[y_sorted_idx[idx]]); });
-
     for (std::size_t i = 0; i < cnt;) {
       std::size_t j = i + 1;
       // find the bucket boundary
diff --git a/tests/cpp/objective/test_lambdarank_obj.cc b/tests/cpp/objective/test_lambdarank_obj.cc
diff --git a/tests/cpp/objective/test_lambdarank_obj.cu b/tests/cpp/objective/test_lambdarank_obj.cu
diff --git a/tests/cpp/objective/test_lambdarank_obj.h b/tests/cpp/objective/test_lambdarank_obj.h

Original file line number	Diff line number	Diff line change
`@@ -30,6 +30,8 @@ XGBOOST_DEVICE __forceinline__ std::size_t ThreadsForMean(std::size_t group_size`
`30`	`30`	`std::size_t n_pairs) {`
`31`	`31`	`return group_size * n_pairs;`
`32`	`32`	`}`
	`33`	`+// Divides the number of threads in a group by the number of samples in that group, which`
	`34`	`+// gives the number of pairs per sample for pair-wise LTR with sampling.`
`33`	`35`	`XGBOOST_DEVICE __forceinline__ std::size_t PairsForGroup(std::size_t n_threads,`
`34`	`36`	`std::size_t group_size) {`
`35`	`37`	`return n_threads / group_size;`