Skip to content

Commit e8baf03

Browse files
authored
Initial implementation of the ordinal recoder. (dmlc#11098)
1 parent bdc5a26 commit e8baf03

File tree

9 files changed

+1311
-1
lines changed

9 files changed

+1311
-1
lines changed

src/common/device_helpers.cuh

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -453,10 +453,15 @@ xgboost::common::Span<T> ToSpan(VectorT &vec, IndexT offset = 0,
453453
}
454454

455455
// Overload of ToSpan taking a device vector together with an explicit offset and size.
// The diff below only swaps the parameter type from thrust::device_vector<T> to
// device_vector<T>; the body is unchanged context.
template <typename T>
456-
xgboost::common::Span<T> ToSpan(thrust::device_vector<T> &vec, size_t offset, size_t size) {
456+
xgboost::common::Span<T> ToSpan(device_vector<T> &vec, size_t offset, size_t size) {
457457
// Forwards all three arguments, unchanged, to a ToSpan call.
// NOTE(review): with argument types (device_vector<T>&, size_t, size_t) this call looks
// like it could resolve back to this very overload, since it is more specialized than
// the generic ToSpan(VectorT&, IndexT, IndexT) declared above, which would make the
// function recurse without end if it is ever instantiated. The generic overload is not
// fully visible here -- confirm which overload is selected.
return ToSpan(vec, offset, size);
458458
}
459459

460+
// Read-only overload: given a const device_vector<T>, returns a Span whose element type is
// std::add_const_t<T>, covering the whole vector.
// NOTE(review): std::add_const_t lives in <type_traits>; the include list of this header is
// not visible in this hunk -- confirm it is available.
template <typename T>
461+
xgboost::common::Span<std::add_const_t<T>> ToSpan(device_vector<T> const &vec) {
462+
// Span is built from the raw pointer behind vec.data() and the element count vec.size().
return {thrust::raw_pointer_cast(vec.data()), vec.size()};
463+
}
464+
460465
template <typename T>
461466
xgboost::common::Span<T> ToSpan(DeviceUVector<T> &vec) {
462467
return {vec.data(), vec.size()};

src/data/cat_container.cuh

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
/**
2+
* Copyright 2024, XGBoost Contributors
3+
*/
4+
#pragma once
5+
#include "../common/device_helpers.cuh" // for ToSpan
6+
#include "../common/device_vector.cuh" // for device_vector, XGBDeviceAllocator
7+
#include "../encoder/ordinal.h" // for CatCharT
8+
9+
namespace xgboost::cuda_impl {
10+
// Storage for a string array kept in two dh::device_vector members. The type is move-only:
// copy construction and copy assignment are deleted, the move operations are defaulted.
struct CatStrArray {
11+
// 32-bit integer offsets.
// NOTE(review): presumably offsets[i] .. offsets[i + 1] delimits the i-th string inside
// `values` -- confirm against enc::CatStrArrayView.
dh::device_vector<std::int32_t> offsets;
12+
// Character data, stored as enc::CatCharT elements.
dh::device_vector<enc::CatCharT> values;
13+
14+
CatStrArray() = default;
15+
// Copying is disabled.
CatStrArray(CatStrArray const& that) = delete;
16+
CatStrArray& operator=(CatStrArray const& that) = delete;
17+
18+
// Moving is allowed and uses the member-wise defaults.
CatStrArray(CatStrArray&& that) = default;
19+
CatStrArray& operator=(CatStrArray&& that) = default;
20+
21+
// Explicit conversion to enc::CatStrArrayView, built from spans over `offsets` and
// `values`. The spans refer to this object's vectors, so presumably the view must not
// outlive this object -- confirm.
[[nodiscard]] explicit operator enc::CatStrArrayView() const {
22+
return {dh::ToSpan(offsets), dh::ToSpan(values)};
23+
}
24+
// Returns whatever size() the CatStrArrayView constructed from this object reports.
[[nodiscard]] std::size_t size() const { // NOLINT
25+
return enc::CatStrArrayView(*this).size();
26+
}
27+
};
28+
29+
// Compile-time map from a view type to the storage type that backs it, exposed as the
// nested alias `Type`. The primary template is only declared, so only the view types
// specialized below provide a `Type`.
template <typename T>
30+
struct ViewToStorageImpl;
31+
32+
// enc::CatStrArrayView is backed by the CatStrArray of this namespace.
template <>
33+
struct ViewToStorageImpl<enc::CatStrArrayView> {
34+
using Type = CatStrArray;
35+
};
36+
37+
// A span over const T is backed by a dh::device_vector<T>.
template <typename T>
38+
struct ViewToStorageImpl<common::Span<T const>> {
39+
using Type = dh::device_vector<T>;
40+
};
41+
42+
// Lifts ViewToStorageImpl over a std::tuple of view types. The primary template is only
// declared; the std::tuple specialization below is the one that defines `Type`.
template <typename... Ts>
43+
struct ViewToStorage;
44+
45+
// For std::tuple<Ts...>, `Type` is a std::tuple whose elements are the storage types
// ViewToStorageImpl<Ts>::Type, in the same order as Ts.
template <typename... Ts>
46+
struct ViewToStorage<std::tuple<Ts...>> {
47+
using Type = std::tuple<typename ViewToStorageImpl<Ts>::Type...>;
48+
};
49+
50+
// Storage-type tuple obtained by applying ViewToStorage to enc::CatIndexViewTypes.
using CatIndexTypes = ViewToStorage<enc::CatIndexViewTypes>::Type;
51+
} // namespace xgboost::cuda_impl

src/data/cat_container.h

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/**
2+
* Copyright 2025, XGBoost Contributors
3+
*/
4+
#pragma once
5+
6+
#include <cstdint> // for int32_t, int8_t
7+
#include <tuple> // for tuple
8+
#include <vector> // for vector
9+
10+
#include "../encoder/ordinal.h" // for DictionaryView
11+
12+
namespace xgboost {
13+
namespace cpu_impl {
14+
// Storage for a string array kept in two std::vector members.
struct CatStrArray {
15+
// 32-bit integer offsets.
// NOTE(review): presumably offsets[i] .. offsets[i + 1] delimits the i-th string inside
// `values` -- confirm against enc::CatStrArrayView.
std::vector<std::int32_t> offsets;
16+
// Character data, stored as enc::CatCharT elements.
std::vector<enc::CatCharT> values;
17+
18+
// Explicit conversion to enc::CatStrArrayView, brace-initialized from `offsets` and
// `values`. Presumably the view refers to these vectors rather than copying them, so it
// must not outlive this object -- confirm.
[[nodiscard]] explicit operator enc::CatStrArrayView() const { return {offsets, values}; }
19+
// Returns whatever size() the CatStrArrayView constructed from this object reports.
[[nodiscard]] std::size_t size() const { // NOLINT
20+
return enc::CatStrArrayView(*this).size();
21+
}
22+
};
23+
24+
// Compile-time map from a view type to the storage type that backs it, exposed as the
// nested alias `Type`. The primary template is only declared, so only the view types
// specialized below provide a `Type`.
template <typename T>
25+
struct ViewToStorageImpl;
26+
27+
// enc::CatStrArrayView is backed by the CatStrArray of this namespace.
template <>
28+
struct ViewToStorageImpl<enc::CatStrArrayView> {
29+
using Type = CatStrArray;
30+
};
31+
32+
// A span over const T is backed by a std::vector<T>.
template <typename T>
33+
struct ViewToStorageImpl<common::Span<T const>> {
34+
using Type = std::vector<T>;
35+
};
36+
37+
// Lifts ViewToStorageImpl over a std::tuple of view types. The primary template is only
// declared; the std::tuple specialization below is the one that defines `Type`.
template <typename... Ts>
38+
struct ViewToStorage;
39+
40+
// For std::tuple<Ts...>, `Type` is a std::tuple whose elements are the storage types
// ViewToStorageImpl<Ts>::Type, in the same order as Ts.
template <typename... Ts>
41+
struct ViewToStorage<std::tuple<Ts...>> {
42+
using Type = std::tuple<typename ViewToStorageImpl<Ts>::Type...>;
43+
};
44+
45+
// Storage-type tuple obtained by applying ViewToStorage to enc::CatIndexViewTypes.
using CatIndexTypes = ViewToStorage<enc::CatIndexViewTypes>::Type;
46+
// NOTE(review): judging by its name, TupToVarT presumably turns the tuple's element types
// into a variant of those types -- confirm in encoder/ordinal.h.
using ColumnType = enc::cpu_impl::TupToVarT<CatIndexTypes>;
47+
} // namespace cpu_impl
48+
} // namespace xgboost

0 commit comments

Comments
 (0)