Skip to content

Commit 98ac9d3

Browse files
committed
Merging occupancy tuning changes from David Poliakoff.
The old Kokkos fork/branch from: davidp git@github.com:DavidPoliakoff/kokkos.git (fetch) was merged with the current Kokkos develop branch and tested with ArborX to confirm that autotuning occupancy for the DBSCAN benchmark worked. In tests on a system with a V100, the original benchmark, when iterated 600 times, took 119.064 seconds to run. During the tuning process (using simulated annealing), the runtime was 108.014 seconds. When using cached results, the runtime was 109.058 seconds. The converged occupancy value was 70. Here are the cached results from APEX autotuning: Input_1: name: kokkos.kernel_name id: 1 info.type: string info.category: categorical info.valueQuantity: unbounded info.candidates: unbounded num_bins: 0 Input_2: name: kokkos.kernel_type id: 2 info.type: string info.category: categorical info.valueQuantity: set info.candidates: [parallel_for,parallel_reduce,parallel_scan,parallel_copy] Output_3: name: ArborX::Experimental::HalfTraversal id: 3 info.type: int64 info.category: ratio info.valueQuantity: range info.candidates: lower: 5 upper: 100 step: 5 open upper: 0 open lower: 0 Context_0: Name: "[2:parallel_for,1:ArborX::Experimental::HalfTraversal,tree_node:default]" Converged: true Results: NumVars: 1 id: 3 value: 70 In manual experiments, the ArborX team determined that the optimal occupancy for this example was between 40 and 90, which gave a 10% improvement over the baseline default of 100. See arborx/ArborX#815 for details. One deviation from the branch that David had written: the occupancy range is [5-100], with a step size of 5. The original implementation in Kokkos used [1-100] with a step size of 1.
1 parent c90a9c6 commit 98ac9d3

File tree

5 files changed

+696
-13
lines changed

5 files changed

+696
-13
lines changed

core/src/Kokkos_Parallel.hpp

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -134,9 +134,10 @@ inline void parallel_for(const std::string& str, const ExecPolicy& policy,
134134
const FunctorType& functor) {
135135
uint64_t kpID = 0;
136136

137-
ExecPolicy inner_policy = policy;
138-
Kokkos::Tools::Impl::begin_parallel_for(inner_policy, functor, str, kpID);
139-
137+
/** Request a tuned policy from the tools subsystem */
138+
const auto& response =
139+
Kokkos::Tools::Impl::begin_parallel_for(policy, functor, str, kpID);
140+
const auto& inner_policy = response.policy;
140141
Kokkos::Impl::shared_allocation_tracking_disable();
141142
Impl::ParallelFor<FunctorType, ExecPolicy> closure(functor, inner_policy);
142143
Kokkos::Impl::shared_allocation_tracking_enable();
@@ -348,9 +349,11 @@ template <class ExecutionPolicy, class FunctorType,
348349
std::enable_if_t<is_execution_policy<ExecutionPolicy>::value>>
349350
inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy,
350351
const FunctorType& functor) {
351-
uint64_t kpID = 0;
352-
ExecutionPolicy inner_policy = policy;
353-
Kokkos::Tools::Impl::begin_parallel_scan(inner_policy, functor, str, kpID);
352+
uint64_t kpID = 0;
353+
/** Request a tuned policy from the tools subsystem */
354+
const auto& response =
355+
Kokkos::Tools::Impl::begin_parallel_scan(policy, functor, str, kpID);
356+
const auto& inner_policy = response.policy;
354357

355358
Kokkos::Impl::shared_allocation_tracking_disable();
356359
Impl::ParallelScan<FunctorType, ExecutionPolicy> closure(functor,

core/src/Kokkos_Parallel_Reduce.hpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1492,9 +1492,11 @@ struct ParallelReduceAdaptor {
14921492
using PassedReducerType = typename return_value_adapter::reducer_type;
14931493
uint64_t kpID = 0;
14941494

1495-
PolicyType inner_policy = policy;
1496-
Kokkos::Tools::Impl::begin_parallel_reduce<PassedReducerType>(
1497-
inner_policy, functor, label, kpID);
1495+
/** Request a tuned policy from the tools subsystem */
1496+
auto response = Kokkos::Tools::Impl::begin_parallel_reduce<
1497+
typename return_value_adapter::reducer_type>(policy, functor, label,
1498+
kpID);
1499+
auto& inner_policy = response.policy;
14981500

14991501
using ReducerSelector =
15001502
Kokkos::Impl::if_c<std::is_same<InvalidType, PassedReducerType>::value,

core/src/Kokkos_Tuners.hpp

Lines changed: 120 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ VariableValue make_variable_value(size_t, int64_t);
5252
VariableValue make_variable_value(size_t, double);
5353
SetOrRange make_candidate_range(double lower, double upper, double step,
5454
bool openLower, bool openUpper);
55+
SetOrRange make_candidate_range(int64_t lower, int64_t upper, int64_t step,
56+
bool openLower, bool openUpper);
5557
size_t get_new_context_id();
5658
void begin_context(size_t context_id);
5759
void end_context(size_t context_id);
@@ -419,10 +421,11 @@ class TeamSizeTuner : public ExtendableTunerMixin<TeamSizeTuner> {
419421
template <typename ViableConfigurationCalculator, typename Functor,
420422
typename TagType, typename... Properties>
421423
TeamSizeTuner(const std::string& name,
422-
Kokkos::TeamPolicy<Properties...>& policy,
424+
const Kokkos::TeamPolicy<Properties...>& policy_in,
423425
const Functor& functor, const TagType& tag,
424426
ViableConfigurationCalculator calc) {
425427
using PolicyType = Kokkos::TeamPolicy<Properties...>;
428+
PolicyType policy(policy_in);
426429
auto initial_vector_length = policy.impl_vector_length();
427430
if (initial_vector_length < 1) {
428431
policy.impl_set_vector_length(1);
@@ -504,7 +507,8 @@ class TeamSizeTuner : public ExtendableTunerMixin<TeamSizeTuner> {
504507
}
505508

506509
template <typename... Properties>
507-
void tune(Kokkos::TeamPolicy<Properties...>& policy) {
510+
auto tune(const Kokkos::TeamPolicy<Properties...>& policy_in) {
511+
Kokkos::TeamPolicy<Properties...> policy(policy_in);
508512
if (Kokkos::Tools::Experimental::have_tuning_tool()) {
509513
auto configuration = tuner.begin();
510514
auto team_size = std::get<1>(configuration);
@@ -514,6 +518,117 @@ class TeamSizeTuner : public ExtendableTunerMixin<TeamSizeTuner> {
514518
policy.impl_set_vector_length(vector_length);
515519
}
516520
}
521+
return policy;
522+
}
523+
void end() {
524+
if (Kokkos::Tools::Experimental::have_tuning_tool()) {
525+
tuner.end();
526+
}
527+
}
528+
529+
TunerType get_tuner() const { return tuner; }
530+
};
531+
namespace Impl {
532+
template <class T>
533+
struct tuning_type_for;
534+
535+
template <>
536+
struct tuning_type_for<double> {
537+
static constexpr Kokkos::Tools::Experimental::ValueType value =
538+
Kokkos::Tools::Experimental::ValueType::kokkos_value_double;
539+
static double get(
540+
const Kokkos::Tools::Experimental::VariableValue& value_struct) {
541+
return value_struct.value.double_value;
542+
}
543+
};
544+
template <>
545+
struct tuning_type_for<int64_t> {
546+
static constexpr Kokkos::Tools::Experimental::ValueType value =
547+
Kokkos::Tools::Experimental::ValueType::kokkos_value_int64;
548+
static int64_t get(
549+
const Kokkos::Tools::Experimental::VariableValue& value_struct) {
550+
return value_struct.value.int_value;
551+
}
552+
};
553+
} // namespace Impl
554+
template <class Bound>
555+
class SingleDimensionalRangeTuner {
556+
size_t id;
557+
size_t context;
558+
using tuning_util = Impl::tuning_type_for<Bound>;
559+
560+
Bound default_value;
561+
562+
public:
563+
SingleDimensionalRangeTuner() = default;
564+
SingleDimensionalRangeTuner(
565+
const std::string& name,
566+
Kokkos::Tools::Experimental::StatisticalCategory category,
567+
Bound default_val, Bound lower, Bound upper, Bound step = (Bound)0) {
568+
default_value = default_val;
569+
Kokkos::Tools::Experimental::VariableInfo info;
570+
info.category = category;
571+
info.candidates = make_candidate_range(
572+
static_cast<Bound>(lower), static_cast<Bound>(upper),
573+
static_cast<Bound>(step), false, false);
574+
info.valueQuantity =
575+
Kokkos::Tools::Experimental::CandidateValueType::kokkos_value_range;
576+
info.type = tuning_util::value;
577+
id = Kokkos::Tools::Experimental::declare_output_type(name, info);
578+
}
579+
580+
Bound begin() {
581+
context = Kokkos::Tools::Experimental::get_new_context_id();
582+
Kokkos::Tools::Experimental::begin_context(context);
583+
auto tuned_value =
584+
Kokkos::Tools::Experimental::make_variable_value(id, default_value);
585+
Kokkos::Tools::Experimental::request_output_values(context, 1,
586+
&tuned_value);
587+
return tuning_util::get(tuned_value);
588+
}
589+
590+
void end() { Kokkos::Tools::Experimental::end_context(context); }
591+
592+
template <typename Functor>
593+
void with_tuned_value(Functor& func) {
594+
func(begin());
595+
end();
596+
}
597+
};
598+
599+
class RangePolicyOccupancyTuner {
600+
private:
601+
using TunerType = SingleDimensionalRangeTuner<int64_t>;
602+
TunerType tuner;
603+
604+
public:
605+
RangePolicyOccupancyTuner() = default;
606+
RangePolicyOccupancyTuner& operator=(const RangePolicyOccupancyTuner& other) =
607+
default;
608+
RangePolicyOccupancyTuner(const RangePolicyOccupancyTuner& other) = default;
609+
RangePolicyOccupancyTuner& operator=(RangePolicyOccupancyTuner&& other) =
610+
default;
611+
RangePolicyOccupancyTuner(RangePolicyOccupancyTuner&& other) = default;
612+
template <typename ViableConfigurationCalculator, typename Functor,
613+
typename TagType, typename... Properties>
614+
RangePolicyOccupancyTuner(const std::string& name,
615+
const Kokkos::RangePolicy<Properties...>&,
616+
const Functor&, const TagType&,
617+
ViableConfigurationCalculator)
618+
: tuner(TunerType(name,
619+
Kokkos::Tools::Experimental::StatisticalCategory::
620+
kokkos_value_ratio,
621+
100, 5, 100, 5)) {}
622+
623+
template <typename... Properties>
624+
auto tune(const Kokkos::RangePolicy<Properties...>& policy_in) {
625+
Kokkos::RangePolicy<Properties...> policy(policy_in);
626+
if (Kokkos::Tools::Experimental::have_tuning_tool()) {
627+
auto occupancy = tuner.begin();
628+
policy.impl_set_desired_occupancy(
629+
Kokkos::Experimental::DesiredOccupancy{static_cast<int>(occupancy)});
630+
}
631+
return policy;
517632
}
518633
void end() {
519634
if (Kokkos::Tools::Experimental::have_tuning_tool()) {
@@ -577,11 +692,13 @@ struct MDRangeTuner : public ExtendableTunerMixin<MDRangeTuner<MDRangeRank>> {
577692
policy.impl_change_tile_size({std::get<Indices>(tuple)...});
578693
}
579694
template <typename... Properties>
580-
void tune(Kokkos::MDRangePolicy<Properties...>& policy) {
695+
auto tune(const Kokkos::MDRangePolicy<Properties...>& policy_in) {
696+
Kokkos::MDRangePolicy<Properties...> policy(policy_in);
581697
if (Kokkos::Tools::Experimental::have_tuning_tool()) {
582698
auto configuration = tuner.begin();
583699
set_policy_tile(policy, configuration, std::make_index_sequence<rank>{});
584700
}
701+
return policy;
585702
}
586703
void end() {
587704
if (Kokkos::Tools::Experimental::have_tuning_tool()) {

0 commit comments

Comments
 (0)