|
| 1 | +/* Copyright 2024 The OpenXLA Authors. |
| 2 | +
|
| 3 | +Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +you may not use this file except in compliance with the License. |
| 5 | +You may obtain a copy of the License at |
| 6 | +
|
| 7 | + http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +
|
| 9 | +Unless required by applicable law or agreed to in writing, software |
| 10 | +distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +See the License for the specific language governing permissions and |
| 13 | +limitations under the License. |
| 14 | +==============================================================================*/ |
| 15 | + |
| 16 | +#include "xla/service/gpu/model/sol_gpu_cost_model.h" |
| 17 | + |
| 18 | +#include <cmath> |
| 19 | +#include <cstdint> |
| 20 | +#include <string> |
| 21 | + |
| 22 | +#include "absl/log/check.h" |
| 23 | +#include "absl/log/log.h" |
| 24 | +#include "absl/numeric/bits.h" |
| 25 | +#include "absl/strings/numbers.h" |
| 26 | +#include "absl/strings/string_view.h" |
| 27 | +#include "absl/time/time.h" |
| 28 | +#include "xla/hlo/ir/hlo_module.h" |
| 29 | + |
| 30 | +namespace xla { |
| 31 | +namespace gpu { |
| 32 | +namespace { |
// Constants for NCCL SoL model.
// Fractional inflation of wire time to account for per-packet protocol
// headers (2.5%).
constexpr double kHeaderOverhead = 0.025;
// Option keys recognized in xla_gpu_analytical_latency_estimator_options;
// parsed by GetConfig below.
constexpr absl::string_view kNcclOpLaunchUs = "nccl_op_launch_us";
constexpr absl::string_view kNicSpeedGbps = "nic_speed_gbps";
constexpr absl::string_view kChunkPrepUs = "chunk_prep_us";
constexpr absl::string_view kRttUs = "rtt_us";
constexpr absl::string_view kGpusPerNode = "gpus_per_node";
constexpr absl::string_view kChunkSizeBytes = "chunk_size_bytes";
| 41 | + |
| 42 | +// Returns the number of communicators in the mask. |
| 43 | +// For example, if the mask is 0x0, this function returns 1. If the mask is 0x7, |
| 44 | +// this function returns 8. |
| 45 | +int NumCommunicators(const absl::string_view mask) { |
| 46 | + // Assuming the mask is a hexadecimal number |
| 47 | + uint64_t mask_value = std::stoul(std::string(mask), nullptr, 16); |
| 48 | + int bit_count = absl::popcount(mask_value); // Count set bits |
| 49 | + return static_cast<int>(std::pow(2, bit_count)); |
| 50 | +} |
| 51 | + |
| 52 | +// Returns the number of rounds for the given collective type. |
| 53 | +int NumRounds(const SolGPUCostModel::CollectiveType& coll_type) { |
| 54 | + // AllReduce requires ReduceScatter and AllGather, so it has 2 rounds. |
| 55 | + return coll_type == SolGPUCostModel::CollectiveType::kAllReduce ? 2 : 1; |
| 56 | +} |
| 57 | + |
| 58 | +} // namespace |
| 59 | + |
| 60 | +SolGPUCostModel::Config GetConfig(const HloModule* module) { |
| 61 | + SolGPUCostModel::Config config; |
| 62 | + const auto& extra_options = |
| 63 | + module->config() |
| 64 | + .debug_options() |
| 65 | + .xla_gpu_analytical_latency_estimator_options(); |
| 66 | + for (const auto& [option_name, option_value] : extra_options) { |
| 67 | + int64_t value; |
| 68 | + double value_d; |
| 69 | + VLOG(2) << "[SoL] option: " << option_name << " is " << option_value; |
| 70 | + if (option_name == kNcclOpLaunchUs && |
| 71 | + absl::SimpleAtoi(option_value, &value)) { |
| 72 | + config.nccl_op_launch_time = absl::Microseconds(value); |
| 73 | + } else if (option_name == kNicSpeedGbps && |
| 74 | + absl::SimpleAtod(option_value, &value_d)) { |
| 75 | + config.nic_speed_gbps = value_d; |
| 76 | + } else if (option_name == kChunkPrepUs && |
| 77 | + absl::SimpleAtoi(option_value, &value)) { |
| 78 | + config.chunk_prep_time = absl::Microseconds(value); |
| 79 | + } else if (option_name == kRttUs && |
| 80 | + absl::SimpleAtoi(option_value, &value)) { |
| 81 | + config.rtt = absl::Microseconds(value); |
| 82 | + } else if (option_name == kGpusPerNode && |
| 83 | + absl::SimpleAtoi(option_value, &value)) { |
| 84 | + config.gpus_per_node = value; |
| 85 | + } else if (option_name == kChunkSizeBytes && |
| 86 | + absl::SimpleAtoi(option_value, &value)) { |
| 87 | + config.chunk_size_bytes = value; |
| 88 | + } |
| 89 | + } |
| 90 | + return config; |
| 91 | +} |
| 92 | + |
// Constructs the model from a system configuration (typically produced by
// GetConfig) and logs the effective parameters at VLOG level 2 for
// debugging.
SolGPUCostModel::SolGPUCostModel(const Config& sys_config)
    : xla_flag_config_(sys_config) {
  VLOG(2) << "[SoL] NIC speed: " << xla_flag_config_.nic_speed_gbps;
  VLOG(2) << "[SoL] RTT: " << xla_flag_config_.rtt;
  VLOG(2) << "[SoL] Chunk preparation time: "
          << xla_flag_config_.chunk_prep_time;
  VLOG(2) << "[SoL] NCCL op launch time: "
          << xla_flag_config_.nccl_op_launch_time;
  VLOG(2) << "[SoL] GPUs per node: " << xla_flag_config_.gpus_per_node;
}
| 103 | + |
| 104 | +// This is a insignificant term, and we are making it consistent |
| 105 | +// with the existing formula. |
| 106 | +absl::Duration SolGPUCostModel::ChunkPrepLatency( |
| 107 | + const int64_t per_gpu_msg_size_bytes) const { |
| 108 | + return std::ceil(static_cast<double>(per_gpu_msg_size_bytes) / |
| 109 | + xla_flag_config_.chunk_size_bytes) * |
| 110 | + xla_flag_config_.chunk_prep_time; |
| 111 | +} |
| 112 | + |
| 113 | +absl::Duration SolGPUCostModel::TransferDuration( |
| 114 | + const int64_t per_gpu_msg_size_bytes) const { |
| 115 | + // x1e6 to comvert secs to microseconds; |
| 116 | + // x1024*1024 *1024 to convert Gbytes/sec to bytes/sec |
| 117 | + const long double ret = |
| 118 | + (1e6 * static_cast<long double>(per_gpu_msg_size_bytes)) / |
| 119 | + (std::pow(1024.0, 3) * xla_flag_config_.nic_speed_gbps); |
| 120 | + return absl::Microseconds(ret * (1 + kHeaderOverhead)); |
| 121 | +} |
| 122 | + |
| 123 | +absl::Duration SolGPUCostModel::RingLatency( |
| 124 | + const int64_t buff_size_bytes, const int num_nodes, |
| 125 | + const CollectiveType& coll_type, const absl::string_view mask) const { |
| 126 | + const int num_gpus = NumGpusPerComm(num_nodes, coll_type, mask); |
| 127 | + |
| 128 | + int64_t per_gpu_msg_size_bytes; |
| 129 | + if (coll_type == CollectiveType::kSendRecv) { |
| 130 | + per_gpu_msg_size_bytes = buff_size_bytes; |
| 131 | + } else { |
| 132 | + per_gpu_msg_size_bytes = buff_size_bytes / num_gpus; |
| 133 | + } |
| 134 | + |
| 135 | + // This is the number of GPUs per communicator per node. We assume that each |
| 136 | + // GPU has a NIC, and this is also the number of NICs per communicator per |
| 137 | + // node. |
| 138 | + // Note that this happens to be correct value (i.e. 1) for SendRecv. |
| 139 | + int num_gpus_per_node = num_gpus / num_nodes; |
| 140 | + |
| 141 | + // In each channel, consider one GPU next to the Ethernet link. Below is the |
| 142 | + // sum of 3 time costs for each piece of data of size |
| 143 | + // `per_gpu_msg_size_bytes` |
| 144 | + // |
| 145 | + // 1. transfer duration defined by the NIC bandwidth, |
| 146 | + // 2. chunk preparation latency, and |
| 147 | + // 3. RTT |
| 148 | + // |
| 149 | + // then followed by two factors: |
| 150 | + // |
| 151 | + // 1. Multiply by `num_gpus - 1`, as `num_gpus - 1` pieces of data will be |
| 152 | + // sent over the link in AllGather. |
| 153 | + // 2. Divide by `num_gpus_per_node` as there are `num_gpus_per_node` NICs |
| 154 | + // and |
| 155 | + // GPUs in each node for parallelism. |
| 156 | + // |
| 157 | + // Better estimates of terms like this will come in future versions |
| 158 | + // of the SoL model. |
| 159 | + absl::Duration ret = TransferDuration(per_gpu_msg_size_bytes) + |
| 160 | + ChunkPrepLatency(per_gpu_msg_size_bytes) + |
| 161 | + xla_flag_config_.rtt; |
| 162 | + ret *= (num_gpus - 1.0) / static_cast<long double>(num_gpus_per_node); |
| 163 | + // Multiply by the number of rounds, which is different for AllReduce. |
| 164 | + ret = ret * NumRounds(coll_type); |
| 165 | + |
| 166 | + // Time to initiate the collective. |
| 167 | + return ret + xla_flag_config_.nccl_op_launch_time; |
| 168 | +} |
| 169 | + |
| 170 | +// Helper functions |
| 171 | +int SolGPUCostModel::NumGpusPerComm(int num_nodes, |
| 172 | + const CollectiveType& coll_type, |
| 173 | + const absl::string_view mask) const { |
| 174 | + if (coll_type == CollectiveType::kSendRecv) { |
| 175 | + return 2; |
| 176 | + } |
| 177 | + int num_comms = NumCommunicators(mask); |
| 178 | + CHECK_EQ(xla_flag_config_.gpus_per_node % num_comms, 0) |
| 179 | + << "GPU_PER_NODE must be divisible by the number of communicators. " |
| 180 | + "GPU_PER_NODE: " |
| 181 | + << xla_flag_config_.gpus_per_node |
| 182 | + << " Number of communicators: " << num_comms |
| 183 | + << ". Adjust the number of GPUs per node with the flag " |
| 184 | + "gpus_per_node in xla_gpu_analytical_latency_estimator_options."; |
| 185 | + return num_nodes * xla_flag_config_.gpus_per_node / num_comms; |
| 186 | +} |
| 187 | + |
| 188 | +} // namespace gpu |
| 189 | +} // namespace xla |
0 commit comments