diff --git a/csrc/dispatch.h b/csrc/dispatch.h index f6614d5df00..05e2e98c569 100644 --- a/csrc/dispatch.h +++ b/csrc/dispatch.h @@ -181,6 +181,7 @@ class Val; f(ShareMemHandles); \ f(HirAliasSelect); \ f(ShardByStream); \ + f(Allocate); \ f(Deallocate); \ f(ForLoop); \ f(SymmetricContiguousView); diff --git a/csrc/host_ir/allocate_and_deallocate.cpp b/csrc/host_ir/allocate_and_deallocate.cpp index e0a14abf18b..4c7924baa90 100644 --- a/csrc/host_ir/allocate_and_deallocate.cpp +++ b/csrc/host_ir/allocate_and_deallocate.cpp @@ -17,6 +17,7 @@ #include #include +#include "host_ir/ir.h" #include "ir/builder.h" #include "ir/utils.h" @@ -215,7 +216,7 @@ void insertAllocations(hir::HostIrContainer& hic) { if (needsOutputPreallocation(e)) { auto* allocate = - IrBuilder::create<kir::Allocate>(out, out->getMemoryType()); + IrBuilder::create<hir::Allocate>(out, out->getMemoryType()); node->scope()->insert(node->iterator(), allocate); } @@ -255,13 +256,6 @@ class LowestCommonAncestor { NVF_ERROR(depth_.insert({node, current_depth}).second); Expr* e = node->getExpr(); - // Temporary special-case for kir::Allocate. We will switch - // inserting a new `hir::Allocate` in host IR lowering where - // the allocated `tv` will be the expr input. 
- if (auto* alloc = dynamic_cast<kir::Allocate*>(e)) { - auto* tv = alloc->buffer()->as<TensorView>(); - lca_[tv] = findLca(lca_[tv], node); - } for (auto* tv : ir_utils::filterByType<TensorView>(e->inputs())) { lca_[tv] = findLca(lca_[tv], node); } @@ -337,9 +331,6 @@ void checkMemoryLeak(hir::HostIrContainer& hic) { /*pre_fn=*/ [&](const Node* node) { Expr* e = node->getExpr(); - if (auto* alloc = dynamic_cast<kir::Allocate*>(e)) { - allocated.insert(alloc->buffer()->as<TensorView>()); - } for (auto* tv : ir_utils::filterByType<TensorView>(e->inputs())) { allocated.insert(tv); } diff --git a/csrc/host_ir/evaluator.cpp b/csrc/host_ir/evaluator.cpp index d0764071708..020a4ddb3a1 100644 --- a/csrc/host_ir/evaluator.cpp +++ b/csrc/host_ir/evaluator.cpp @@ -68,8 +68,8 @@ KernelArgumentHolder HostIrEvaluator::runWithInputs( FUSER_PERF_SCOPE("HostIrEvaluator::runWithInputs"); expr_evaluator_ = ExpressionEvaluator(); expr_evaluator_.bind("numberOfStreams", params_.number_of_streams); - NVF_ERROR(args.getCacheId().has_value()); - expr_evaluator_.bind("cacheId", static_cast<int64_t>(*args.getCacheId())); + auto cache_id = valueOrError(args.getCacheId()); + expr_evaluator_.bind("cacheId", static_cast<int64_t>(cache_id)); NVF_ERROR_EQ(std::ssize(container_->inputs()), args.size()); for (auto&& [in_val, arg] : zip(container_->inputs(), args)) { @@ -329,7 +329,9 @@ void HostIrEvaluator::handle(Communication* communication) { CommunicatorBackend backend_type = communication->backend(); if (backend_type == CommunicatorBackend::kCuda) { const auto current_stream = static_cast<CUstream>( - c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); + c10::cuda::getCurrentCUDAStream( + static_cast<c10::DeviceIndex>(my_local_device_index_)) + .stream()); NVF_ERROR( communication->type() == CommunicationType::Broadcast || communication->type() == CommunicationType::Allgather, @@ -337,8 +339,8 @@ communication->type()); int64_t root_val = expr_evaluator_.evaluate(communication->root()).as<int64_t>(); - SymmetricMemoryHandle* multicast_handle = - 
multicast_handle_cache_.get({output_tensor, communication, root_val}); + SymmetricMemoryHandle* multicast_handle = multicast_handle_cache_.get( + {.buffer = output_tensor, .expr = communication, .root = root_val}); postWithCudaBackend( communication, input_tensor, @@ -369,7 +371,9 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { if (backend_type == CommunicatorBackend::kCuda) { const P2pIpcHandle& p2p_ipc_handle = ipc_handle_cache_.get(communication); const auto current_stream = static_cast( - c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); + c10::cuda::getCurrentCUDAStream( + static_cast(my_local_device_index_)) + .stream()); auto count = buffer.numel() * buffer.element_size(); if (communication->type() == P2PCommunicationType::RECV) { recvPost(p2p_ipc_handle, count, current_stream); @@ -444,7 +448,9 @@ void HostIrEvaluator::handle(Wait* wait) { auto* p2p_comm = dynamic_cast(expr); auto* communication = dynamic_cast(expr); const auto current_stream = static_cast( - c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); + c10::cuda::getCurrentCUDAStream( + static_cast(my_local_device_index_)) + .stream()); if (p2p_comm && p2p_comm->backend() == CommunicatorBackend::kCuda) { const P2pIpcHandle& ipc_handles = ipc_handle_cache_.get(p2p_comm); if (p2p_comm->type() == P2PCommunicationType::SEND) { @@ -463,8 +469,8 @@ void HostIrEvaluator::handle(Wait* wait) { at::Tensor output_tensor = getKnownTensorOrUndefined(communication->out()); int64_t root_val = expr_evaluator_.evaluate(communication->root()).as(); - SymmetricMemoryHandle* multicast_handle = - multicast_handle_cache_.get({output_tensor, communication, root_val}); + SymmetricMemoryHandle* multicast_handle = multicast_handle_cache_.get( + {.buffer = output_tensor, .expr = communication, .root = root_val}); waitWithCudaBackend( communication, multicast_handle, current_stream, root_val); } else { @@ -714,6 +720,28 @@ void HostIrEvaluator::handle(kir::Allocate* 
allocate) { expr_evaluator_.bind(tv, tensor); } +void HostIrEvaluator::handle(hir::Allocate* allocate) { + FUSER_PERF_SCOPE("HostIrEvaluator::handle(Allocate)"); + TensorView* tv = allocate->in(); + + GlobalBufferInfo info = + getBufferInfos(expr_evaluator_, PrimDataType::Int, {tv}).at(0); + c10::Device device = + communicator_ ? communicator_->device() : at::Device("cuda:0"); + at::Tensor tensor = at::native::empty_strided_cuda( + info.shape_info.logical_sizes, + info.shape_info.logical_strides, + info.type, + c10::nullopt, + device, + c10::nullopt); + + if (allocate->zeroInit()) { + tensor.zero_(); + } + expr_evaluator_.bind(tv, tensor); +} + void HostIrEvaluator::handle(HirAliasSelect* hir_alias_select) { auto indexed_id = hir_alias_select->in()->getLogicalDomain().at(hir_alias_select->axis()); @@ -848,10 +876,8 @@ void HostIrEvaluator::handle(ShardByStream* shard) { const std::vector& allocation_domain = out_tv->getMaybeAllocationDomain(); - auto i = std::find_if( - allocation_domain.begin(), - allocation_domain.end(), - std::mem_fn(&IterDomain::isStream)); + auto i = std::ranges::find_if( + allocation_domain, std::mem_fn(&IterDomain::isStream)); NVF_ERROR( i != allocation_domain.end(), "Stream axis not found in allocation domain: ", @@ -915,8 +941,9 @@ void HostIrEvaluator::handle( at::Tensor in_tensor = getKnownConcreteValue(in_tv).as(); // Get or create SymMemForContiguousView from the cache - SymMemForContiguousView* handle = static_cast( - multicast_handle_cache_.get({in_tensor, symmetric_contiguous_view})); + SymMemForContiguousView* handle = + static_cast(multicast_handle_cache_.get( + {.buffer = in_tensor, .expr = symmetric_contiguous_view})); // Bind the symmetric_contiguous_viewed tensor to the output expr_evaluator_.bind(out_tv, handle->tensor()); diff --git a/csrc/host_ir/evaluator.h b/csrc/host_ir/evaluator.h index 4a1929ba1bd..f2c26c15797 100644 --- a/csrc/host_ir/evaluator.h +++ b/csrc/host_ir/evaluator.h @@ -109,6 +109,7 @@ class NVF_API 
HostIrEvaluator final : public OptOutDispatch { void handle(MatmulOp*) override; void handle(LinearOp*) override; void handle(kir::Allocate*) override; + void handle(hir::Allocate*) override; void handle(LoadStoreOp*) override; void handle(BinaryOp*) override; void handle(ReductionOp*) override; @@ -138,7 +139,7 @@ class NVF_API HostIrEvaluator final : public OptOutDispatch { using StreamKey = std::variant; std::unordered_map streams_; std::unordered_map> works_; - const int64_t my_local_device_index_; + int64_t my_local_device_index_; IpcHandleCache ipc_handle_cache_; SymmetricMemoryHandleCache multicast_handle_cache_; // Allocation cache diff --git a/csrc/host_ir/ir.cpp b/csrc/host_ir/ir.cpp index b7a43650797..1519c185c9c 100644 --- a/csrc/host_ir/ir.cpp +++ b/csrc/host_ir/ir.cpp @@ -109,7 +109,7 @@ std::string PostOnStream::toString(int indent_size) const { std::for_each(outputs().begin(), outputs().end(), [&ss](auto output) { ss << output->toString(0) << ", "; }); - ss << "})" << std::endl; + ss << "})\n"; return ss.str(); } @@ -149,13 +149,13 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(LaunchKernel) std::string LaunchKernel::toString(int indent_size) const { std::stringstream ss; - indent(ss, indent_size) << "LaunchKernel(" << std::endl; - indent(ss, indent_size + 1) << "Group ID: " << groupId() << "," << std::endl; + indent(ss, indent_size) << "LaunchKernel(\n"; + indent(ss, indent_size + 1) << "Group ID: " << groupId() << ",\n"; indent(ss, indent_size + 1) - << "Inputs: {" << toDelimitedString(inputs()) << "}," << std::endl; + << "Inputs: {" << toDelimitedString(inputs()) << "},\n"; indent(ss, indent_size + 1) - << "Outputs: {" << toDelimitedString(outputs()) << "}," << std::endl; - indent(ss, indent_size) << ")" << std::endl; + << "Outputs: {" << toDelimitedString(outputs()) << "},\n"; + indent(ss, indent_size) << ")\n"; return ss.str(); } @@ -172,9 +172,9 @@ TensorView* Deallocate::buffer() const { std::string Deallocate::toString(int indent_size) const { 
std::stringstream ss; - indent(ss, indent_size) << "Deallocate {" << std::endl; - ss << buffer()->toString(indent_size + 1) << std::endl; - indent(ss, indent_size) << "}" << std::endl; + indent(ss, indent_size) << "Deallocate {\n"; + ss << buffer()->toString(indent_size + 1) << '\n'; + indent(ss, indent_size) << "}\n"; return ss.str(); } @@ -230,8 +230,8 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(SetCurrentStream) std::string SetCurrentStream::toString(int indent_size) const { std::stringstream ss; - indent(ss, indent_size) << "SetCurrentStream(" << stream()->toString() << ")" - << std::endl; + indent(ss, indent_size) << "SetCurrentStream(" << stream()->toString() + << ")\n"; return ss.str(); } @@ -246,7 +246,7 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(GetCurrentStream) std::string GetCurrentStream::toString(int indent_size) const { std::stringstream ss; indent(ss, indent_size) << stream()->toInlineString() - << " = GetCurrentStream()" << std::endl; + << " = GetCurrentStream()\n"; return ss.str(); } @@ -319,7 +319,7 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(StartCoalescing) std::string StartCoalescing::toString(int indent_size) const { std::stringstream ss; - indent(ss, indent_size) << "StartCoalescing" << std::endl; + indent(ss, indent_size) << "StartCoalescing\n"; return ss.str(); } @@ -339,7 +339,7 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(EndCoalescing) std::string EndCoalescing::toString(int indent_size) const { std::stringstream ss; - indent(ss, indent_size) << "EndCoalescing " << name() << std::endl; + indent(ss, indent_size) << "EndCoalescing " << name() << '\n'; return ss.str(); } @@ -367,7 +367,7 @@ std::string ShareMemHandles::toString(int indent_size) const { for (auto communication : communications()) { ss << communication->toInlineString() << ", "; } - ss << std::endl; + ss << '\n'; return ss.str(); } @@ -440,7 +440,7 @@ std::string ShardByStream::toString(int indent_size) const { indent(ss, indent_size) << out()->toString() << " = ShardByStream(" << in()->toString() << ", 
stream_index=" << stream_index()->toString() - << ")" << std::endl; + << ")\n"; return ss.str(); } @@ -462,7 +462,7 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(SymmetricContiguousView) std::string SymmetricContiguousView::toString(int indent_size) const { std::stringstream ss; indent(ss, indent_size) << out()->toString() << " = SymmetricContiguousView(" - << in()->toString() << ")" << std::endl; + << in()->toString() << ")\n"; return ss.str(); } @@ -484,7 +484,7 @@ std::string ForLoop::toString(int indent_size) const { std::stringstream ss; indent(ss, indent_size) << "FOR " << index()->toString() << " from " << start()->toInlineString() << " to " - << stop()->toInlineString() << ":" << std::endl + << stop()->toInlineString() << ":\n" << body().toString(indent_size + 1); return ss.str(); } @@ -504,4 +504,38 @@ std::string ForLoop::toInlineString(int indent_size) const { index, iter_domain->start(), iter_domain->stop()); } +Allocate::Allocate( + IrBuilderPasskey passkey, + TensorView* in, + MemoryType memory_type, + bool zero_init) + : Expr(passkey) { + NVF_ERROR(passkey.ir_container_ != nullptr); + NVF_ERROR(passkey.ir_container_->isA()); + + addInput(in); + addDataAttribute(memory_type); + addDataAttribute(zero_init); +} + +NVFUSER_DEFINE_CLONE_AND_CREATE(Allocate) + +std::string Allocate::toString(int indent_size) const { + std::stringstream ss; + indent(ss, indent_size) << in()->toString() << " = ALLOCATE(" + << "mem_type=" << memoryType() << ", " + << "zero_init=" << std::boolalpha << zeroInit() + << ")\n"; + return ss.str(); +} + +std::string Allocate::toInlineString(int indent_size) const { + std::stringstream ss; + indent(ss, indent_size) << in()->toInlineString() << " = ALLOCATE(" + << "mem_type=" << memoryType() << ", " + << "zero_init=" << std::boolalpha << zeroInit() + << ")"; + return ss.str(); +} + } // namespace nvfuser::hir diff --git a/csrc/host_ir/ir.h b/csrc/host_ir/ir.h index 3ddbc9e67a6..69c255b7885 100644 --- a/csrc/host_ir/ir.h +++ b/csrc/host_ir/ir.h 
@@ -161,6 +161,42 @@ class LaunchKernel : public Expr { CompiledKernel* compiled_kernel_ = nullptr; }; +class Allocate : public Expr { + public: + using Expr::Expr; + + explicit Allocate( + IrBuilderPasskey passkey, + TensorView* in, + MemoryType memory_type, + bool zero_init = false); + + Allocate(const Allocate& other) = delete; + Allocate& operator=(const Allocate& other) = delete; + Allocate(Allocate&& other) = delete; + Allocate& operator=(Allocate&& other) = delete; + + NVFUSER_DECLARE_CLONE_AND_CREATE + + std::string toString(int indent_size = 0) const override; + std::string toInlineString(int indent_size = 0) const override; + const char* getOpString() const override { + return "hir::Allocate"; + } + + TensorView* in() const { + return inputs().at(0)->as(); + } + + MemoryType memoryType() const { + return attribute(0); + } + + bool zeroInit() const { + return attribute(1); + } +}; + class Deallocate : public Expr { public: using Expr::Expr; diff --git a/csrc/host_ir/jit.cpp b/csrc/host_ir/jit.cpp index 00183754c56..dba0d3be916 100644 --- a/csrc/host_ir/jit.cpp +++ b/csrc/host_ir/jit.cpp @@ -760,7 +760,7 @@ class HostIrCompileDispatcher : public OptInDispatch { smem}); } - void handle(kir::Allocate* allocate) final { + void handle(hir::Allocate* allocate) final { llvm::LLVMContext& context = builder().getContext(); llvm::Module* module = builder().GetInsertBlock()->getParent()->getParent(); @@ -769,16 +769,13 @@ class HostIrCompileDispatcher : public OptInDispatch { llvm::SmallVector tensor_sizes; llvm::SmallVector tensor_strides; inferTensorShapesAndStrides( - allocate->buffer()->as(), - valToValue(), - builder(), - tensor_sizes, - tensor_strides); + allocate->in(), valToValue(), builder(), tensor_sizes, tensor_strides); - const std::vector& logical_domain = TensorDomain::noReductions( - allocate->buffer()->as()->getLogicalDomain()); + auto logical_domain = + allocate->in()->getLogicalDomain() | TensorDomain::kNoReductions; - 
NVF_ERROR_EQ(tensor_sizes.size(), logical_domain.size()); + NVF_ERROR_EQ( + std::ssize(tensor_sizes), std::ranges::distance(logical_domain)); llvm::ArrayType* sizes_type = getInt64StaticArrayType( context, static_cast(tensor_sizes.size())); @@ -819,9 +816,8 @@ class HostIrCompileDispatcher : public OptInDispatch { // Create constants for type and device from params at::ScalarType data_type = data_type_to_aten( - allocate->buffer()->dtype() == DataType::Index - ? PrimDataType::Int - : allocate->buffer()->dtype()); + allocate->in()->dtype() == DataType::Index ? PrimDataType::Int + : allocate->in()->dtype()); llvm::Value* dtype_constant = builder().getInt32(static_cast(data_type)); llvm::Value* device_index_constant = @@ -841,7 +837,7 @@ class HostIrCompileDispatcher : public OptInDispatch { dtype_constant, device_index_constant, out_tensor}); - valToValue()[allocate->buffer()] = out_tensor; + valToValue()[allocate->in()] = out_tensor; } void handle(hir::Deallocate* deallocate) final { diff --git a/csrc/host_ir/lowering.cpp b/csrc/host_ir/lowering.cpp index 3757370ba28..bb14c8c6eae 100644 --- a/csrc/host_ir/lowering.cpp +++ b/csrc/host_ir/lowering.cpp @@ -179,46 +179,47 @@ void lowerSegment( // TODO: `replacement_map` should be associated with the scope so // ShardByStream across segments in the same for-loop can be reused. std::unordered_map replacement_map; + + // All communications from a single expr share the same in/out TVs; + // only root and team vary. Handle input sharding and output + // allocation once, outside the per-communication loop. 
+ TensorView* in = e->input(0)->as(); + TensorView* out = e->output(0)->as(); + + if (haveDifferentShardings( + in, + DomainType::kAllocation, + out, + DomainType::kLoop, + {ParallelType::Stream})) { + Val*& sharded_in = replacement_map[in]; + if (sharded_in == nullptr) { + sharded_in = hir::shardByStream(in, innermost.loop->index(), e); + innermost_scope.pushBack(sharded_in->definition()); + } + } + + auto* allocate = + IrBuilder::create(out, out->getMemoryType()); + if (getShardedIterDomain(out, ParallelType::Stream, DomainType::kLoop) != + nullptr && + getShardedIterDomain( + out, ParallelType::Stream, DomainType::kAllocation) == nullptr) { + innermost.parent_scope->insert( + innermost.parent_insertion_point, allocate); + auto [i, inserted] = replacement_map.emplace( + out, hir::shardByStream(out, innermost.loop->index(), e)); + NVF_ERROR(inserted, "The input segmented fusion should be SSA."); + innermost_scope.pushBack(i->second->definition()); + } else { + innermost_scope.pushBack(allocate); + } + for (Expr* c : convertSingleOpToCommunication(e, device_id)) { NVF_ERROR( c->isA(), "Exprs in a Communication group should be Communication: ", c); - auto* communication = c->as(); - TensorView* in = communication->in(); - TensorView* out = communication->out(); - if (haveDifferentShardings( - in, - DomainType::kAllocation, - out, - DomainType::kLoop, - {ParallelType::Stream})) { - Val*& sharded_in = replacement_map[in]; - if (sharded_in == nullptr) { - sharded_in = - hir::shardByStream(in, innermost.loop->index(), communication); - innermost_scope.pushBack(sharded_in->definition()); - } - } - - // Allocate the recv buffers of communications - auto* allocate = - IrBuilder::create(out, out->getMemoryType()); - if (getShardedIterDomain( - out, ParallelType::Stream, DomainType::kLoop) != nullptr && - getShardedIterDomain( - out, ParallelType::Stream, DomainType::kAllocation) == - nullptr) { - innermost.parent_scope->insert( - innermost.parent_insertion_point, allocate); 
- auto [i, inserted] = replacement_map.emplace( - out, - hir::shardByStream(out, innermost.loop->index(), communication)); - NVF_ERROR(inserted, "The input segmented fusion should be SSA."); - innermost_scope.pushBack(i->second->definition()); - } else { - innermost_scope.pushBack(allocate); - } Expr* new_c = cloneWithNewOperands(c, replacement_map); innermost_scope.pushBack(new_c); @@ -310,7 +311,7 @@ void lowerSegment( out, ParallelType::Stream, DomainType::kAllocation) == nullptr) { auto* allocate = - IrBuilder::create(out, out->getMemoryType()); + IrBuilder::create(out, out->getMemoryType()); innermost.parent_scope->insert( innermost.parent_insertion_point, allocate); // Loop is stream parallelized but allocation is not. Therefore, @@ -347,7 +348,7 @@ void lowerSegment( alias); auto* allocate = - IrBuilder::create(out_tv, out_tv->getMemoryType()); + IrBuilder::create(out_tv, out_tv->getMemoryType()); innermost_scope.pushBack(allocate); } diff --git a/tests/cpp/test_host_ir_evaluator.cpp b/tests/cpp/test_host_ir_evaluator.cpp index aa4f933f65e..52950202150 100644 --- a/tests/cpp/test_host_ir_evaluator.cpp +++ b/tests/cpp/test_host_ir_evaluator.cpp @@ -55,7 +55,8 @@ TEST_F(HostIrEvaluatorTest, LaunchKernel) { Val* in = ir_cloner.clone(fusion.inputs().at(0)); Val* out = ir_cloner.clone(fusion.outputs().at(0)); - auto allocate = IrBuilder::create(out, MemoryType::Global); + auto allocate = IrBuilder::create( + out->as(), MemoryType::Global); auto* cache_id = IrBuilder::create("cacheId", DataType::UInt64); auto launch_kernel = IrBuilder::create( @@ -182,8 +183,8 @@ TEST_F(HostIrEvaluatorTest, AddInLoop) { hic->addInput(in); hic->addOutput(out); - auto* allocate_out = IrBuilder::create( - out, MemoryType::Global, std::vector({}), /*zero_init=*/true); + auto* allocate_out = IrBuilder::create( + out, MemoryType::Global, /*zero_init=*/true); auto* stream_index = IrBuilder::create(DataType::Index); auto* for_loop = IrBuilder::create( diff --git 
a/tests/cpp/test_host_ir_jit.cpp b/tests/cpp/test_host_ir_jit.cpp index b150af6459c..a9d45aea7e1 100644 --- a/tests/cpp/test_host_ir_jit.cpp +++ b/tests/cpp/test_host_ir_jit.cpp @@ -95,11 +95,11 @@ TEST_F(HostIrJitTest, Deallocate) { TensorView* t2 = makeConcreteTensor(t2_sizes); t2->setMemoryType(MemoryType::Global); - auto* allocate_t0 = IrBuilder::create(t0, MemoryType::Global); + auto* allocate_t0 = IrBuilder::create(t0, MemoryType::Global); auto* deallocate_t0 = IrBuilder::create(t0); - auto* allocate_t1 = IrBuilder::create(t1, MemoryType::Global); + auto* allocate_t1 = IrBuilder::create(t1, MemoryType::Global); auto* deallocate_t1 = IrBuilder::create(t1); - auto* allocate_t2 = IrBuilder::create(t2, MemoryType::Global); + auto* allocate_t2 = IrBuilder::create(t2, MemoryType::Global); hic->pushBackTopLevelExprs(allocate_t0); hic->pushBackTopLevelExprs(allocate_t1); @@ -132,7 +132,7 @@ TEST_F(HostIrJitTest, DynamicSizedTensorAllocate) { hic->addInput(hic_in); hic->addOutput(hic_out); auto* allocate = - IrBuilder::create(hic_out, MemoryType::Global); + IrBuilder::create(hic_out, MemoryType::Global); hic->pushBackTopLevelExprs(allocate); HostIrJit jit(std::move(hic)); @@ -158,7 +158,7 @@ TEST_F(HostIrJitTest, Reorder) { hic->addInput(hic_in); hic->addOutput(hic_out); auto* allocate = - IrBuilder::create(hic_out, MemoryType::Global); + IrBuilder::create(hic_out, MemoryType::Global); hic->pushBackTopLevelExprs(allocate); HostIrJit jit(std::move(hic)); @@ -184,7 +184,7 @@ TEST_F(HostIrJitTest, Permute) { hic->addInput(hic_in); hic->addOutput(hic_out); auto* allocate = - IrBuilder::create(hic_out, MemoryType::Global); + IrBuilder::create(hic_out, MemoryType::Global); hic->pushBackTopLevelExprs(allocate); HostIrJit jit(std::move(hic)); @@ -210,7 +210,7 @@ TEST_F(HostIrJitTest, AllocationDomainReorder) { hic->addInput(hic_in); hic->addOutput(hic_out); auto* allocate = - IrBuilder::create(hic_out, MemoryType::Global); + IrBuilder::create(hic_out, MemoryType::Global); 
hic->pushBackTopLevelExprs(allocate); HostIrJit jit(std::move(hic)); @@ -244,9 +244,9 @@ TEST_F(HostIrJitTest, BroadcastTest) { hic->addOutput(broadcast_tv); hic->addOutput(expand_tv); auto* allocate_broadcast = - IrBuilder::create(broadcast_tv, MemoryType::Global); + IrBuilder::create(broadcast_tv, MemoryType::Global); auto* allocate_expand = - IrBuilder::create(expand_tv, MemoryType::Global); + IrBuilder::create(expand_tv, MemoryType::Global); hic->pushBackTopLevelExprs(allocate_broadcast); hic->pushBackTopLevelExprs(allocate_expand); @@ -292,7 +292,7 @@ TEST_F(HostIrJitTest, LaunchKernel) { hic->addInput(hic_in); hic->addOutput(hic_out); - auto allocate = IrBuilder::create(hic_out, MemoryType::Global); + auto allocate = IrBuilder::create(hic_out, MemoryType::Global); auto* cache_id = IrBuilder::create("cacheId", DataType::UInt64); auto launch_kernel = IrBuilder::create( 0, @@ -332,7 +332,7 @@ TEST_F(HostIrJitTest, Matmul) { hic->addOutput(tv2); hic->pushBackTopLevelExprs( - IrBuilder::create(tv2, MemoryType::Global)); + IrBuilder::create(tv2, MemoryType::Global)); hic->pushBackTopLevelExprs(tv2->definition()); HostIrJit jit(std::move(hic)); @@ -375,10 +375,10 @@ TEST_F(HostIrJitTest, Linear) { hic->addOutput(out_without_bias); hic->pushBackTopLevelExprs( - IrBuilder::create(out_with_bias, MemoryType::Global)); + IrBuilder::create(out_with_bias, MemoryType::Global)); hic->pushBackTopLevelExprs(out_with_bias->definition()); hic->pushBackTopLevelExprs( - IrBuilder::create(out_without_bias, MemoryType::Global)); + IrBuilder::create(out_without_bias, MemoryType::Global)); hic->pushBackTopLevelExprs(out_without_bias->definition()); HostIrJit jit(std::move(hic)); diff --git a/tests/cpp/test_multidevice_host_ir.cpp b/tests/cpp/test_multidevice_host_ir.cpp index 7caa06abf71..f011511930f 100644 --- a/tests/cpp/test_multidevice_host_ir.cpp +++ b/tests/cpp/test_multidevice_host_ir.cpp @@ -532,8 +532,8 @@ TEST_F(MultiDeviceTest, SwizzleWithParallelType) { 
tv->axis(0)->parallelize(ParallelType::Stream); } - auto* allocate_out = IrBuilder::create( - out_tv, MemoryType::Global, std::vector({}), /*zero_init=*/true); + auto* allocate_out = IrBuilder::create( + out_tv, MemoryType::Global, /*zero_init=*/true); auto* stream_index = IrBuilder::create(DataType::Index); auto* for_loop = IrBuilder::create( stream_index,