diff --git a/csrc/dispatch.h b/csrc/dispatch.h index f6614d5df00..05e2e98c569 100644 --- a/csrc/dispatch.h +++ b/csrc/dispatch.h @@ -181,6 +181,7 @@ class Val; f(ShareMemHandles); \ f(HirAliasSelect); \ f(ShardByStream); \ + f(Allocate); \ f(Deallocate); \ f(ForLoop); \ f(SymmetricContiguousView); diff --git a/csrc/host_ir/allocate_and_deallocate.cpp b/csrc/host_ir/allocate_and_deallocate.cpp index e0a14abf18b..4c7924baa90 100644 --- a/csrc/host_ir/allocate_and_deallocate.cpp +++ b/csrc/host_ir/allocate_and_deallocate.cpp @@ -17,6 +17,7 @@ #include #include +#include "host_ir/ir.h" #include "ir/builder.h" #include "ir/utils.h" @@ -215,7 +216,7 @@ void insertAllocations(hir::HostIrContainer& hic) { if (needsOutputPreallocation(e)) { auto* allocate = - IrBuilder::create<kir::Allocate>(out, out->getMemoryType()); + IrBuilder::create<hir::Allocate>(out, out->getMemoryType()); node->scope()->insert(node->iterator(), allocate); } @@ -255,13 +256,6 @@ class LowestCommonAncestor { NVF_ERROR(depth_.insert({node, current_depth}).second); Expr* e = node->getExpr(); - // Temporary special-case for kir::Allocate. We will switch - // inserting a new `hir::Allocate` in host IR lowering where - // the allocated `tv` will be the expr input. 
- if (auto* alloc = dynamic_cast<kir::Allocate*>(e)) { - auto* tv = alloc->buffer()->as<TensorView>(); - lca_[tv] = findLca(lca_[tv], node); - } for (auto* tv : ir_utils::filterByType<TensorView>(e->inputs())) { lca_[tv] = findLca(lca_[tv], node); } @@ -337,9 +331,6 @@ void checkMemoryLeak(hir::HostIrContainer& hic) { /*pre_fn=*/ [&](const Node* node) { Expr* e = node->getExpr(); - if (auto* alloc = dynamic_cast<kir::Allocate*>(e)) { - allocated.insert(alloc->buffer()->as<TensorView>()); - } for (auto* tv : ir_utils::filterByType<TensorView>(e->inputs())) { allocated.insert(tv); } diff --git a/csrc/host_ir/evaluator.cpp b/csrc/host_ir/evaluator.cpp index d0764071708..020a4ddb3a1 100644 --- a/csrc/host_ir/evaluator.cpp +++ b/csrc/host_ir/evaluator.cpp @@ -68,8 +68,8 @@ KernelArgumentHolder HostIrEvaluator::runWithInputs( FUSER_PERF_SCOPE("HostIrEvaluator::runWithInputs"); expr_evaluator_ = ExpressionEvaluator(); expr_evaluator_.bind("numberOfStreams", params_.number_of_streams); - NVF_ERROR(args.getCacheId().has_value()); - expr_evaluator_.bind("cacheId", static_cast<int64_t>(*args.getCacheId())); + auto cache_id = valueOrError(args.getCacheId()); + expr_evaluator_.bind("cacheId", static_cast<int64_t>(cache_id)); NVF_ERROR_EQ(std::ssize(container_->inputs()), args.size()); for (auto&& [in_val, arg] : zip(container_->inputs(), args)) { @@ -329,7 +329,9 @@ void HostIrEvaluator::handle(Communication* communication) { CommunicatorBackend backend_type = communication->backend(); if (backend_type == CommunicatorBackend::kCuda) { const auto current_stream = static_cast<CUstream>( - c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); + c10::cuda::getCurrentCUDAStream( + static_cast<c10::DeviceIndex>(my_local_device_index_)) + .stream()); NVF_ERROR( communication->type() == CommunicationType::Broadcast || communication->type() == CommunicationType::Allgather, @@ -337,8 +339,8 @@ communication->type()); int64_t root_val = expr_evaluator_.evaluate(communication->root()).as<int64_t>(); - SymmetricMemoryHandle* multicast_handle = - 
multicast_handle_cache_.get({output_tensor, communication, root_val}); + SymmetricMemoryHandle* multicast_handle = multicast_handle_cache_.get( + {.buffer = output_tensor, .expr = communication, .root = root_val}); postWithCudaBackend( communication, input_tensor, @@ -369,7 +371,9 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { if (backend_type == CommunicatorBackend::kCuda) { const P2pIpcHandle& p2p_ipc_handle = ipc_handle_cache_.get(communication); const auto current_stream = static_cast( - c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); + c10::cuda::getCurrentCUDAStream( + static_cast(my_local_device_index_)) + .stream()); auto count = buffer.numel() * buffer.element_size(); if (communication->type() == P2PCommunicationType::RECV) { recvPost(p2p_ipc_handle, count, current_stream); @@ -444,7 +448,9 @@ void HostIrEvaluator::handle(Wait* wait) { auto* p2p_comm = dynamic_cast(expr); auto* communication = dynamic_cast(expr); const auto current_stream = static_cast( - c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); + c10::cuda::getCurrentCUDAStream( + static_cast(my_local_device_index_)) + .stream()); if (p2p_comm && p2p_comm->backend() == CommunicatorBackend::kCuda) { const P2pIpcHandle& ipc_handles = ipc_handle_cache_.get(p2p_comm); if (p2p_comm->type() == P2PCommunicationType::SEND) { @@ -463,8 +469,8 @@ void HostIrEvaluator::handle(Wait* wait) { at::Tensor output_tensor = getKnownTensorOrUndefined(communication->out()); int64_t root_val = expr_evaluator_.evaluate(communication->root()).as(); - SymmetricMemoryHandle* multicast_handle = - multicast_handle_cache_.get({output_tensor, communication, root_val}); + SymmetricMemoryHandle* multicast_handle = multicast_handle_cache_.get( + {.buffer = output_tensor, .expr = communication, .root = root_val}); waitWithCudaBackend( communication, multicast_handle, current_stream, root_val); } else { @@ -714,6 +720,28 @@ void HostIrEvaluator::handle(kir::Allocate* 
allocate) { expr_evaluator_.bind(tv, tensor); } +void HostIrEvaluator::handle(hir::Allocate* allocate) { + FUSER_PERF_SCOPE("HostIrEvaluator::handle(Allocate)"); + TensorView* tv = allocate->in(); + + GlobalBufferInfo info = + getBufferInfos(expr_evaluator_, PrimDataType::Int, {tv}).at(0); + c10::Device device = + communicator_ ? communicator_->device() : at::Device("cuda:0"); + at::Tensor tensor = at::native::empty_strided_cuda( + info.shape_info.logical_sizes, + info.shape_info.logical_strides, + info.type, + c10::nullopt, + device, + c10::nullopt); + + if (allocate->zeroInit()) { + tensor.zero_(); + } + expr_evaluator_.bind(tv, tensor); +} + void HostIrEvaluator::handle(HirAliasSelect* hir_alias_select) { auto indexed_id = hir_alias_select->in()->getLogicalDomain().at(hir_alias_select->axis()); @@ -848,10 +876,8 @@ void HostIrEvaluator::handle(ShardByStream* shard) { const std::vector& allocation_domain = out_tv->getMaybeAllocationDomain(); - auto i = std::find_if( - allocation_domain.begin(), - allocation_domain.end(), - std::mem_fn(&IterDomain::isStream)); + auto i = std::ranges::find_if( + allocation_domain, std::mem_fn(&IterDomain::isStream)); NVF_ERROR( i != allocation_domain.end(), "Stream axis not found in allocation domain: ", @@ -915,8 +941,9 @@ void HostIrEvaluator::handle( at::Tensor in_tensor = getKnownConcreteValue(in_tv).as(); // Get or create SymMemForContiguousView from the cache - SymMemForContiguousView* handle = static_cast( - multicast_handle_cache_.get({in_tensor, symmetric_contiguous_view})); + SymMemForContiguousView* handle = + static_cast(multicast_handle_cache_.get( + {.buffer = in_tensor, .expr = symmetric_contiguous_view})); // Bind the symmetric_contiguous_viewed tensor to the output expr_evaluator_.bind(out_tv, handle->tensor()); diff --git a/csrc/host_ir/evaluator.h b/csrc/host_ir/evaluator.h index 4a1929ba1bd..f2c26c15797 100644 --- a/csrc/host_ir/evaluator.h +++ b/csrc/host_ir/evaluator.h @@ -109,6 +109,7 @@ class NVF_API 
HostIrEvaluator final : public OptOutDispatch { void handle(MatmulOp*) override; void handle(LinearOp*) override; void handle(kir::Allocate*) override; + void handle(hir::Allocate*) override; void handle(LoadStoreOp*) override; void handle(BinaryOp*) override; void handle(ReductionOp*) override; @@ -138,7 +139,7 @@ class NVF_API HostIrEvaluator final : public OptOutDispatch { using StreamKey = std::variant; std::unordered_map streams_; std::unordered_map> works_; - const int64_t my_local_device_index_; + int64_t my_local_device_index_; IpcHandleCache ipc_handle_cache_; SymmetricMemoryHandleCache multicast_handle_cache_; // Allocation cache diff --git a/csrc/host_ir/ir.cpp b/csrc/host_ir/ir.cpp index b7a43650797..1519c185c9c 100644 --- a/csrc/host_ir/ir.cpp +++ b/csrc/host_ir/ir.cpp @@ -109,7 +109,7 @@ std::string PostOnStream::toString(int indent_size) const { std::for_each(outputs().begin(), outputs().end(), [&ss](auto output) { ss << output->toString(0) << ", "; }); - ss << "})" << std::endl; + ss << "})\n"; return ss.str(); } @@ -149,13 +149,13 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(LaunchKernel) std::string LaunchKernel::toString(int indent_size) const { std::stringstream ss; - indent(ss, indent_size) << "LaunchKernel(" << std::endl; - indent(ss, indent_size + 1) << "Group ID: " << groupId() << "," << std::endl; + indent(ss, indent_size) << "LaunchKernel(\n"; + indent(ss, indent_size + 1) << "Group ID: " << groupId() << ",\n"; indent(ss, indent_size + 1) - << "Inputs: {" << toDelimitedString(inputs()) << "}," << std::endl; + << "Inputs: {" << toDelimitedString(inputs()) << "},\n"; indent(ss, indent_size + 1) - << "Outputs: {" << toDelimitedString(outputs()) << "}," << std::endl; - indent(ss, indent_size) << ")" << std::endl; + << "Outputs: {" << toDelimitedString(outputs()) << "},\n"; + indent(ss, indent_size) << ")\n"; return ss.str(); } @@ -172,9 +172,9 @@ TensorView* Deallocate::buffer() const { std::string Deallocate::toString(int indent_size) const { 
std::stringstream ss; - indent(ss, indent_size) << "Deallocate {" << std::endl; - ss << buffer()->toString(indent_size + 1) << std::endl; - indent(ss, indent_size) << "}" << std::endl; + indent(ss, indent_size) << "Deallocate {\n"; + ss << buffer()->toString(indent_size + 1) << '\n'; + indent(ss, indent_size) << "}\n"; return ss.str(); } @@ -230,8 +230,8 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(SetCurrentStream) std::string SetCurrentStream::toString(int indent_size) const { std::stringstream ss; - indent(ss, indent_size) << "SetCurrentStream(" << stream()->toString() << ")" - << std::endl; + indent(ss, indent_size) << "SetCurrentStream(" << stream()->toString() + << ")\n"; return ss.str(); } @@ -246,7 +246,7 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(GetCurrentStream) std::string GetCurrentStream::toString(int indent_size) const { std::stringstream ss; indent(ss, indent_size) << stream()->toInlineString() - << " = GetCurrentStream()" << std::endl; + << " = GetCurrentStream()\n"; return ss.str(); } @@ -319,7 +319,7 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(StartCoalescing) std::string StartCoalescing::toString(int indent_size) const { std::stringstream ss; - indent(ss, indent_size) << "StartCoalescing" << std::endl; + indent(ss, indent_size) << "StartCoalescing\n"; return ss.str(); } @@ -339,7 +339,7 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(EndCoalescing) std::string EndCoalescing::toString(int indent_size) const { std::stringstream ss; - indent(ss, indent_size) << "EndCoalescing " << name() << std::endl; + indent(ss, indent_size) << "EndCoalescing " << name() << '\n'; return ss.str(); } @@ -367,7 +367,7 @@ std::string ShareMemHandles::toString(int indent_size) const { for (auto communication : communications()) { ss << communication->toInlineString() << ", "; } - ss << std::endl; + ss << '\n'; return ss.str(); } @@ -440,7 +440,7 @@ std::string ShardByStream::toString(int indent_size) const { indent(ss, indent_size) << out()->toString() << " = ShardByStream(" << in()->toString() << ", 
stream_index=" << stream_index()->toString() - << ")" << std::endl; + << ")\n"; return ss.str(); } @@ -462,7 +462,7 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(SymmetricContiguousView) std::string SymmetricContiguousView::toString(int indent_size) const { std::stringstream ss; indent(ss, indent_size) << out()->toString() << " = SymmetricContiguousView(" - << in()->toString() << ")" << std::endl; + << in()->toString() << ")\n"; return ss.str(); } @@ -484,7 +484,7 @@ std::string ForLoop::toString(int indent_size) const { std::stringstream ss; indent(ss, indent_size) << "FOR " << index()->toString() << " from " << start()->toInlineString() << " to " - << stop()->toInlineString() << ":" << std::endl + << stop()->toInlineString() << ":\n" << body().toString(indent_size + 1); return ss.str(); } @@ -504,4 +504,38 @@ std::string ForLoop::toInlineString(int indent_size) const { index, iter_domain->start(), iter_domain->stop()); } +Allocate::Allocate( + IrBuilderPasskey passkey, + TensorView* in, + MemoryType memory_type, + bool zero_init) + : Expr(passkey) { + NVF_ERROR(passkey.ir_container_ != nullptr); + NVF_ERROR(passkey.ir_container_->isA()); + + addInput(in); + addDataAttribute(memory_type); + addDataAttribute(zero_init); +} + +NVFUSER_DEFINE_CLONE_AND_CREATE(Allocate) + +std::string Allocate::toString(int indent_size) const { + std::stringstream ss; + indent(ss, indent_size) << in()->toString() << " = ALLOCATE(" + << "mem_type=" << memoryType() << ", " + << "zero_init=" << std::boolalpha << zeroInit() + << ")\n"; + return ss.str(); +} + +std::string Allocate::toInlineString(int indent_size) const { + std::stringstream ss; + indent(ss, indent_size) << in()->toInlineString() << " = ALLOCATE(" + << "mem_type=" << memoryType() << ", " + << "zero_init=" << std::boolalpha << zeroInit() + << ")"; + return ss.str(); +} + } // namespace nvfuser::hir diff --git a/csrc/host_ir/ir.h b/csrc/host_ir/ir.h index 3ddbc9e67a6..69c255b7885 100644 --- a/csrc/host_ir/ir.h +++ b/csrc/host_ir/ir.h 
@@ -161,6 +161,42 @@ class LaunchKernel : public Expr { CompiledKernel* compiled_kernel_ = nullptr; }; +class Allocate : public Expr { + public: + using Expr::Expr; + + explicit Allocate( + IrBuilderPasskey passkey, + TensorView* in, + MemoryType memory_type, + bool zero_init = false); + + Allocate(const Allocate& other) = delete; + Allocate& operator=(const Allocate& other) = delete; + Allocate(Allocate&& other) = delete; + Allocate& operator=(Allocate&& other) = delete; + + NVFUSER_DECLARE_CLONE_AND_CREATE + + std::string toString(int indent_size = 0) const override; + std::string toInlineString(int indent_size = 0) const override; + const char* getOpString() const override { + return "hir::Allocate"; + } + + TensorView* in() const { + return inputs().at(0)->as(); + } + + MemoryType memoryType() const { + return attribute(0); + } + + bool zeroInit() const { + return attribute(1); + } +}; + class Deallocate : public Expr { public: using Expr::Expr; diff --git a/csrc/host_ir/jit.cpp b/csrc/host_ir/jit.cpp index 00183754c56..dba0d3be916 100644 --- a/csrc/host_ir/jit.cpp +++ b/csrc/host_ir/jit.cpp @@ -760,7 +760,7 @@ class HostIrCompileDispatcher : public OptInDispatch { smem}); } - void handle(kir::Allocate* allocate) final { + void handle(hir::Allocate* allocate) final { llvm::LLVMContext& context = builder().getContext(); llvm::Module* module = builder().GetInsertBlock()->getParent()->getParent(); @@ -769,16 +769,13 @@ class HostIrCompileDispatcher : public OptInDispatch { llvm::SmallVector tensor_sizes; llvm::SmallVector tensor_strides; inferTensorShapesAndStrides( - allocate->buffer()->as(), - valToValue(), - builder(), - tensor_sizes, - tensor_strides); + allocate->in(), valToValue(), builder(), tensor_sizes, tensor_strides); - const std::vector& logical_domain = TensorDomain::noReductions( - allocate->buffer()->as()->getLogicalDomain()); + auto logical_domain = + allocate->in()->getLogicalDomain() | TensorDomain::kNoReductions; - 
NVF_ERROR_EQ(tensor_sizes.size(), logical_domain.size()); + NVF_ERROR_EQ( + std::ssize(tensor_sizes), std::ranges::distance(logical_domain)); llvm::ArrayType* sizes_type = getInt64StaticArrayType( context, static_cast(tensor_sizes.size())); @@ -819,9 +816,8 @@ class HostIrCompileDispatcher : public OptInDispatch { // Create constants for type and device from params at::ScalarType data_type = data_type_to_aten( - allocate->buffer()->dtype() == DataType::Index - ? PrimDataType::Int - : allocate->buffer()->dtype()); + allocate->in()->dtype() == DataType::Index ? PrimDataType::Int + : allocate->in()->dtype()); llvm::Value* dtype_constant = builder().getInt32(static_cast(data_type)); llvm::Value* device_index_constant = @@ -841,7 +837,7 @@ class HostIrCompileDispatcher : public OptInDispatch { dtype_constant, device_index_constant, out_tensor}); - valToValue()[allocate->buffer()] = out_tensor; + valToValue()[allocate->in()] = out_tensor; } void handle(hir::Deallocate* deallocate) final { diff --git a/csrc/host_ir/lowering.cpp b/csrc/host_ir/lowering.cpp index 3757370ba28..bb14c8c6eae 100644 --- a/csrc/host_ir/lowering.cpp +++ b/csrc/host_ir/lowering.cpp @@ -179,46 +179,47 @@ void lowerSegment( // TODO: `replacement_map` should be associated with the scope so // ShardByStream across segments in the same for-loop can be reused. std::unordered_map replacement_map; + + // All communications from a single expr share the same in/out TVs; + // only root and team vary. Handle input sharding and output + // allocation once, outside the per-communication loop. 
+ TensorView* in = e->input(0)->as(); + TensorView* out = e->output(0)->as(); + + if (haveDifferentShardings( + in, + DomainType::kAllocation, + out, + DomainType::kLoop, + {ParallelType::Stream})) { + Val*& sharded_in = replacement_map[in]; + if (sharded_in == nullptr) { + sharded_in = hir::shardByStream(in, innermost.loop->index(), e); + innermost_scope.pushBack(sharded_in->definition()); + } + } + + auto* allocate = + IrBuilder::create(out, out->getMemoryType()); + if (getShardedIterDomain(out, ParallelType::Stream, DomainType::kLoop) != + nullptr && + getShardedIterDomain( + out, ParallelType::Stream, DomainType::kAllocation) == nullptr) { + innermost.parent_scope->insert( + innermost.parent_insertion_point, allocate); + auto [i, inserted] = replacement_map.emplace( + out, hir::shardByStream(out, innermost.loop->index(), e)); + NVF_ERROR(inserted, "The input segmented fusion should be SSA."); + innermost_scope.pushBack(i->second->definition()); + } else { + innermost_scope.pushBack(allocate); + } + for (Expr* c : convertSingleOpToCommunication(e, device_id)) { NVF_ERROR( c->isA(), "Exprs in a Communication group should be Communication: ", c); - auto* communication = c->as(); - TensorView* in = communication->in(); - TensorView* out = communication->out(); - if (haveDifferentShardings( - in, - DomainType::kAllocation, - out, - DomainType::kLoop, - {ParallelType::Stream})) { - Val*& sharded_in = replacement_map[in]; - if (sharded_in == nullptr) { - sharded_in = - hir::shardByStream(in, innermost.loop->index(), communication); - innermost_scope.pushBack(sharded_in->definition()); - } - } - - // Allocate the recv buffers of communications - auto* allocate = - IrBuilder::create(out, out->getMemoryType()); - if (getShardedIterDomain( - out, ParallelType::Stream, DomainType::kLoop) != nullptr && - getShardedIterDomain( - out, ParallelType::Stream, DomainType::kAllocation) == - nullptr) { - innermost.parent_scope->insert( - innermost.parent_insertion_point, allocate); 
- auto [i, inserted] = replacement_map.emplace( - out, - hir::shardByStream(out, innermost.loop->index(), communication)); - NVF_ERROR(inserted, "The input segmented fusion should be SSA."); - innermost_scope.pushBack(i->second->definition()); - } else { - innermost_scope.pushBack(allocate); - } Expr* new_c = cloneWithNewOperands(c, replacement_map); innermost_scope.pushBack(new_c); @@ -310,7 +311,7 @@ void lowerSegment( out, ParallelType::Stream, DomainType::kAllocation) == nullptr) { auto* allocate = - IrBuilder::create(out, out->getMemoryType()); + IrBuilder::create(out, out->getMemoryType()); innermost.parent_scope->insert( innermost.parent_insertion_point, allocate); // Loop is stream parallelized but allocation is not. Therefore, @@ -347,7 +348,7 @@ void lowerSegment( alias); auto* allocate = - IrBuilder::create(out_tv, out_tv->getMemoryType()); + IrBuilder::create(out_tv, out_tv->getMemoryType()); innermost_scope.pushBack(allocate); } diff --git a/tests/cpp/test_host_ir_evaluator.cpp b/tests/cpp/test_host_ir_evaluator.cpp index aa4f933f65e..52950202150 100644 --- a/tests/cpp/test_host_ir_evaluator.cpp +++ b/tests/cpp/test_host_ir_evaluator.cpp @@ -55,7 +55,8 @@ TEST_F(HostIrEvaluatorTest, LaunchKernel) { Val* in = ir_cloner.clone(fusion.inputs().at(0)); Val* out = ir_cloner.clone(fusion.outputs().at(0)); - auto allocate = IrBuilder::create(out, MemoryType::Global); + auto allocate = IrBuilder::create( + out->as(), MemoryType::Global); auto* cache_id = IrBuilder::create("cacheId", DataType::UInt64); auto launch_kernel = IrBuilder::create( @@ -182,8 +183,8 @@ TEST_F(HostIrEvaluatorTest, AddInLoop) { hic->addInput(in); hic->addOutput(out); - auto* allocate_out = IrBuilder::create( - out, MemoryType::Global, std::vector({}), /*zero_init=*/true); + auto* allocate_out = IrBuilder::create( + out, MemoryType::Global, /*zero_init=*/true); auto* stream_index = IrBuilder::create(DataType::Index); auto* for_loop = IrBuilder::create( diff --git 
a/tests/cpp/test_host_ir_jit.cpp b/tests/cpp/test_host_ir_jit.cpp index b150af6459c..a9d45aea7e1 100644 --- a/tests/cpp/test_host_ir_jit.cpp +++ b/tests/cpp/test_host_ir_jit.cpp @@ -95,11 +95,11 @@ TEST_F(HostIrJitTest, Deallocate) { TensorView* t2 = makeConcreteTensor(t2_sizes); t2->setMemoryType(MemoryType::Global); - auto* allocate_t0 = IrBuilder::create(t0, MemoryType::Global); + auto* allocate_t0 = IrBuilder::create(t0, MemoryType::Global); auto* deallocate_t0 = IrBuilder::create(t0); - auto* allocate_t1 = IrBuilder::create(t1, MemoryType::Global); + auto* allocate_t1 = IrBuilder::create(t1, MemoryType::Global); auto* deallocate_t1 = IrBuilder::create(t1); - auto* allocate_t2 = IrBuilder::create(t2, MemoryType::Global); + auto* allocate_t2 = IrBuilder::create(t2, MemoryType::Global); hic->pushBackTopLevelExprs(allocate_t0); hic->pushBackTopLevelExprs(allocate_t1); @@ -132,7 +132,7 @@ TEST_F(HostIrJitTest, DynamicSizedTensorAllocate) { hic->addInput(hic_in); hic->addOutput(hic_out); auto* allocate = - IrBuilder::create(hic_out, MemoryType::Global); + IrBuilder::create(hic_out, MemoryType::Global); hic->pushBackTopLevelExprs(allocate); HostIrJit jit(std::move(hic)); @@ -158,7 +158,7 @@ TEST_F(HostIrJitTest, Reorder) { hic->addInput(hic_in); hic->addOutput(hic_out); auto* allocate = - IrBuilder::create(hic_out, MemoryType::Global); + IrBuilder::create(hic_out, MemoryType::Global); hic->pushBackTopLevelExprs(allocate); HostIrJit jit(std::move(hic)); @@ -184,7 +184,7 @@ TEST_F(HostIrJitTest, Permute) { hic->addInput(hic_in); hic->addOutput(hic_out); auto* allocate = - IrBuilder::create(hic_out, MemoryType::Global); + IrBuilder::create(hic_out, MemoryType::Global); hic->pushBackTopLevelExprs(allocate); HostIrJit jit(std::move(hic)); @@ -210,7 +210,7 @@ TEST_F(HostIrJitTest, AllocationDomainReorder) { hic->addInput(hic_in); hic->addOutput(hic_out); auto* allocate = - IrBuilder::create(hic_out, MemoryType::Global); + IrBuilder::create(hic_out, MemoryType::Global); 
hic->pushBackTopLevelExprs(allocate); HostIrJit jit(std::move(hic)); @@ -244,9 +244,9 @@ TEST_F(HostIrJitTest, BroadcastTest) { hic->addOutput(broadcast_tv); hic->addOutput(expand_tv); auto* allocate_broadcast = - IrBuilder::create(broadcast_tv, MemoryType::Global); + IrBuilder::create(broadcast_tv, MemoryType::Global); auto* allocate_expand = - IrBuilder::create(expand_tv, MemoryType::Global); + IrBuilder::create(expand_tv, MemoryType::Global); hic->pushBackTopLevelExprs(allocate_broadcast); hic->pushBackTopLevelExprs(allocate_expand); @@ -292,7 +292,7 @@ TEST_F(HostIrJitTest, LaunchKernel) { hic->addInput(hic_in); hic->addOutput(hic_out); - auto allocate = IrBuilder::create(hic_out, MemoryType::Global); + auto allocate = IrBuilder::create(hic_out, MemoryType::Global); auto* cache_id = IrBuilder::create("cacheId", DataType::UInt64); auto launch_kernel = IrBuilder::create( 0, @@ -332,7 +332,7 @@ TEST_F(HostIrJitTest, Matmul) { hic->addOutput(tv2); hic->pushBackTopLevelExprs( - IrBuilder::create(tv2, MemoryType::Global)); + IrBuilder::create(tv2, MemoryType::Global)); hic->pushBackTopLevelExprs(tv2->definition()); HostIrJit jit(std::move(hic)); @@ -375,10 +375,10 @@ TEST_F(HostIrJitTest, Linear) { hic->addOutput(out_without_bias); hic->pushBackTopLevelExprs( - IrBuilder::create(out_with_bias, MemoryType::Global)); + IrBuilder::create(out_with_bias, MemoryType::Global)); hic->pushBackTopLevelExprs(out_with_bias->definition()); hic->pushBackTopLevelExprs( - IrBuilder::create(out_without_bias, MemoryType::Global)); + IrBuilder::create(out_without_bias, MemoryType::Global)); hic->pushBackTopLevelExprs(out_without_bias->definition()); HostIrJit jit(std::move(hic)); diff --git a/tests/cpp/test_multidevice_host_ir.cpp b/tests/cpp/test_multidevice_host_ir.cpp index 7caa06abf71..f011511930f 100644 --- a/tests/cpp/test_multidevice_host_ir.cpp +++ b/tests/cpp/test_multidevice_host_ir.cpp @@ -532,8 +532,8 @@ TEST_F(MultiDeviceTest, SwizzleWithParallelType) { 
tv->axis(0)->parallelize(ParallelType::Stream); } - auto* allocate_out = IrBuilder::create( - out_tv, MemoryType::Global, std::vector({}), /*zero_init=*/true); + auto* allocate_out = IrBuilder::create( + out_tv, MemoryType::Global, /*zero_init=*/true); auto* stream_index = IrBuilder::create(DataType::Index); auto* for_loop = IrBuilder::create( stream_index,