Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1105,6 +1105,7 @@ if(BUILD_TEST)
list(APPEND HOSTIR_TEST_SRCS
${NVFUSER_ROOT}/tests/cpp/test_host_ir_evaluator.cpp
${NVFUSER_ROOT}/tests/cpp/test_host_ir_integration.cpp
${NVFUSER_ROOT}/tests/cpp/test_host_ir_passes.cpp
${NVFUSER_ROOT}/tests/cpp/test_host_ir_stream_lowering.cpp
${NVFUSER_ROOT}/tests/cpp/test_host_irs.cpp
)
Expand Down
100 changes: 68 additions & 32 deletions csrc/host_ir/allocate_and_deallocate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
#include <unordered_set>
#include <vector>

#include "fusion.h"
#include "host_ir/ir.h"
#include "ir/builder.h"
#include "ir/utils.h"

Expand All @@ -28,8 +30,8 @@ class DominatorTree {
public:
class Node {
public:
Node(Scope* scope, Scope::Iterator iterator)
: scope_(scope), iterator_(iterator) {}
Node(Scope* scope, Scope::Iterator iterator, Node* parent)
: scope_(scope), iterator_(iterator), parent_(parent) {}
Node(const Node& other) = delete;
Node(Node&& other) = delete;
Node& operator=(const Node& other) = delete;
Expand All @@ -51,6 +53,10 @@ class DominatorTree {
return iterator_;
}

const Node* parent() const {
return parent_;
}

Expr* getExpr() const {
return *iterator_;
}
Expand All @@ -60,6 +66,7 @@ class DominatorTree {
// They are only needed when the user wants to modify the host IR.
Scope* scope_;
Scope::Iterator iterator_;
Node* parent_;

std::vector<Node*> children_;
};
Expand Down Expand Up @@ -108,7 +115,8 @@ class DominatorTree {
for (auto scope_it = scope.exprs().begin(); scope_it != scope.exprs().end();
++scope_it) {
Expr* e = *scope_it;
auto [node_it, inserted] = nodes_.try_emplace(e, &scope, scope_it);
auto [node_it, inserted] =
nodes_.try_emplace(e, &scope, scope_it, parent);
NVF_ERROR(inserted);
Node& node = node_it->second;
if (parent != nullptr) {
Expand Down Expand Up @@ -186,6 +194,23 @@ void insertAllocations(hir::HostIrContainer& hic) {
});
}

// Returns true if `tv` is an intermediate tensor whose buffer this pass must
// free with an hir::Deallocate after its last use.
//
// Excluded from deallocation:
// - fusion inputs and outputs: their storage is owned by the caller;
// - tensors defined by ShardByStream: presumably they view into another
//   tensor's buffer rather than owning one -- TODO confirm;
// - outputs aliased as AllocationType::ReuseBuffer: they write into a
//   caller-provided buffer in place.
bool needsDeallocation(TensorView* tv) {
  if (tv->isFusionInput() || tv->isFusionOutput()) {
    return false;
  }
  // A non-input TensorView normally has a definition; guard against nullptr
  // anyway so a malformed container skips this check instead of crashing.
  if (Expr* def = tv->definition(); def != nullptr && def->isA<ShardByStream>()) {
    return false;
  }
  const AliasInfo& alias_info = tv->container()->getOutputAlias(tv);
  return alias_info.type != AllocationType::ReuseBuffer;
}

void insertDeallocations(hir::HostIrContainer& hic) {
const std::list<Expr*>& top_level_exprs = hic.topLevelExprs();
std::for_each(top_level_exprs.begin(), top_level_exprs.end(), [](Expr* expr) {
Expand All @@ -196,39 +221,50 @@ void insertDeallocations(hir::HostIrContainer& hic) {
expr);
});

// For each input in every expression in the container, find the position of
// its last use and insert a deallocate directly after, except for fusion
// inputs and outputs.
std::unordered_set<TensorView*> last_use_found;
for (auto insertion_point = top_level_exprs.end();
insertion_point != top_level_exprs.begin();) {
auto prev = std::prev(insertion_point);
Expr* e = *prev;

// Only tensors need to be allocated.
for (auto* in : ir_utils::filterByType<TensorView>(e->inputs())) {
// Fusion inputs are managed by the caller.
if (in->isFusionInput()) {
continue;
}

// Fusion outputs need to be kept alive for the caller.
if (in->isFusionOutput()) {
continue;
}
DominatorTree dom_tree(hic);
std::unordered_map<TensorView*, const DominatorTree::Node*> outermost_scope;
std::unordered_map<TensorView*, const DominatorTree::Node*> last_use;

// Skip if `e` is not the last use.
if (!last_use_found.insert(in).second) {
continue;
}
dom_tree.depthFirstTraverse(
/*pre_fn=*/
[&](const DominatorTree::Node* node) {
Expr* e = node->getExpr();
if (auto* alloc = dynamic_cast<kir::Allocate*>(e)) {
outermost_scope.try_emplace(alloc->buffer()->as<TensorView>(), node);
}
for (auto* input : ir_utils::filterByType<TensorView>(e->inputs())) {
outermost_scope.try_emplace(input, node);
}
},
/*post_fn=*/
[&](const DominatorTree::Node* node) {
Expr* e = node->getExpr();
if (auto* alloc = dynamic_cast<kir::Allocate*>(e)) {
last_use.try_emplace(alloc->buffer()->as<TensorView>(), node);
}
for (auto* in : ir_utils::filterByType<TensorView>(e->inputs())) {
last_use.try_emplace(in, node);
}
});

auto* deallocate = IrBuilder::create<hir::Deallocate>(in);
hic.insertExprBefore(insertion_point, deallocate);
for (const auto& [allocated_tv, outermost_scope_node] : outermost_scope) {
if (!needsDeallocation(allocated_tv)) {
continue;
}
const DominatorTree::Node* last_use_node = last_use.at(allocated_tv);
Scope* outermost_scope = outermost_scope_node->scope();

// Insert in allocation_scope, after the node that contains last_use.
// Walk up from last_use until we reach allocation_scope (no-op when same).
const DominatorTree::Node* insert_after = last_use_node;
while (insert_after->scope() != outermost_scope) {
insert_after = insert_after->parent();
NVF_ERROR(
insert_after != nullptr, "Allocation scope must dominate last use");
}

// Don't `--insertion_point;` because we'd like to skip newly inserted
// deallocations.
insertion_point = prev;
auto* deallocate = IrBuilder::create<hir::Deallocate>(allocated_tv);
outermost_scope->insert(std::next(insert_after->iterator()), deallocate);
}
}

Expand Down
5 changes: 5 additions & 0 deletions csrc/host_ir/jit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
#include "ir/all_nodes.h"
#include "linked_hash_map.h"
#include "ops/all_ops.h"
#include "options.h"
#include "polymorphic_value.h"
#include "runtime/compiled_kernel.h"
#include "runtime/executor.h"
Expand Down Expand Up @@ -909,6 +910,10 @@ HostIrJitImpl::HostIrJitImpl(
: container_(std::move(container)) {
FUSER_PERF_SCOPE("HostIrJitImpl::HostIrJitImpl");

if (isDebugDumpEnabled(DebugDumpOption::HostIr)) {
container_->print(debug());
}

// Initialize LLVM
llvm::InitializeNativeTarget();
llvm::InitializeNativeTargetAsmPrinter();
Expand Down
145 changes: 145 additions & 0 deletions tests/cpp/test_host_ir_passes.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on

// This file contains integration tests that run fusions through
// FusionExecutorCache with host IR lowering turned on.
#include <algorithm>

#include <gmock/gmock-matchers.h>
#include <gmock/gmock-more-matchers.h>
#include <gtest/gtest.h>

#include "fusion.h"
#include "host_ir/ir.h"
#include "ir/all_nodes.h"
#include "ops/all_ops.h"
#include "options.h"
#include "runtime/fusion_kernel_runtime.h"
#include "tests/cpp/utils.h"
#include "tests/cpp/validator.h"

namespace nvfuser {

// Fixture for host IR pass tests: enables host IR lowering so that fusions
// run in this file are executed through the host IR path.
class HostIrPassesTest : public NVFuserTest {
 protected:
  HostIrPassesTest() {
    // Flip the option in the fixture constructor so it is in effect for the
    // whole test body; presumably restored by the base fixture's options
    // guard when the test ends -- confirm against NVFuserTest.
    EnableOptionsGuard::getCurOptions().set(EnableOption::HostIrLowering);
  }
};

// Two chained matmuls whose consumer loop nests can be inlined: the
// intermediate should live (and die) inside the stream for-loop.
TEST_F(HostIrPassesTest, TwoMatmulsInlinable) {
  constexpr int64_t c = 3;

  DisableOptionsGuard dog;
  DisableOptionsGuard::getCurOptions().set(DisableOption::InferContiguity);

  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());

  TensorView* in_tv = makeContigTensor(2);
  TensorView* weight1 = makeContigTensor(2);
  TensorView* weight2 = makeContigTensor(2);
  TensorView* mid_tv = matmul(in_tv, weight1);
  TensorView* out_tv = matmul(mid_tv, weight2);

  fusion->addInput(in_tv);
  fusion->addInput(weight1);
  fusion->addInput(weight2);
  fusion->addOutput(out_tv);

  // Stream-parallelize the outer dimension of the input.
  in_tv->outer_split(0, c);
  in_tv->axis(0)->parallelize(ParallelType::Stream);

  FusionExecutorCache fec(std::move(fusion));

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor in_tensor = at::randn({c * 2, 3}, options);
  at::Tensor weight1_tensor = at::randn({3, 5}, options);
  at::Tensor weight2_tensor = at::randn({5, 3}, options);

  auto out_tensors =
      fec.runFusionWithInputs({in_tensor, weight1_tensor, weight2_tensor});

  // Both matmuls are inlined in the same loop; `intermediate` is
  // allocated and deallocated within the loop body.
  FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime();
  const auto& top_level = runtime->getHostIrContainer().topLevelExprs();

  // Locate the first stream for-loop at the top level.
  hir::ForLoop* loop = nullptr;
  for (Expr* e : top_level) {
    if (e->isA<hir::ForLoop>()) {
      loop = e->as<hir::ForLoop>();
      break;
    }
  }
  ASSERT_NE(loop, nullptr);

  // Count the Deallocate nodes inside the loop body.
  int deallocate_count = 0;
  for (Expr* e : loop->body().exprs()) {
    if (e->isA<hir::Deallocate>()) {
      ++deallocate_count;
    }
  }
  EXPECT_EQ(deallocate_count, 1)
      << "Expected for-loop body to have exactly one Deallocate, got "
      << deallocate_count;

  testValidate(
      fec.fusion(),
      out_tensors,
      {in_tensor, weight1_tensor, weight2_tensor},
      __LINE__,
      __FILE__,
      "");
}

// Two chained matmuls whose stream parallelization patterns conflict: the
// intermediate must be fully materialized, so its Deallocate stays at the
// top level rather than inside a loop body.
TEST_F(HostIrPassesTest, TwoMatmulsNotInlinable) {
  constexpr int64_t c = 3;

  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());

  TensorView* in_tv = makeContigTensor(2);
  TensorView* weight1 = makeContigTensor(2);
  TensorView* weight2 = makeContigTensor(2);
  TensorView* mid_tv = matmul(in_tv, weight1);
  TensorView* out_tv = matmul(mid_tv, weight2);

  fusion->addInput(in_tv);
  fusion->addInput(weight1);
  fusion->addInput(weight2);
  fusion->addOutput(out_tv);

  // Stream-parallelize the first matmul along weight1's columns but the
  // second along the output rows, preventing the loops from being inlined.
  weight1->split(1, c, /*inner_split=*/false);
  weight1->axis(1)->parallelize(ParallelType::Stream);
  out_tv->split(0, c, /*inner_split=*/false);
  out_tv->axis(0)->parallelize(ParallelType::Stream);

  FusionExecutorCache fec(std::move(fusion));

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor in_tensor = at::randn({c * 2, 3}, options);
  at::Tensor weight1_tensor = at::randn({3, c * 5}, options);
  at::Tensor weight2_tensor = at::randn({c * 5, 3}, options);

  auto out_tensors =
      fec.runFusionWithInputs({in_tensor, weight1_tensor, weight2_tensor});

  // The intermediate (out1) is fully allocated; its deallocate is at top level.
  FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime();
  const auto& top_level = runtime->getHostIrContainer().topLevelExprs();
  int deallocate_count = 0;
  for (Expr* e : top_level) {
    if (e->isA<hir::Deallocate>()) {
      ++deallocate_count;
    }
  }
  EXPECT_EQ(deallocate_count, 1)
      << "Expected exactly one Deallocate at top level, got "
      << deallocate_count;

  testValidate(
      fec.fusion(),
      out_tensors,
      {in_tensor, weight1_tensor, weight2_tensor},
      __LINE__,
      __FILE__,
      "");
}

} // namespace nvfuser
Loading