Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1105,6 +1105,7 @@ if(BUILD_TEST)
list(APPEND HOSTIR_TEST_SRCS
${NVFUSER_ROOT}/tests/cpp/test_host_ir_evaluator.cpp
${NVFUSER_ROOT}/tests/cpp/test_host_ir_integration.cpp
${NVFUSER_ROOT}/tests/cpp/test_host_ir_passes.cpp
${NVFUSER_ROOT}/tests/cpp/test_host_ir_stream_lowering.cpp
${NVFUSER_ROOT}/tests/cpp/test_host_irs.cpp
)
Expand Down
100 changes: 68 additions & 32 deletions csrc/host_ir/allocate_and_deallocate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
#include <unordered_set>
#include <vector>

#include "fusion.h"
#include "host_ir/ir.h"
#include "ir/builder.h"
#include "ir/utils.h"

Expand All @@ -28,8 +30,8 @@ class DominatorTree {
public:
class Node {
public:
Node(Scope* scope, Scope::Iterator iterator)
: scope_(scope), iterator_(iterator) {}
Node(Scope* scope, Scope::Iterator iterator, Node* parent)
: scope_(scope), iterator_(iterator), parent_(parent) {}
Node(const Node& other) = delete;
Node(Node&& other) = delete;
Node& operator=(const Node& other) = delete;
Expand All @@ -51,6 +53,10 @@ class DominatorTree {
return iterator_;
}

const Node* parent() const {
return parent_;
}

Expr* getExpr() const {
return *iterator_;
}
Expand All @@ -60,6 +66,7 @@ class DominatorTree {
// They are only needed when the user wants to modify the host IR.
Scope* scope_;
Scope::Iterator iterator_;
Node* parent_;

std::vector<Node*> children_;
};
Expand Down Expand Up @@ -108,7 +115,8 @@ class DominatorTree {
for (auto scope_it = scope.exprs().begin(); scope_it != scope.exprs().end();
++scope_it) {
Expr* e = *scope_it;
auto [node_it, inserted] = nodes_.try_emplace(e, &scope, scope_it);
auto [node_it, inserted] =
nodes_.try_emplace(e, &scope, scope_it, parent);
NVF_ERROR(inserted);
Node& node = node_it->second;
if (parent != nullptr) {
Expand Down Expand Up @@ -186,6 +194,23 @@ void insertAllocations(hir::HostIrContainer& hic) {
});
}

// Returns true if `tv` is an intermediate tensor whose buffer this pass must
// free with an hir::Deallocate after its last use.
//
// Excluded from deallocation:
// - fusion inputs and outputs: their storage is owned by the caller;
// - tensors defined by ShardByStream: presumably they view into another
//   tensor's buffer rather than owning one -- TODO confirm;
// - outputs aliased as AllocationType::ReuseBuffer: they write into a
//   caller-provided buffer in place.
bool needsDeallocation(TensorView* tv) {
  if (tv->isFusionInput() || tv->isFusionOutput()) {
    return false;
  }
  // A non-input TensorView normally has a definition; guard against nullptr
  // anyway so a malformed container skips this check instead of crashing.
  if (Expr* def = tv->definition(); def != nullptr && def->isA<ShardByStream>()) {
    return false;
  }
  const AliasInfo& alias_info = tv->container()->getOutputAlias(tv);
  return alias_info.type != AllocationType::ReuseBuffer;
}

void insertDeallocations(hir::HostIrContainer& hic) {
const std::list<Expr*>& top_level_exprs = hic.topLevelExprs();
std::for_each(top_level_exprs.begin(), top_level_exprs.end(), [](Expr* expr) {
Expand All @@ -196,39 +221,50 @@ void insertDeallocations(hir::HostIrContainer& hic) {
expr);
});

// For each input in every expression in the container, find the position of
// its last use and insert a deallocate directly after, except for fusion
// inputs and outputs.
std::unordered_set<TensorView*> last_use_found;
for (auto insertion_point = top_level_exprs.end();
insertion_point != top_level_exprs.begin();) {
auto prev = std::prev(insertion_point);
Expr* e = *prev;

// Only tensors need to be allocated.
for (auto* in : ir_utils::filterByType<TensorView>(e->inputs())) {
// Fusion inputs are managed by the caller.
if (in->isFusionInput()) {
continue;
}

// Fusion outputs need to be kept alive for the caller.
if (in->isFusionOutput()) {
continue;
}
DominatorTree dom_tree(hic);
std::unordered_map<TensorView*, const DominatorTree::Node*> outermost_scope;
std::unordered_map<TensorView*, const DominatorTree::Node*> last_use;

// Skip if `e` is not the last use.
if (!last_use_found.insert(in).second) {
continue;
}
dom_tree.depthFirstTraverse(
/*pre_fn=*/
[&](const DominatorTree::Node* node) {
Expr* e = node->getExpr();
if (auto* alloc = dynamic_cast<kir::Allocate*>(e)) {
outermost_scope.try_emplace(alloc->buffer()->as<TensorView>(), node);
}
for (auto* input : ir_utils::filterByType<TensorView>(e->inputs())) {
outermost_scope.try_emplace(input, node);
}
},
/*post_fn=*/
[&](const DominatorTree::Node* node) {
Expr* e = node->getExpr();
if (auto* alloc = dynamic_cast<kir::Allocate*>(e)) {
last_use.try_emplace(alloc->buffer()->as<TensorView>(), node);
}
for (auto* in : ir_utils::filterByType<TensorView>(e->inputs())) {
last_use.try_emplace(in, node);
}
});

auto* deallocate = IrBuilder::create<hir::Deallocate>(in);
hic.insertExprBefore(insertion_point, deallocate);
for (const auto& [allocated_tv, outermost_scope_node] : outermost_scope) {
if (!needsDeallocation(allocated_tv)) {
continue;
}
const DominatorTree::Node* last_use_node = last_use.at(allocated_tv);
Scope* outermost_scope = outermost_scope_node->scope();

// Insert in allocation_scope, after the node that contains last_use.
// Walk up from last_use until we reach allocation_scope (no-op when same).
const DominatorTree::Node* insert_after = last_use_node;
while (insert_after->scope() != outermost_scope) {
insert_after = insert_after->parent();
NVF_ERROR(
insert_after != nullptr, "Allocation scope must dominate last use");
}

// Don't `--insertion_point;` because we'd like to skip newly inserted
// deallocations.
insertion_point = prev;
auto* deallocate = IrBuilder::create<hir::Deallocate>(allocated_tv);
outermost_scope->insert(std::next(insert_after->iterator()), deallocate);
}
}

Expand Down
5 changes: 5 additions & 0 deletions csrc/host_ir/jit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
#include "ir/all_nodes.h"
#include "linked_hash_map.h"
#include "ops/all_ops.h"
#include "options.h"
#include "polymorphic_value.h"
#include "runtime/compiled_kernel.h"
#include "runtime/executor.h"
Expand Down Expand Up @@ -909,6 +910,10 @@ HostIrJitImpl::HostIrJitImpl(
: container_(std::move(container)) {
FUSER_PERF_SCOPE("HostIrJitImpl::HostIrJitImpl");

if (isDebugDumpEnabled(DebugDumpOption::HostIr)) {
container_->print(debug());
}

// Initialize LLVM
llvm::InitializeNativeTarget();
llvm::InitializeNativeTargetAsmPrinter();
Expand Down
145 changes: 145 additions & 0 deletions tests/cpp/test_host_ir_passes.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on

// This file contains integration tests that run fusions through
// FusionExecutorCache with host IR lowering turned on.
#include <algorithm>

#include <gmock/gmock-matchers.h>
#include <gmock/gmock-more-matchers.h>
#include <gtest/gtest.h>

#include "fusion.h"
#include "host_ir/ir.h"
#include "ir/all_nodes.h"
#include "ops/all_ops.h"
#include "options.h"
#include "runtime/fusion_kernel_runtime.h"
#include "tests/cpp/utils.h"
#include "tests/cpp/validator.h"

namespace nvfuser {

// Fixture for host IR pass tests: enables host IR lowering so that fusions
// run in this file are executed through the host IR path.
class HostIrPassesTest : public NVFuserTest {
 protected:
  HostIrPassesTest() {
    // Flip the option in the fixture constructor so it is in effect for the
    // whole test body; presumably restored by the base fixture's options
    // guard when the test ends -- confirm against NVFuserTest.
    EnableOptionsGuard::getCurOptions().set(EnableOption::HostIrLowering);
  }
};

// Two chained matmuls whose consumer loop nests can be inlined: the
// intermediate should live (and die) inside the stream for-loop.
TEST_F(HostIrPassesTest, TwoMatmulsInlinable) {
  constexpr int64_t c = 3;

  DisableOptionsGuard dog;
  DisableOptionsGuard::getCurOptions().set(DisableOption::InferContiguity);

  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());

  TensorView* in_tv = makeContigTensor(2);
  TensorView* weight1 = makeContigTensor(2);
  TensorView* weight2 = makeContigTensor(2);
  TensorView* mid_tv = matmul(in_tv, weight1);
  TensorView* out_tv = matmul(mid_tv, weight2);

  fusion->addInput(in_tv);
  fusion->addInput(weight1);
  fusion->addInput(weight2);
  fusion->addOutput(out_tv);

  // Stream-parallelize the outer dimension of the input.
  in_tv->outer_split(0, c);
  in_tv->axis(0)->parallelize(ParallelType::Stream);

  FusionExecutorCache fec(std::move(fusion));

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor in_tensor = at::randn({c * 2, 3}, options);
  at::Tensor weight1_tensor = at::randn({3, 5}, options);
  at::Tensor weight2_tensor = at::randn({5, 3}, options);

  auto out_tensors =
      fec.runFusionWithInputs({in_tensor, weight1_tensor, weight2_tensor});

  // Both matmuls are inlined in the same loop; `intermediate` is
  // allocated and deallocated within the loop body.
  FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime();
  const auto& top_level = runtime->getHostIrContainer().topLevelExprs();

  // Locate the first stream for-loop at the top level.
  hir::ForLoop* loop = nullptr;
  for (Expr* e : top_level) {
    if (e->isA<hir::ForLoop>()) {
      loop = e->as<hir::ForLoop>();
      break;
    }
  }
  ASSERT_NE(loop, nullptr);

  // Count the Deallocate nodes inside the loop body.
  int deallocate_count = 0;
  for (Expr* e : loop->body().exprs()) {
    if (e->isA<hir::Deallocate>()) {
      ++deallocate_count;
    }
  }
  EXPECT_EQ(deallocate_count, 1)
      << "Expected for-loop body to have exactly one Deallocate, got "
      << deallocate_count;

  testValidate(
      fec.fusion(),
      out_tensors,
      {in_tensor, weight1_tensor, weight2_tensor},
      __LINE__,
      __FILE__,
      "");
}

// Two chained matmuls whose stream parallelization patterns conflict: the
// intermediate must be fully materialized, so its Deallocate stays at the
// top level rather than inside a loop body.
TEST_F(HostIrPassesTest, TwoMatmulsNotInlinable) {
  constexpr int64_t c = 3;

  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());

  TensorView* in_tv = makeContigTensor(2);
  TensorView* weight1 = makeContigTensor(2);
  TensorView* weight2 = makeContigTensor(2);
  TensorView* mid_tv = matmul(in_tv, weight1);
  TensorView* out_tv = matmul(mid_tv, weight2);

  fusion->addInput(in_tv);
  fusion->addInput(weight1);
  fusion->addInput(weight2);
  fusion->addOutput(out_tv);

  // Stream-parallelize the first matmul along weight1's columns but the
  // second along the output rows, preventing the loops from being inlined.
  weight1->split(1, c, /*inner_split=*/false);
  weight1->axis(1)->parallelize(ParallelType::Stream);
  out_tv->split(0, c, /*inner_split=*/false);
  out_tv->axis(0)->parallelize(ParallelType::Stream);

  FusionExecutorCache fec(std::move(fusion));

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor in_tensor = at::randn({c * 2, 3}, options);
  at::Tensor weight1_tensor = at::randn({3, c * 5}, options);
  at::Tensor weight2_tensor = at::randn({c * 5, 3}, options);

  auto out_tensors =
      fec.runFusionWithInputs({in_tensor, weight1_tensor, weight2_tensor});

  // The intermediate (out1) is fully allocated; its deallocate is at top level.
  FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime();
  const auto& top_level = runtime->getHostIrContainer().topLevelExprs();
  int deallocate_count = 0;
  for (Expr* e : top_level) {
    if (e->isA<hir::Deallocate>()) {
      ++deallocate_count;
    }
  }
  EXPECT_EQ(deallocate_count, 1)
      << "Expected exactly one Deallocate at top level, got "
      << deallocate_count;

  testValidate(
      fec.fusion(),
      out_tensors,
      {in_tensor, weight1_tensor, weight2_tensor},
      __LINE__,
      __FILE__,
      "");
}

} // namespace nvfuser
Loading