Commit 725c327

dolpm authored and pytorchmergebot committed
[nativert] add memory overlap debug assertion (pytorch#157290)

Summary: Better safe than sorry: this will throw if memory overlap is detected when using planned tensors while debug mode is enabled, which will make our planning unit tests more robust.

Test Plan: CI

Rollback Plan:

Differential Revision: D77327841

Pull Request resolved: pytorch#157290

Approved by: https://github.com/SherlockNoMad, https://github.com/zhxchen17
1 parent f87d117 commit 725c327

19 files changed, +604 -76 lines changed
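The assertion itself is not shown on this page, but its spirit is simple: among the tensors the layout planner has packed into a shared buffer, any two whose lifetimes overlap must occupy disjoint byte ranges. The following is only a rough standalone sketch of such a check, with all names (PlannedBuffer, assertNoOverlap) invented for illustration, not taken from this PR:

#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical record of one planned allocation: the byte range
// [offset, offset + size) within the shared slab, plus the interval of
// node indices during which the tensor is live.
struct PlannedBuffer {
  size_t offset;
  size_t size;
  size_t lifetime_start;
  size_t lifetime_end;
};

// Sketch: any two buffers that are live at the same time must occupy
// disjoint byte ranges. O(n^2), which is acceptable for a debug-only check.
inline void assertNoOverlap(const std::vector<PlannedBuffer>& buffers) {
  for (size_t i = 0; i < buffers.size(); ++i) {
    for (size_t j = i + 1; j < buffers.size(); ++j) {
      const auto& a = buffers[i];
      const auto& b = buffers[j];
      const bool lifetimes_overlap = a.lifetime_start <= b.lifetime_end &&
          b.lifetime_start <= a.lifetime_end;
      const bool bytes_overlap =
          a.offset < b.offset + b.size && b.offset < a.offset + a.size;
      assert(!(lifetimes_overlap && bytes_overlap) &&
             "planned tensors with overlapping lifetimes must not share bytes");
    }
  }
}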

test/cpp/nativert/CMakeLists.txt

Lines changed: 9 additions & 0 deletions
@@ -24,6 +24,15 @@ set(NATIVERT_TEST_SRCS
   ${TORCH_ROOT}/torch/nativert/executor/memory/LayoutPlanner.cpp
   ${TORCH_ROOT}/torch/nativert/executor/memory/LayoutManager.cpp
   ${TORCH_ROOT}/torch/nativert/executor/memory/AliasAnalyzer.cpp
+  ${TORCH_ROOT}/torch/nativert/executor/Executor.cpp
+  ${TORCH_ROOT}/torch/nativert/kernels/KernelFactory.cpp
+  ${TORCH_ROOT}/torch/nativert/executor/ConstantFolder.cpp
+  ${TORCH_ROOT}/torch/nativert/executor/GraphExecutorBase.cpp
+  ${TORCH_ROOT}/torch/nativert/executor/SerialGraphExecutor.cpp
+  ${TORCH_ROOT}/torch/nativert/executor/ParallelGraphExecutor.cpp
+  ${TORCH_ROOT}/torch/nativert/kernels/AutoFunctionalizeKernel.cpp
+  ${TORCH_ROOT}/torch/nativert/kernels/CallTorchBindKernel.cpp
+  ${TORCH_ROOT}/torch/nativert/kernels/HigherOrderKernel.cpp
 )
 
 add_executable(test_nativert
Lines changed: 182 additions & 0 deletions
@@ -0,0 +1,182 @@
+#include <gtest/gtest.h>
+
+#include <fmt/format.h>
+
+#include <torch/nativert/executor/memory/AliasAnalyzer.h>
+#include <torch/nativert/graph/Graph.h>
+
+#include <torch/nativert/executor/Executor.h>
+#include <torch/nativert/kernels/KernelFactory.h>
+
+using namespace ::testing;
+using namespace torch::nativert;
+
+using AliasTestCase = std::tuple<
+    std::string /* value */,
+    AllocationLifetime,
+    bool /* is_alias */,
+    bool /* is_storage_associated_with_output */,
+    c10::FastSet<std::string> /* source(s) */>;
+
+class AliasAnalyzerTests : public testing::Test {
+  void SetUp() override {}
+
+  void TearDown() override {
+    test_cases.clear();
+    model.clear();
+  }
+
+ public:
+  void setTestCases(std::vector<AliasTestCase> cases) {
+    test_cases = std::move(cases);
+  }
+
+  void setModel(std::string m) {
+    model = std::move(m);
+  }
+
+  void run() {
+    EXPECT_FALSE(test_cases.empty());
+    EXPECT_FALSE(model.empty());
+
+    ExecutorConfig cfg;
+    cfg.enableStaticCPUKernels = true;
+
+    auto graph = stringToGraph(model);
+    auto kernels = KernelFactory().initializeNodeKernels(
+        *graph, nullptr, cfg, {}, nullptr);
+    auto kernelSchemas = Executor::getKernelSchemas(kernels.nodeKernels);
+
+    AliasAnalyzer analyzer(*graph, kernelSchemas);
+
+    for (auto& [value, lifetime, is_alias, is_storage_associated_with_output,
+                srcs] : test_cases) {
+      LOG(INFO) << fmt::format(
+          "running test: value={}, lifetime=({}, {}), is_alias={}, is_storage_associated_with_output={}, src={}",
+          value,
+          lifetime.start,
+          lifetime.end,
+          is_alias,
+          is_storage_associated_with_output,
+          srcs.empty() ? "{}"
+                       : std::accumulate(
+                             srcs.begin(),
+                             srcs.end(),
+                             std::string{},
+                             [](std::string cur, const std::string& src) {
+                               cur.append(",");
+                               cur.append(src);
+                               return cur;
+                             }));
+
+      auto* v = graph->getValue(value);
+
+      EXPECT_EQ(analyzer.lifetime(v), lifetime);
+      EXPECT_EQ(analyzer.is_alias(v), is_alias);
+      EXPECT_EQ(
+          analyzer.is_storage_associated_with_output(v),
+          is_storage_associated_with_output);
+
+      const auto* resolved_srcs = analyzer.get_sources_of_alias(v);
+      if (resolved_srcs /* ensure set equality between *resolved_srcs and srcs */) {
+        EXPECT_FALSE(srcs.empty());
+        EXPECT_EQ(resolved_srcs->size(), srcs.size());
+        for (const auto& resolved_src : *resolved_srcs) {
+          EXPECT_TRUE(srcs.erase(std::string(resolved_src->name())) == 1);
+        }
+        EXPECT_TRUE(srcs.empty());
+      } else {
+        EXPECT_TRUE(srcs.empty());
+      }
+    }
+  }
+
+ private:
+  std::string model;
+  std::vector<AliasTestCase> test_cases;
+};
+
+TEST_F(AliasAnalyzerTests, TestNoAlias) {
+  setModel(R"(
+graph(%y0, %y1):
+  %out_t = torch.ops.aten.matmul.default(self=%y0, other=%y1)
+  %res = torch.ops.aten.clone.default(self=%out_t, memory_format=None)
+  return (%res))");
+
+  setTestCases({
+      {"out_t", AllocationLifetime(1, 2), false, false, {}},
+      {"res", AllocationLifetime(2, 3), false, true, {}},
+  });
+
+  run();
+}
+
+TEST_F(AliasAnalyzerTests, TestSimpleAlias) {
+  setModel(R"(
+graph(%y0, %y1):
+  %out_t = torch.ops.aten.matmul.default(self=%y0, other=%y1)
+  %res = torch.ops.aten.slice.Tensor(self=%out_t, dim=1, start=0, end=0, step=1)
+  return (%res))");
+
+  setTestCases({
+      {"out_t", AllocationLifetime(1, 3), false, true, {}},
+      {"res", AllocationLifetime(2, 3), true, false, {"out_t"}},
+  });
+
+  run();
+}
+
+TEST_F(AliasAnalyzerTests, TestDeepAlias) {
+  setModel(R"(
+graph(%y0, %y1):
+  %out_t = torch.ops.aten.matmul.default(self=%y0, other=%y1)
+  %a1 = torch.ops.aten.slice.Tensor(self=%out_t, dim=1, start=0, end=0, step=1)
+  %res = torch.ops.aten.slice.Tensor(self=%a1, dim=1, start=0, end=0, step=1)
+  return (%res))");
+
+  setTestCases({
+      {"out_t", AllocationLifetime(1, 4), false, true, {}},
+      {"a1", AllocationLifetime(2, 4), true, false, {"out_t"}},
+      {"res", AllocationLifetime(3, 4), true, false, {"out_t"}},
+  });
+
+  run();
+}
+
+TEST_F(AliasAnalyzerTests, TestPackedListUnpack) {
+  setModel(R"(
+graph(%a, %b, %c, %d):
+  %input_list[] = prim.ListPack(l0=%a, l1=%b, l2=%c, l3=%d)
+  %x0, %x1, %x2, %x3 = prim.ListUnpack(input=%input_list)
+  return (%x1, %x3))");
+
+  setTestCases({
+      {"a", AllocationLifetime(0, 2), false, false, {}},
+      {"x0", AllocationLifetime(2, 2), true, false, {"a"}},
+      {"b", AllocationLifetime(0, 3), false, true, {}},
+      {"x1", AllocationLifetime(2, 3), true, false, {"b"}},
+      {"c", AllocationLifetime(0, 2), false, false, {}},
+      {"x2", AllocationLifetime(2, 2), true, false, {"c"}},
+      {"d", AllocationLifetime(0, 3), false, true, {}},
+      {"x3", AllocationLifetime(2, 3), true, false, {"d"}},
+  });
+
+  run();
+}
+
+TEST_F(AliasAnalyzerTests, TestAmbiguousSourceOfAlias) {
+  setModel(R"(
+graph(%y0, %y1):
+  %out_t = torch.ops.aten.matmul.default(self=%y0, other=%y1)
+  %out_t2 = torch.ops.aten.matmul.default(self=%y0, other=%y1)
+  %a1 = prim.VarStack(l0=%out_t, l1=%out_t2)
+  %res = torch.ops.aten.slice.Tensor(self=%a1, dim=1, start=0, end=0, step=1)
+  return (%res))");
+
+  setTestCases({
+      {"out_t", AllocationLifetime(1, 5), false, true, {}},
+      {"out_t2", AllocationLifetime(2, 5), false, true, {}},
+      {"a1", AllocationLifetime(3, 5), true, false, {"out_t", "out_t2"}},
+      {"res", AllocationLifetime(4, 5), true, false, {"out_t", "out_t2"}},
+  });
+
+  run();
+}
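A pattern worth noting in the expectations above: lifetime endpoints are node indices in execution order, and an alias's lifetime always falls within that of the value whose storage it borrows; the cases also suggest that a source's lifetime is extended to cover its aliases (e.g. out_t ends at the return node whenever one of its views is a graph output). A minimal sketch of that containment invariant, with hypothetical names not taken from this PR:

#include <cstddef>

// Sketch: an alias never outlives its source, since the source's storage
// must stay valid for every view onto it.
struct Interval {
  size_t start;
  size_t end;
};

inline bool contains(const Interval& source, const Interval& alias) {
  return source.start <= alias.start && alias.end <= source.end;
}

// e.g. in TestDeepAlias: out_t spans (1, 4) and contains a1 (2, 4) and res (3, 4).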

torch/nativert/executor/ExecutionFrame.h

Lines changed: 4 additions & 3 deletions
@@ -46,13 +46,14 @@ class ExecutionFrame {
   }
 
   template <typename CB>
-  auto withMemoryPlanner(CB&& cb) {
+  auto withManagedMemory(CB&& cb) {
     if (!layoutManager_) {
-      return std::forward<CB>(cb)();
+      return std::forward<CB>(cb)(nullptr);
     }
 
     LayoutManagerGuard guard(*layoutManager_);
-    return std::forward<CB>(cb)();
+    return std::forward<CB>(cb)(
+        const_cast<const LayoutManager*>(layoutManager_.get()));
   }
 
   std::vector<c10::IValue> tryMoveUserOutputs();
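The rename from withMemoryPlanner to withManagedMemory also changes the callback's shape: it now receives the frame's LayoutManager, or nullptr when memory planning is disabled, so callers can run layout checks against planned memory. A hypothetical caller might look like the following (executeGraph and the commented-out debug hook are stand-ins, not APIs from this diff):

auto outputs = frame.withManagedMemory(
    [&](const LayoutManager* layoutManager) {
      auto result = executeGraph(frame);  // stand-in for the real execution call
      if (layoutManager != nullptr) {
        // in debug builds, the planned layout could be validated here, e.g.
        // checking that no two live planned tensors share bytes (assumed hook)
      }
      return result;
    });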

torch/nativert/executor/Executor.cpp

Lines changed: 19 additions & 16 deletions
@@ -19,30 +19,31 @@ namespace torch::nativert {
 Executor::Executor(
     torch::nativert::ExecutorConfig executorConfig,
     std::shared_ptr<Graph> graph,
-    std::shared_ptr<Weights> weights,
-    const Placement& placement,
-    std::shared_ptr<caffe2::serialize::PyTorchStreamReader> pytorchStreamReader,
-    const MakeProxyExecutorFn& makeProxyExecutorFunc)
+    const std::shared_ptr<Weights>& weights,
+    Placement placement,
+    const std::shared_ptr<caffe2::serialize::PyTorchStreamReader>&
+        pytorchStreamReader,
+    MakeProxyExecutorFn makeProxyExecutorFunc)
     : executorConfig_(std::move(executorConfig)),
       graph_(std::move(graph)),
-      placement_(placement),
+      placement_(std::move(placement)),
       constantFolder_(
           executorConfig_.runConstFolding
              ? std::optional<ConstantFolder>(*graph_)
              : std::nullopt),
-      makeProxyExecutorFunc_(makeProxyExecutorFunc),
+      makeProxyExecutorFunc_(std::move(makeProxyExecutorFunc)),
       executionFrames_(executorConfig_.maxNumConcurrentThreads),
       clearedExecutionFrames_(executorConfig_.maxNumConcurrentThreads),
       numExecutionFrames_(0),
       lastClearedTimestamp_(getCurrentTimestampSeconds()) {
   if (weights) {
-    initialize(std::move(weights), std::move(pytorchStreamReader));
+    initialize(weights, pytorchStreamReader);
   }
 }
 
 void Executor::initialize(
-    std::shared_ptr<Weights> weights,
-    std::shared_ptr<caffe2::serialize::PyTorchStreamReader>
+    const std::shared_ptr<Weights>& weights,
+    const std::shared_ptr<caffe2::serialize::PyTorchStreamReader>&
        pytorchStreamReader) {
   auto start = std::chrono::high_resolution_clock::now();
 
@@ -51,7 +52,7 @@ void Executor::initialize(
       weights,
       executorConfig_,
       placement_,
-      std::move(pytorchStreamReader),
+      pytorchStreamReader,
       makeProxyExecutorFunc_);
 
   if (constantFolder_.has_value()) {
@@ -113,13 +114,14 @@ void Executor::atomicSwapWeights(std::shared_ptr<Weights> weights) {
   }
 }
 
-void Executor::maybeRunConstantFolding(std::shared_ptr<Weights> weights) {
+void Executor::maybeRunConstantFolding(
+    const std::shared_ptr<Weights>& weights) {
   for (auto& execution : constFoldingExecutions_) {
     ExecutionFrame constFoldingFrame(execution.executor->graph());
     std::vector<c10::IValue> inputs;
     inputs.reserve(graph_->signature().inputsToWeights().size());
     for (const auto& [_, name] : graph_->signature().inputsToWeights()) {
-      inputs.push_back(weights->at(name));
+      inputs.emplace_back(weights->at(name));
     }
 
     auto outputs = execution.executor->execute(constFoldingFrame, inputs);
@@ -130,7 +132,7 @@ void Executor::maybeRunConstantFolding(std::shared_ptr<Weights> weights) {
   }
 }
 
-void Executor::processWeights(std::shared_ptr<Weights> weights) {
+void Executor::processWeights(const std::shared_ptr<Weights>& weights) {
   maybeRunConstantFolding(weights);
   if (constantFolder_.has_value()) {
     constantFolder_->evaluate(*weights);
@@ -352,10 +354,10 @@ std::vector<c10::IValue> Executor::execute(
 }
 
 ProfileMetrics Executor::benchmarkIndividualNodes(
-    std::vector<std::vector<c10::IValue>> inputsList,
+    const std::vector<std::vector<c10::IValue>>& inputsList,
     const uint32_t warmupRuns,
     const uint32_t mainRuns) {
-  CHECK(inputsList.size() > 0) << "Need at least one input to benchmark";
+  CHECK(!inputsList.empty()) << "Need at least one input to benchmark";
   CHECK(warmupRuns >= 1 && mainRuns >= 1) << "Need at least one run";
 
   for (const auto& inputs : inputsList) {
@@ -378,8 +380,9 @@ int64_t Executor::getCurrentTimestampSeconds() const {
 
 std::vector<DelegateExecutor*> Executor::getDelegates() {
   std::vector<DelegateExecutor*> delegates;
+  delegates.reserve(delegateExecutors_.size());
   for (const auto& delegateExecutor : delegateExecutors_) {
-    delegates.push_back(delegateExecutor.get());
+    delegates.emplace_back(delegateExecutor.get());
   }
   return delegates;
 }
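Most of these signature changes follow one pattern: a std::shared_ptr parameter that is only observed is taken by const reference, avoiding an atomic refcount increment on every call, while parameters whose ownership is genuinely transferred (placement, makeProxyExecutorFunc) are taken by value and moved. A minimal self-contained illustration of the difference (Weights here is just a stand-in type):

#include <memory>

struct Weights {};

// Pass-by-value copies the shared_ptr: an atomic refcount bump on every
// call, even though the callee only reads through the pointer.
void byValue(std::shared_ptr<Weights> w) { (void)w; }

// Pass-by-const-ref observes the same object with no refcount traffic;
// this is the pattern applied to processWeights() and initialize() above.
void byConstRef(const std::shared_ptr<Weights>& w) { (void)w; }

int main() {
  auto weights = std::make_shared<Weights>();
  byValue(weights);     // use_count: 1 -> 2, back to 1 on return
  byConstRef(weights);  // use_count stays 1 throughout
}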

torch/nativert/executor/Executor.h

Lines changed: 9 additions & 9 deletions
@@ -79,19 +79,19 @@ class Executor {
   Executor(
       torch::nativert::ExecutorConfig executorConfig,
       std::shared_ptr<Graph> graph,
-      std::shared_ptr<Weights> weights,
-      const Placement& placement = Placement(),
-      std::shared_ptr<caffe2::serialize::PyTorchStreamReader>
+      const std::shared_ptr<Weights>& weights,
+      Placement placement = Placement(),
+      const std::shared_ptr<caffe2::serialize::PyTorchStreamReader>&
          pytorchStreamReader = nullptr,
-      const MakeProxyExecutorFn& makeProxyExecutorFunc = nullptr);
+      MakeProxyExecutorFn makeProxyExecutorFunc = nullptr);
 
   std::shared_ptr<Weights> getWeights() {
     std::shared_ptr<Weights> ret;
     weights_.withLock([&](auto& w) { ret = w; });
     return ret;
   }
 
-  void processWeights(std::shared_ptr<Weights> weights);
+  void processWeights(const std::shared_ptr<Weights>& weights);
   void atomicSwapWeights(std::shared_ptr<Weights> weights);
 
   // This API only returns the flattened UserOutputs,
@@ -106,7 +106,7 @@ class Executor {
       const ITreeSpec& inputTreeSpec);
 
   ProfileMetrics benchmarkIndividualNodes(
-      std::vector<std::vector<c10::IValue>> inputsList,
+      const std::vector<std::vector<c10::IValue>>& inputsList,
       const uint32_t warmupRuns,
       const uint32_t mainRuns);
 
@@ -141,8 +141,8 @@ class Executor {
   c10::Synchronized<std::shared_ptr<Weights>> weights_;
 
   void initialize(
-      std::shared_ptr<Weights> weights,
-      std::shared_ptr<caffe2::serialize::PyTorchStreamReader>
+      const std::shared_ptr<Weights>& weights,
+      const std::shared_ptr<caffe2::serialize::PyTorchStreamReader>&
          pytorchStreamReader);
 
   ExecutorFramePtr getExecutorFrameFromPool();
@@ -171,7 +171,7 @@ class Executor {
     ExecutionFrameEntry& operator=(const ExecutionFrameEntry&) = delete;
   };
 
-  void maybeRunConstantFolding(std::shared_ptr<Weights> weights);
+  void maybeRunConstantFolding(const std::shared_ptr<Weights>& weights);
   void validateInputs(const std::vector<c10::IValue>& inputs) const;
 
   // Helper method to get current timestamp in seconds

torch/nativert/executor/GraphExecutorBase.cpp

Lines changed: 2 additions & 2 deletions
@@ -32,7 +32,7 @@ void GraphExecutorBase::fillUserInputs(
 
 ProfileMetrics GraphExecutorBase::benchmarkIndividualNodes(
     ExecutionFrame& executionFrame,
-    std::vector<std::vector<c10::IValue>> inputsList,
+    const std::vector<std::vector<c10::IValue>>& inputsList,
     const uint32_t warmupRuns,
     const uint32_t mainRuns) {
   // TODO: add support for memory profiling
@@ -112,7 +112,7 @@ ProfileMetrics GraphExecutorBase::benchmarkIndividualNodes(
   results.totalNodesCount = numNodes;
   for (const auto& r : results.timePerNodeType) {
     const std::string& target = r.first;
-    results.percentPerNodeType[target] = r.second * 100.0 / results.totalTime;
+    results.percentPerNodeType[target] = r.second * 100.0f / results.totalTime;
   }
   return results;
 }
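The 100.0 -> 100.0f change keeps the percentage arithmetic in float: multiplying a float by the double literal 100.0 promotes the whole expression to double and then narrows on the store into percentPerNodeType (whose values are evidently float, judging by this change). In isolation:

int main() {
  float part = 1.0f;
  float total = 3.0f;
  float viaDouble = part * 100.0 / total;   // double math, then a narrowing store (may warn)
  float viaFloat = part * 100.0f / total;   // float math end to end
  (void)viaDouble;
  (void)viaFloat;
}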

torch/nativert/executor/GraphExecutorBase.h

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ class GraphExecutorBase {
 
   ProfileMetrics benchmarkIndividualNodes(
       ExecutionFrame& executionFrame,
-      std::vector<std::vector<c10::IValue>> inputs,
+      const std::vector<std::vector<c10::IValue>>& inputs,
       const uint32_t warmup_runs,
       const uint32_t main_runs);
 