@@ -73,7 +73,6 @@ void SetExplicitFallbackNodes(PartitioningCtx* ctx, torch::jit::Block* block) {
       // Set the rest of the nodes to TensorRT
       ctx->setNodeExecutorDecision(n, NodeExecutorDecision::kCONVERT);
     }
-
   }
   return;
 }
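SetExplicitFallbackNodes fills a per-node decision table that later passes query through shouldNodeRunInTensorRT/shouldNodeRunInTorch. A minimal sketch of what such a lookup table might look like, assuming a simple map inside PartitioningCtx (the real member layout and most enum members are not shown in this diff, so treat names here as assumptions):

    #include <unordered_map>

    namespace torch { namespace jit { struct Node; } }  // forward declaration, for the sketch only

    // kNON_TENSOR and kCONVERT appear in this diff; the other members are assumptions.
    enum class NodeExecutorDecision { kUNSUPPORTED, kOPERATOR_FALLBACK, kMODULE_FALLBACK, kNON_TENSOR, kCONVERT };

    struct DecisionLUT {
      std::unordered_map<torch::jit::Node*, NodeExecutorDecision> decisions;

      void setNodeExecutorDecision(torch::jit::Node* n, NodeExecutorDecision d) { decisions[n] = d; }

      bool shouldNodeRunInTensorRT(torch::jit::Node* n) const {
        auto it = decisions.find(n);
        return it != decisions.end() && it->second == NodeExecutorDecision::kCONVERT;
      }

      // Simplified: the real context may treat unset entries differently.
      bool shouldNodeRunInTorch(torch::jit::Node* n) const { return !shouldNodeRunInTensorRT(n); }
    };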
@@ -103,7 +102,8 @@ void SetNonTensorConnectedNodes(PartitioningCtx* ctx, std::vector<torch::jit::No
       if (!isTensor(output)) {
         for (auto use : output->uses()) {
           auto node = use.user;
-          if (node->kind() != torch::jit::prim::Constant && ctx->shouldNodeRunInTensorRT(node)) {
+          if (node->kind() != torch::jit::prim::Constant && node->kind() != torch::jit::prim::Return &&
+              ctx->shouldNodeRunInTensorRT(node)) {
             ctx->setNodeExecutorDecision(node, NodeExecutorDecision::kNON_TENSOR);
             q.push(node);
           }
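The added prim::Return check matters because a block's return node is a graph sink with no meaningful executor decision, so it must be excluded before the lookup table is queried. The surrounding loop is a standard worklist propagation; a self-contained toy version over integer node ids:

    #include <queue>
    #include <set>
    #include <vector>

    // Toy version of the nonTensor propagation: an edge u -> v means
    // "v consumes a nonTensor output of u". Starting from the initial fallback
    // set, every reachable consumer is demoted as well.
    std::set<int> PropagateFallback(const std::vector<std::vector<int>>& non_tensor_users,
                                    std::set<int> fallback) {
      std::queue<int> q;
      for (int n : fallback) q.push(n);
      while (!q.empty()) {
        int cur = q.front();
        q.pop();
        for (int user : non_tensor_users[cur]) {
          if (fallback.insert(user).second) {  // analogous to flipping kCONVERT -> kNON_TENSOR
            q.push(user);
          }
        }
      }
      return fallback;
    }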
@@ -175,7 +175,7 @@ bool isModifyingNodes(torch::jit::Node* node, torch::jit::Value* val) {
   return false;
 }
 
-std::vector<torch::jit::Node*> findModifyingNodes(
+std::vector<torch::jit::Node*> FindModifyingNodes(
     torch::jit::Value* val,
     const std::unordered_set<torch::jit::Node*>& seg_block_nodes) {
   std::vector<torch::jit::Node*> modifying_nodes;
@@ -192,7 +192,7 @@ std::vector<torch::jit::Node*> findModifyingNodes(
 }
 
 // this function is only used when a TRT segment produces nonTensor values which are used by a later TRT segment
-std::vector<torch::jit::Node*> getDependencyNodes(
+std::vector<torch::jit::Node*> GetDependencyNodes(
     const std::vector<torch::jit::Value*>& vals,
     const SegmentedBlock& seg_block) {
   // get all nodes in the segmented block
@@ -208,7 +208,7 @@ std::vector<torch::jit::Node*> getDependencyNodes(
     auto node = cur_val->node();
     if (node->kind() != torch::jit::prim::Constant && !visited.count(node)) {
       visited.insert(node);
-      auto modifying_nodes = findModifyingNodes(cur_val, seg_block_nodes);
+      auto modifying_nodes = FindModifyingNodes(cur_val, seg_block_nodes);
       stk.insert(stk.end(), modifying_nodes.rbegin(), modifying_nodes.rend());
       stk.push_back(node);
       for (auto input : node->inputs()) {
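GetDependencyNodes walks backwards from a segment's unresolved nonTensor inputs, collecting every transitive producer (splicing in any in-place-modifying nodes found by FindModifyingNodes) and finally returning the list so producers precede consumers. The same producers-first ordering can be obtained with a recursive postorder walk; a toy version over integer node ids (the real code traverses torch::jit::Value objects with an explicit worklist):

    #include <unordered_map>
    #include <unordered_set>
    #include <vector>

    // Toy dependency walk: emit every transitive producer of `node` before the node itself.
    void CollectDeps(int node,
                     const std::unordered_map<int, std::vector<int>>& inputs_of,
                     std::unordered_set<int>& visited,
                     std::vector<int>& out) {
      if (!visited.insert(node).second) return;  // already emitted
      auto it = inputs_of.find(node);
      if (it != inputs_of.end()) {
        for (int producer : it->second) CollectDeps(producer, inputs_of, visited, out);
      }
      out.push_back(node);  // postorder: all producers are already in `out`
    }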
@@ -222,7 +222,7 @@ std::vector<torch::jit::Node*> getDependencyNodes(
   return stk;
 }
 
-void resolveTRTNonTensorInputs(PartitioningCtx* ctx, torch::jit::Block* block) {
+void ResolveTRTNonTensorInputs(PartitioningCtx* ctx, torch::jit::Block* block) {
   // if a TRT segment has nonTensor inputs, the nodes that produce these nonTensor inputs must be in another TensorRT engine
   // because we have already found the interface between Torch and TRT in the segmentation phase
   // what we do here is just find the dependency nodes of the TRT segments that have nonTensor inputs
@@ -236,16 +236,19 @@ void resolveTRTNonTensorInputs(PartitioningCtx* ctx, torch::jit::Block* block) {
         }
       }
       if (!inputs_to_resolve.empty()) {
-        std::vector<torch::jit::Node*> dependency_nodes = getDependencyNodes(inputs_to_resolve, cur_partitioned_block[i]);
+        std::vector<torch::jit::Node*> dependency_nodes =
+            GetDependencyNodes(inputs_to_resolve, cur_partitioned_block[i]);
         dependency_nodes.insert(
-            dependency_nodes.end(), cur_partitioned_block[i].raw_nodes().begin(), cur_partitioned_block[i].raw_nodes().end());
+            dependency_nodes.end(),
+            cur_partitioned_block[i].raw_nodes().begin(),
+            cur_partitioned_block[i].raw_nodes().end());
         cur_partitioned_block[i] = SegmentedBlock(SegmentedBlock::kTensorRT, dependency_nodes);
       }
     }
   }
 }
 
-void registerSegmentsOutputs(PartitioningCtx* ctx, torch::jit::Block* block) {
+void RegisterSegmentsOutputs(PartitioningCtx* ctx, torch::jit::Block* block) {
   // find the corresponding raw values in the original global graph for this segmented block's inputs/outputs
   PartitionedGraph& cur_partitioned_block = ctx->partitioned_blocks[block];
   auto cmp = [](torch::jit::Value* a, torch::jit::Value* b) { return a->unique() < b->unique(); };
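After the dependency nodes are gathered, the rebuild in ResolveTRTNonTensorInputs is a plain concatenation, producers first, then the segment's own raw nodes, followed by reconstructing the SegmentedBlock. Continuing the toy CollectDeps above (all variable names here are illustrative, not the real SegmentedBlock API):

    // Rebuild a segment so it also carries the producers of its nonTensor inputs.
    std::unordered_set<int> visited;
    std::vector<int> rebuilt;
    for (int input : inputs_to_resolve) CollectDeps(input, inputs_of, visited, rebuilt);
    rebuilt.insert(rebuilt.end(), segment_nodes.begin(), segment_nodes.end());
    // segment = SegmentedBlock(SegmentedBlock::kTensorRT, rebuilt) in the real code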
@@ -331,21 +334,46 @@ void finalizeNewBlock(
   LOG_DEBUG(g.back());
 }
 
+void SetNodeExecutorLUT(PartitioningCtx* ctx, torch::jit::Block* block) {
+  // First, find all the explicit fallback nodes that should run in Torch:
+  // 1. nodes that are unsupported
+  // 2. nodes that the user specifies to run in Torch
+  // 3. nodes whose containing module the user specifies to run in Torch
+  // At the same time, set all the remaining nodes to NodeExecutorDecision::kCONVERT
+  SetExplicitFallbackNodes(ctx, block);
+
+  // Second, check if there are nonTensor inputs/outputs for the block; if there are, then fall back the nodes that
+  // consume/produce these nonTensor values
+  SetInputsOutputsConnectedNodes(ctx, block);
+
+  // Third, for each fallback node, if it consumes any nonTensor inputs, then the nodes that produce those
+  // inputs should also fall back. Similarly, if it produces any nonTensor outputs, then the nodes
+  // that consume those outputs should also fall back
+  auto cur_fallback_nodes = ctx->getNodesRunInTorch();
+  SetNonTensorConnectedNodes(ctx, cur_fallback_nodes);
+
+  // Finally, check that all current TensorRT blocks satisfy the min_block_size requirement.
+  // We need to traverse the whole graph many times here
+  SetMinBlockFallbackNodes(ctx, block);
+}
+
 void SegmentGraph(PartitioningCtx* ctx, torch::jit::Block* block) {
+  // Find all the fallback nodes and build the execution decision LUT for all nodes
+  SetNodeExecutorLUT(ctx, block);
+
   auto nodes = block->nodes();
 
   // segment the nodes
   PartitionedGraph segmented_blocks;
 
   std::vector<torch::jit::Node*> in_prog_trt_blk_nodes, in_prog_pyt_blk_nodes;
   for (const auto n : nodes) {
-
     // Skip constant nodes as they are resources for both kinds of modules
     if (n->kind() == torch::jit::prim::Constant) {
       continue;
     }
     // the outputs of a TRT subgraph shouldn't be collections
-    if (!ctx->shouldNodeRunInTorch(n)) {
+    if (ctx->shouldNodeRunInTensorRT(n)) {
       in_prog_trt_blk_nodes.push_back(n);
 
       // If there is an active PyTorch block and we have passed the threshold for a valid TRT
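The loop that follows in SegmentGraph greedily grows runs of same-target nodes and flushes a block whenever the target engine flips (the real loop additionally applies the min_block_size threshold before committing a TRT run). A self-contained toy of this greedy segmentation, with node ids paired with a should-run-in-TRT flag:

    #include <utility>
    #include <vector>

    enum class Target { kTensorRT, kTorch };

    // Greedy segmentation: contiguous nodes with the same target form one block.
    std::vector<std::pair<Target, std::vector<int>>> Segment(
        const std::vector<std::pair<int, bool>>& nodes /* (id, runs_in_trt) */) {
      std::vector<std::pair<Target, std::vector<int>>> blocks;
      std::vector<int> run;
      Target cur = Target::kTensorRT;
      for (const auto& [id, in_trt] : nodes) {
        Target want = in_trt ? Target::kTensorRT : Target::kTorch;
        if (want != cur && !run.empty()) {
          blocks.emplace_back(cur, std::move(run));  // flush the finished run
          run.clear();
        }
        cur = want;
        run.push_back(id);
      }
      if (!run.empty()) blocks.emplace_back(cur, std::move(run));
      return blocks;
    }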
@@ -410,65 +438,26 @@ void SegmentGraph(PartitioningCtx* ctx, torch::jit::Block* block) {
   return;
 }
 
-void SetNodeExecutorLUT(PartitioningCtx* ctx, torch::jit::Block* block) {
-  // First, find all the explicit fallback nodes that should run in Torch:
-  // 1. nodes that are unsupported
-  // 2. nodes that the user specifies to run in Torch
-  // 3. nodes whose containing module the user specifies to run in Torch
-  // At the same time, set all the remaining nodes to NodeExecutorDecision::kCONVERT
-  SetExplicitFallbackNodes(ctx, block);
-
-  // Second, check if there are nonTensor inputs/outputs for the block; if there are, then fall back the nodes that
-  // consume/produce these nonTensor values
-  SetInputsOutputsConnectedNodes(ctx, block);
-
-  // Third, for each fallback node, if it consumes any nonTensor inputs, then the nodes that produce those
-  // inputs should also fall back. Similarly, if it produces any nonTensor outputs, then the nodes
-  // that consume those outputs should also fall back
-  auto cur_fallback_nodes = ctx->getNodesRunInTorch();
-  SetNonTensorConnectedNodes(ctx, cur_fallback_nodes);
-
-  // Finally, check that all current TensorRT blocks satisfy the min_block_size requirement.
-  // We need to traverse the whole graph many times here
-  SetMinBlockFallbackNodes(ctx, block);
-}
-
 void Partition(PartitioningCtx* ctx, ExampleIValues& example_tensor_map) {
   LOG_DEBUG(ctx->settings);
 
   // Go through all the blocks to do the partitioning
   for (torch::jit::Block* block : ctx->original_blocks) {
-
-    // Find all the fallback nodes and build the execution decision LUT for all nodes
-    SetNodeExecutorLUT(ctx, block);
-
     // segment the lowered global graph into blocks
     SegmentGraph(ctx, block);
 
     // It's possible that some TensorRT blocks have nonTensor inputs/outputs because they are interleaved by Torch blocks
     // resolve nonTensor inputs/outputs
-    resolveTRTNonTensorInputs(ctx, block);
+    ResolveTRTNonTensorInputs(ctx, block);
 
     // register input/output torch::jit::Value for segmented graphs
     LOG_DEBUG("Registering input/output torch::jit::Value for segmented graphs");
-    registerSegmentsOutputs(ctx, block);
+    RegisterSegmentsOutputs(ctx, block);
 
-    for (auto& i : ctx->partitioned_blocks[block]) {
-      LOG_DEBUG(i);
-    }
 
     // run shape analysis on each segmented block
-    runShapeAnalysis(ctx, block, example_tensor_map);
-
+    RunShapeAnalysis(ctx, block, example_tensor_map);
   }
-
-
-
-  // for (uint64_t i = 0; i < ctx->blocks.size(); i++) {
-  //   ctx->blocks[i].update_id(i);
-  // }
-
-
 }
 
 } // namespace partitioning
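One detail the SetNodeExecutorLUT comments mention without showing is the min_block_size pass. Its core rule can be illustrated on a boolean sequence: any run of consecutive TRT-tagged nodes shorter than the threshold is demoted back to Torch. A toy of that idea (the real SetMinBlockFallbackNodes operates on the decision LUT and may traverse the graph several times):

    #include <cstddef>
    #include <vector>

    // Demote any run of consecutive TRT-convertible nodes shorter than min_block_size.
    void DemoteShortTRTRuns(std::vector<bool>& runs_in_trt, std::size_t min_block_size) {
      std::size_t i = 0;
      while (i < runs_in_trt.size()) {
        if (!runs_in_trt[i]) { ++i; continue; }
        std::size_t j = i;
        while (j < runs_in_trt.size() && runs_in_trt[j]) ++j;  // [i, j) is a TRT run
        if (j - i < min_block_size) {
          for (std::size_t k = i; k < j; ++k) runs_in_trt[k] = false;  // fall back the whole run
        }
        i = j;
      }
    }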