
Commit 04d7fa2

Merge pull request #667 from NVIDIA/kwargs_py_api
kwargs API and fix for partitioning when weights are provided in FP16
2 parents f49604d + 4a12861 commit 04d7fa2

89 files changed: +1897 additions, -928 deletions


core/compiler.cpp

Lines changed: 141 additions & 83 deletions
@@ -128,22 +128,6 @@ bool CheckMethodOperatorSupport(const torch::jit::script::Module& mod, std::stri
   return conversion::VerifyConverterSupportForBlock(g->block());
 }

-std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::string method_name, CompileSpec cfg) {
-  // Go through Lowering to simplify graph and extract weight parameters
-  auto graph_and_parameters = lowering::Lower(mod, method_name, cfg.lower_info);
-
-  auto convert_cfg = std::move(cfg.convert_info);
-  auto g = graph_and_parameters.first;
-
-  auto params = graph_and_parameters.second;
-  auto named_params = conversion::get_named_params(g->inputs(), params);
-
-  LOG_INFO(*g << "(CompileGraph)\n");
-
-  auto engine = conversion::ConvertBlockToEngine(g->block(), convert_cfg, named_params);
-  return std::move(engine);
-}
-
 void AddSegmentedBlockToGraph(
     std::shared_ptr<torch::jit::Graph>& g,
     partitioning::SegmentedBlock& seg,
@@ -237,15 +221,15 @@ void AddIfBlockToGraph(
 GraphAndMapping ConstructFallbackGraph(
     torch::jit::script::Module& new_mod,
     torch::jit::Block* block,
-    std::unordered_map<torch::jit::Value*, torch::jit::IValue> input_ivalues_map,
+    std::unordered_map<const torch::jit::Value*, torch::jit::IValue> example_tensor_map,
     CompileSpec cfg,
-    conversion::GraphParams named_params) {
+    ir::StaticParams static_params) {
   auto convert_cfg = cfg.convert_info;
   auto partition_info = cfg.partition_info;

   auto new_g = std::make_shared<torch::jit::Graph>();

-  auto segmented_blocks = partitioning::Partition(block, input_ivalues_map, partition_info);
+  auto segmented_blocks = partitioning::Partition(block, example_tensor_map, partition_info);

   // the mapping from lowering graph => fallback global graph
   std::unordered_map<torch::jit::Value*, torch::jit::Value*> old_to_new_g;
@@ -259,13 +243,18 @@ GraphAndMapping ConstructFallbackGraph(
     trt_engine_id << reinterpret_cast<const int*>(&seg_block);

     if (seg_block.target() == partitioning::SegmentedBlock::kTensorRT) {
+      auto shapes = seg_block.in_shapes();
+      auto types = seg_block.in_types();
       std::vector<ir::Input> inputs;
-      for (auto& shape : seg_block.in_shape()) {
-        inputs.push_back(ir::Input(shape));
+      for (size_t i = 0; i < shapes.size(); i++) {
+        auto in = ir::Input(shapes[i]);
+        in.dtype = util::ScalarTypeToTRTDataType(types[i]);
+        inputs.push_back(in);
       }
       // update the input ranges for each segments
-      convert_cfg.inputs = inputs;
-      auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_cfg, named_params);
+      convert_cfg.inputs = ir::associate_specs_with_inputs(seg_block.g(), inputs, static_params);
+
+      auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_cfg, static_params);
       auto temp_g = std::make_shared<torch::jit::Graph>();
       auto device_spec = convert_cfg.engine_settings.device;
       auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
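
The hunk above is the heart of the FP16 partitioning fix: inputs handed to each TensorRT segment now carry the dtype recorded during partitioning instead of silently defaulting to FP32. A minimal standalone sketch of that pattern; DType, Input, and BuildTypedInputs are illustrative stand-ins, not the project's real ir types:

// Sketch only: DType and Input are simplified stand-ins for TRTorch's ir types.
#include <cstdint>
#include <cstdio>
#include <vector>

enum class DType { kFloat, kHalf };

struct Input {
  std::vector<int64_t> shape;
  DType dtype = DType::kFloat; // old behavior: every segment input became FP32
};

// Zip each segment's recorded input shapes with its recorded first-use types,
// mirroring the new loop in ConstructFallbackGraph.
std::vector<Input> BuildTypedInputs(
    const std::vector<std::vector<int64_t>>& shapes,
    const std::vector<DType>& types) {
  std::vector<Input> inputs;
  for (size_t i = 0; i < shapes.size(); i++) {
    Input in{shapes[i]};
    in.dtype = types[i]; // FP16 weights now yield FP16 segment inputs
    inputs.push_back(in);
  }
  return inputs;
}

int main() {
  auto inputs = BuildTypedInputs({{1, 3, 224, 224}}, {DType::kHalf});
  std::printf("first input is half: %s\n", inputs[0].dtype == DType::kHalf ? "yes" : "no");
}
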
@@ -281,7 +270,7 @@ GraphAndMapping ConstructFallbackGraph(
       std::vector<GraphAndMapping> graph_and_mappings;
       for (auto cur_block : if_node->blocks()) {
         graph_and_mappings.push_back(
-            ConstructFallbackGraph(new_mod, cur_block, input_ivalues_map, cfg, named_params));
+            ConstructFallbackGraph(new_mod, cur_block, example_tensor_map, cfg, static_params));
       }
       AddIfBlockToGraph(new_g, if_node, graph_and_mappings, old_to_new_g);

@@ -299,88 +288,157 @@ GraphAndMapping ConstructFallbackGraph(
   return {new_g, old_to_new_g};
 }

-torch::jit::script::Module CompileGraphWithFallback(const torch::jit::script::Module& mod, CompileSpec cfg) {
-  // TODO: Should be doing a functional transform but need PR #31978
-  // [jit] More robust mangling
-  // torch::jit::script::Module new_mod = mod.clone();
-  torch::jit::script::Module new_mod(mod._ivalue()->name() + "_trt");
-  std::vector<std::shared_ptr<torch::jit::Graph>> graphs;
-  for (const torch::jit::script::Method& method : mod.get_methods()) {
-    // Compile only forward methods. forward method contains the entire graph.
-    if (method.name().compare("forward") == 0) {
-      auto new_g = std::make_shared<torch::jit::Graph>();
-      auto graph_and_parameters = lowering::Lower(mod, method.name(), cfg.lower_info);
+void MapInputsAndDetermineDTypes(
+    CompileSpec& cfg,
+    std::shared_ptr<torch::jit::Graph>& g,
+    ir::StaticParams& static_params,
+    ir::TypeMap& first_use_type_map) {
+  // Associate input specs with inputs
+  cfg.convert_info.inputs = std::move(ir::associate_specs_with_inputs(g, cfg.inputs, static_params));
+
+  for (auto& in : g->inputs()) {
+    auto est_type_opt = first_use_type_map.find(in)->second;
+    ir::Input& spec = cfg.convert_info.inputs.find(in)->second;
+    if (est_type_opt && !spec.dtype_is_user_defined) {
+      // If we can calculate the type from the graph and the type was not defined by the user then use the calculated
+      // type
+      LOG_INFO(
+          "Since input type is not explicitly defined, infering using first tensor calculation\n Found input "
+          << in->debugName() << " has type " << est_type_opt.value()
+          << ". If this is incorrect explicitly set dtype for input and file a bug");
+      spec.dtype = util::ScalarTypeToTRTDataType(est_type_opt.value());
+    } else if (!est_type_opt && !spec.dtype_is_user_defined) {
+      // If we cannot calculate the type and the user did not define the type, then default to FP32
+      LOG_WARNING(
+          "Cannot infer input type from calcuations in graph for input "
+          << in->debugName() << ". Assuming it is Float32. If not, specify input type explicity");
+      spec.dtype = nvinfer1::DataType::kFLOAT;
+    } else if (spec.dtype_is_user_defined && cfg.partition_info.enabled) {
+      if (!est_type_opt) {
+        LOG_INFO("Cannot infer input tensor dtype in graph, unable to verify user input dtype settings");
+      } else {
+        if (util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype) != est_type_opt.value()) {
+          std::stringstream ss;
+          ss << "For input " << in->debugName() << ", found user specified input dtype as ";
+          ss << cfg.convert_info.inputs.find(in)->second.dtype;
+          ss << ", however when inspecting the graph, the input type expected was inferred to be ";
+          ss << est_type_opt.value() << std::endl;
+          ss << "The compiler is going to use the user setting " << cfg.convert_info.inputs.find(in)->second.dtype;
+          ss << "\nThis conflict may cause an error at runtime due to partial compilation being enabled and therefore\n";
+          ss << "compatibility with PyTorch's data type convention is required.\n";
+          ss << "If you do indeed see errors at runtime either:\n";
+          ss << "- Remove the dtype spec for " << in->debugName() << std::endl;
+          ss << "- Disable partial compilation by setting require_full_compilation to True";
+          auto warn_str = ss.str();
+          LOG_WARNING(warn_str);
+          // Overwrite type map with user settings
+          first_use_type_map[in] = {util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype)};
+        }
+      }
+    } else {
+      // The user defined the type so no changes are necessary
+    }
+  }
+}

-      auto g = graph_and_parameters.first;
-      auto params = graph_and_parameters.second;
-      auto named_params = conversion::get_named_params(g->inputs(), params);
-      LOG_INFO("(LoweredGraph)\n" << *g);
+uint64_t GetRecommendedWorkspaceSize(const runtime::CudaDevice& device) {
+  if (device.major < 6) {
+    return 256 * (1 << 20);
+  } else {
+    return 1 << 30;
+  }
+}

-      std::unordered_map<torch::jit::Value*, ir::Input> inputs;
-      for (size_t i = 0; i < g->inputs().size(); ++i) {
-        inputs.insert({g->inputs()[i], cfg.convert_info.inputs[i]});
-      }
-      auto input_ivalues_map = partitioning::generateRandomInputs(inputs);
-      auto graph_and_mapping = ConstructFallbackGraph(new_mod, g->block(), input_ivalues_map, cfg, named_params);
-      new_g = graph_and_mapping.first;
-      LOG_INFO("(FallbackGraph)\n" << *new_g);
+std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::string method_name, CompileSpec cfg) {
+  // Go through Lowering to simplify graph and extract weight parameters
+  auto graph_and_parameters = lowering::Lower(mod, method_name, cfg.lower_info);

-      // if there is no tensorrt engine self in fallback graph, there is no conversion, we just return the initial
-      // module
-      if (new_g->inputs()[0]->type()->str().find("__torch__") == std::string::npos) {
-        LOG_WARNING("Didn't generate any TensorRT engines, the compiler did nothing\n");
-        return mod;
-      }
+  auto g = graph_and_parameters.first;
+  TRTORCH_CHECK(
+      conversion::VerifyConverterSupportForBlock(g->block()),
+      "Not all operations in graph are supported by the compiler");
+  auto params = graph_and_parameters.second;
+  auto static_params = ir::get_static_params(g->inputs(), params);
+  // Infer the type of an input from the weights of the calculation
+  auto first_use_types = ir::get_block_first_calc_dtypes_opt(g->block());

-      auto new_method = new_mod._ivalue()->compilation_unit()->create_function(method.name(), new_g);
-      auto schema = util::GenerateGraphSchema(new_method->name(), new_g);
-      new_mod.type()->addMethod(new_method);
-      new_method->setSchema(schema);
-    }
+  // GPU default WS size : 1 GB
+  // Set WS = 256 Mb for Jetson nano/TX1 like platforms whose compute capability is 5.X.
+  auto workspace_size = cfg.convert_info.engine_settings.workspace_size;
+  auto device_spec = cfg.convert_info.engine_settings.device;
+  auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
+  if (workspace_size == 0) {
+    cfg.convert_info.engine_settings.workspace_size = GetRecommendedWorkspaceSize(cuda_device);
   }

-  return new_mod;
+  MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types);
+
+  auto engine = conversion::ConvertBlockToEngine(g->block(), cfg.convert_info, static_params);
+
+  return std::move(engine);
 }

-torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, CompileSpec cfg) {
-  // TODO: not sure how to deal with duplicated code here, so just cut out a branch temporally
-  if (cfg.partition_info.enabled) {
-    return CompileGraphWithFallback(mod, cfg);
-  }
-  auto device_spec = cfg.convert_info.engine_settings.device;
+torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) {
+  torch::jit::Module new_mod(mod._ivalue()->name() + "_trt");

   // GPU default WS size : 1 GB
   // Set WS = 256 Mb for Jetson nano/TX1 like platforms whose compute capability is 5.X.
   auto workspace_size = cfg.convert_info.engine_settings.workspace_size;
-  cudaDeviceProp device_prop;
-  cudaGetDeviceProperties(&device_prop, device_spec.gpu_id);
+  auto device_spec = cfg.convert_info.engine_settings.device;
+  auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
   if (workspace_size == 0) {
-    if (device_prop.major < 6) {
-      cfg.convert_info.engine_settings.workspace_size = 256 * (1 << 20);
-    } else {
-      cfg.convert_info.engine_settings.workspace_size = 1 << 30;
-    }
+    cfg.convert_info.engine_settings.workspace_size = GetRecommendedWorkspaceSize(cuda_device);
   }

-  // TODO: Should be doing a functional transform but need PR #31978
-  // [jit] More robust mangling
-  // torch::jit::script::Module new_mod = mod.clone();
-  torch::jit::script::Module new_mod(mod._ivalue()->name() + "_trt");
-  std::vector<std::shared_ptr<torch::jit::Graph>> graphs;
-  for (const torch::jit::script::Method& method : mod.get_methods()) {
-    // Compile only forward methods. forward method contains the entire graph.
+  for (const torch::jit::Method& method : mod.get_methods()) {
     if (method.name().compare("forward") == 0) {
-      auto engine = ConvertGraphToTRTEngine(mod, method.name(), cfg);
       auto new_g = std::make_shared<torch::jit::Graph>();
-      auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
-      AddEngineToGraph(new_mod, new_g, engine, cuda_device);
+
+      auto graph_and_parameters = lowering::Lower(mod, method.name(), cfg.lower_info);
+
+      auto g = graph_and_parameters.first;
+      auto params = graph_and_parameters.second;
+      auto static_params = ir::get_static_params(g->inputs(), params);
+      // Infer the type of an input from the weights of the calculation
+      auto first_use_types = ir::get_block_first_calc_dtypes_opt(g->block());
+
+      MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types);
+
+      if (cfg.partition_info.enabled &&
+          (cfg.lower_info.forced_fallback_modules.size() == 0 &&
+           cfg.partition_info.forced_fallback_operators.size() == 0 &&
+           conversion::VerifyConverterSupportForBlock(g->block(), true))) {
+        LOG_INFO("Skipping partitioning since model is fully supported");
+      }
+
+      if (cfg.partition_info.enabled &&
+          !(cfg.lower_info.forced_fallback_modules.size() == 0 &&
+            cfg.partition_info.forced_fallback_operators.size() == 0 &&
+            conversion::VerifyConverterSupportForBlock(g->block(), false))) {
+        auto input_ivalues_map = partitioning::generateRandomInputs(cfg.convert_info.inputs, first_use_types);
+        auto graph_and_mapping = ConstructFallbackGraph(new_mod, g->block(), input_ivalues_map, cfg, static_params);
+        new_g = graph_and_mapping.first;
+        LOG_INFO("Segmented Graph: " << *new_g);
+
+        // if there is no tensorrt engine self in fallback graph, there is no conversion, we just return the initial
+        // module
+        if (new_g->inputs()[0]->type()->str().find("__torch__") == std::string::npos) {
+          LOG_WARNING("Didn't generate any TensorRT engines, the compiler did nothing\n");
+          return mod;
+        }
+      } else {
+        TRTORCH_CHECK(
+            conversion::VerifyConverterSupportForBlock(g->block()),
+            "Not all operations in graph are supported by the compiler");
+        auto engine = conversion::ConvertBlockToEngine(g->block(), cfg.convert_info, static_params);
+        AddEngineToGraph(new_mod, new_g, engine, cuda_device);
+      }
       auto new_method = new_mod._ivalue()->compilation_unit()->create_function(method.name(), new_g);
       auto schema = util::GenerateGraphSchema(new_method->name(), new_g);
       new_mod.type()->addMethod(new_method);
       new_method->setSchema(schema);
     }
   }
-
   return new_mod;
 }

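GetRecommendedWorkspaceSize simply centralizes the workspace heuristic that was previously duplicated inline: 256 MiB for compute capability 5.x boards (Jetson Nano/TX1 class), 1 GiB otherwise. A self-contained sketch, assuming only the compute-capability major version matters:

// Sketch only: the real function takes a runtime::CudaDevice; here the compute
// capability major version is passed directly.
#include <cstdint>
#include <cstdio>

uint64_t GetRecommendedWorkspaceSize(int cc_major) {
  if (cc_major < 6) {
    return 256ull * (1 << 20); // 256 MiB for Maxwell-class boards (Nano/TX1)
  }
  return 1ull << 30; // 1 GiB elsewhere
}

int main() {
  std::printf("cc 5.x -> %llu bytes\n",
              static_cast<unsigned long long>(GetRecommendedWorkspaceSize(5)));
}
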
core/compiler.h

Lines changed: 2 additions & 1 deletion
@@ -13,7 +13,8 @@ namespace trtorch {
 namespace core {

 struct CompileSpec {
-  CompileSpec(std::vector<ir::Input> inputs) : convert_info(std::move(inputs)) {}
+  CompileSpec(std::vector<ir::Input> inputs) : inputs(inputs) {}
+  std::vector<ir::Input> inputs;
   conversion::ConversionInfo convert_info;
   lowering::LowerInfo lower_info;
   partitioning::PartitionInfo partition_info;
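
This header change keeps the user's input specs on CompileSpec itself rather than moving them into ConversionInfo at construction time, so compiler.cpp can later bind them to graph inputs by value via ir::associate_specs_with_inputs. A toy sketch of that ownership change, with types reduced to stand-ins:

// Sketch only: Input and ConversionInfo are simplified stand-ins for the real types.
#include <cstdint>
#include <vector>

struct Input {
  std::vector<int64_t> shape;
};

struct ConversionInfo {
  std::vector<Input> inputs; // now filled later, once specs are matched to graph values
};

struct CompileSpec {
  CompileSpec(std::vector<Input> inputs) : inputs(inputs) {}
  std::vector<Input> inputs; // kept on the top-level spec, as in the diff
  ConversionInfo convert_info;
};

int main() {
  CompileSpec cfg({Input{{1, 3, 224, 224}}});
  // Before association runs, the conversion settings hold no inputs yet.
  return cfg.convert_info.inputs.empty() ? 0 : 1;
}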

core/conversion/BUILD

Lines changed: 0 additions & 1 deletion
@@ -10,7 +10,6 @@ config_setting(
 cc_library(
     name = "conversion",
     srcs = [
-        "InterfaceTypes.cpp",
         "conversion.cpp",
         "conversion_ignorelist.cpp",
     ],
