Skip to content

Commit 20d6f58

Browse files
authored
Merge pull request #2302 from pytorch/2.1-staging
Cherry-pick changes from main into release/2.1
2 parents 94d1bdd + adf4e32 commit 20d6f58

File tree

319 files changed

+13939
-2628
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

319 files changed

+13939
-2628
lines changed

.circleci/config.yml

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -802,7 +802,7 @@ commands:
802802
- store_artifacts:
803803
path: /tmp/testlogs
804804

805-
test-dynamo-models_torch_export:
805+
test-dynamo-models_export:
806806
description: "Test the Dynamo models via torch_export path"
807807
steps:
808808
- run:
@@ -818,6 +818,20 @@ commands:
818818
- store_artifacts:
819819
path: /tmp/testlogs
820820

821+
test-dynamo-export_serde:
822+
description: "Test the export serialize/deserialize functionality for Dynamo models"
823+
steps:
824+
- run:
825+
name: Run Dynamo models and test export serde with TRT compiled modules
826+
command: |
827+
cd tests/py/dynamo/models
828+
pytest test_export_serde.py --junitxml=/tmp/artifacts/test_results/dynamo/backend/test_results.xml --ir dynamo
829+
830+
- store_test_results:
831+
path: /tmp/artifacts
832+
- store_artifacts:
833+
path: /tmp/testlogs
834+
821835
test-dynamo-converters:
822836
description: "Test the Dynamo aten converters"
823837
steps:
@@ -1122,7 +1136,8 @@ jobs:
11221136
- test-dynamo-backend
11231137
- test-dynamo-shared_utilities
11241138
- test-dynamo-models_torch_compile
1125-
- test-dynamo-models_torch_export
1139+
- test-dynamo-models_export
1140+
- test-dynamo-export_serde
11261141

11271142
package-x86_64-linux:
11281143
parameters:

.github/workflows/build-test.yml

Lines changed: 36 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -54,39 +54,40 @@ jobs:
5454
AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
5555
AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
5656

57-
# tests-py-torchscript-fe:
58-
# name: Test torchscript frontend [Python]
59-
# needs: [generate-matrix, build]
60-
# strategy:
61-
# fail-fast: false
62-
# matrix:
63-
# include:
64-
# - repository: pytorch/tensorrt
65-
# package-name: torch_tensorrt
66-
# pre-script: packaging/pre_build_script.sh
67-
# uses: pytorch/tensorrt/.github/workflows/linux-test.yml@main
68-
# with:
69-
# job-name: tests-py-torchscript-fe
70-
# repository: "pytorch/tensorrt"
71-
# ref: ""
72-
# test-infra-repository: pytorch/test-infra
73-
# test-infra-ref: main
74-
# build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
75-
# pre-script: ${{ matrix.pre-script }}
76-
# script: |
77-
# export USE_HOST_DEPS=1
78-
# pushd .
79-
# cd tests/modules
80-
# ${CONDA_RUN} python -m pip install -r requirements.txt
81-
# ${CONDA_RUN} python hub.py
82-
# popd
83-
# pushd .
84-
# cd tests/py/ts
85-
# ${CONDA_RUN} python -m pip install --pre pytest timm transformers parameterized expecttest --use-deprecated=legacy-resolver
86-
# ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_api_test_results.xml api/
87-
# ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_models_test_results.xml models/
88-
# ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_integrations_test_results.xml integrations/
89-
# popd
57+
tests-py-torchscript-fe:
58+
name: Test torchscript frontend [Python]
59+
needs: [generate-matrix, build]
60+
strategy:
61+
fail-fast: false
62+
matrix:
63+
include:
64+
- repository: pytorch/tensorrt
65+
package-name: torch_tensorrt
66+
pre-script: packaging/pre_build_script.sh
67+
uses: pytorch/tensorrt/.github/workflows/linux-test.yml@main
68+
with:
69+
job-name: tests-py-torchscript-fe
70+
repository: "pytorch/tensorrt"
71+
ref: ""
72+
test-infra-repository: pytorch/test-infra
73+
test-infra-ref: main
74+
build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
75+
pre-script: ${{ matrix.pre-script }}
76+
script: |
77+
export USE_HOST_DEPS=1
78+
export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
79+
pushd .
80+
cd tests/modules
81+
${CONDA_RUN} python -m pip install --pre -r requirements.txt --use-deprecated=legacy-resolver
82+
${CONDA_RUN} python hub.py
83+
popd
84+
pushd .
85+
cd tests/py/ts
86+
${CONDA_RUN} python -m pip install --pre pytest timm transformers parameterized expecttest --use-deprecated=legacy-resolver
87+
${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_api_test_results.xml api/
88+
${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_models_test_results.xml models/
89+
${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_integrations_test_results.xml integrations/
90+
popd
9091
9192
tests-py-dynamo-converters:
9293
name: Test dynamo converters [Python]
@@ -140,6 +141,8 @@ jobs:
140141
cd tests/py/dynamo
141142
${CONDA_RUN} python -m pip install --pre pytest timm transformers parameterized expecttest --use-deprecated=legacy-resolver
142143
${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_fe_test_results.xml --ir dynamo models/test_models_export.py
144+
${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py
145+
${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/test_dyn_models.py
143146
popd
144147
145148
tests-py-torch-compile-be:

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ repos:
4040
rev: 'v1.4.1'
4141
hooks:
4242
- id: mypy
43-
exclude: "^py/torch_tensorrt/fx|^examples|^tests|^tools|^docs|noxfile.py|setup.py|versions.py"
43+
exclude: "^py/torch_tensorrt/fx|^examples|^tests|^py/torch_tensorrt/dynamo/_experimental|^tools|^docs|noxfile.py|setup.py|versions.py"
4444
- repo: https://github.com/astral-sh/ruff-pre-commit
4545
# Ruff version.
4646
rev: v0.0.278

core/conversion/converters/impl/shuffle.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,12 @@ static auto shuffle_registrations TORCHTRT_UNUSED =
2020
auto in_shape = util::toVec(in->getDimensions());
2121
std::vector<int64_t> out_shape;
2222
if (ctx->input_is_dynamic) {
23-
end_dim = (end_dim == -1) ? in_shape.size() - 1 : end_dim;
23+
if (start_dim < 0) {
24+
start_dim = start_dim + in_shape.size();
25+
}
26+
if (end_dim < 0) {
27+
end_dim = end_dim + in_shape.size();
28+
}
2429
int nbDynamicFlattenedDims = 0;
2530
int nbDynamicUnflattenedDims = 0;
2631
for (int i = 0; i < (int)in_shape.size(); i++) {

core/runtime/execute_engine.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@ bool is_switch_required(const RTDevice& curr_device, const RTDevice& engine_devi
4343
return false;
4444
}
4545

46-
RTDevice select_rt_device(const RTDevice& engine_device) {
47-
auto new_target_device_opt = get_most_compatible_device(engine_device);
46+
RTDevice select_rt_device(const RTDevice& engine_device, const RTDevice& curr_device) {
47+
auto new_target_device_opt = get_most_compatible_device(engine_device, curr_device);
4848

4949
// REVIEW: THIS DOES NOT LIST DLA PROBABLY, WHICH WE SHOULD
5050
// TODO: I think this logic could be way simpler at execution time since if the tensors arent on the right
@@ -89,7 +89,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
8989

9090
if (is_switch_required(curr_device, compiled_engine->device_info)) {
9191
// Scan through available CUDA devices and set the CUDA device context correctly
92-
RTDevice device = select_rt_device(compiled_engine->device_info);
92+
RTDevice device = select_rt_device(compiled_engine->device_info, curr_device);
9393
set_rt_device(device);
9494

9595
// Target device is new device

core/runtime/runtime.cpp

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,16 @@ namespace torch_tensorrt {
77
namespace core {
88
namespace runtime {
99

10-
c10::optional<RTDevice> get_most_compatible_device(const RTDevice& target_device) {
10+
c10::optional<RTDevice> get_most_compatible_device(const RTDevice& target_device, const RTDevice& curr_device) {
1111
LOG_DEBUG("Target Device: " << target_device);
1212
auto device_options = find_compatible_devices(target_device);
13+
RTDevice current_device;
14+
if (current_device.id == -1) {
15+
current_device = get_current_device();
16+
} else {
17+
current_device = curr_device;
18+
}
19+
1320
if (device_options.size() == 0) {
1421
return {};
1522
} else if (device_options.size() == 1) {
@@ -21,10 +28,20 @@ c10::optional<RTDevice> get_most_compatible_device(const RTDevice& target_device
2128
dev_list << "[" << std::endl;
2229
for (auto device : device_options) {
2330
dev_list << " " << device << ',' << std::endl;
24-
if (device.device_name == target_device.device_name && best_match.device_name != target_device.device_name) {
25-
best_match = device;
26-
} else if (device.device_name == target_device.device_name && best_match.device_name == target_device.device_name) {
27-
if (device.id == target_device.id && best_match.id != target_device.id) {
31+
if (device.device_name == target_device.device_name) {
32+
// First priority is selecting a candidate which agrees with the current device ID
33+
// If such a device is found, we can select it and break out of the loop
34+
if (device.id == current_device.id && best_match.id != current_device.id) {
35+
best_match = device;
36+
break;
37+
}
38+
// Second priority is selecting a candidate which agrees with the target device ID
39+
// At deserialization time, the current device and target device may not agree
40+
else if (device.id == target_device.id && best_match.id != target_device.id) {
41+
best_match = device;
42+
}
43+
// If no such GPU ID is found, select the first available candidate GPU
44+
else if (best_match.device_name != target_device.device_name) {
2845
best_match = device;
2946
}
3047
}

core/runtime/runtime.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,9 @@ typedef enum {
2626
SERIALIZATION_LEN, // NEVER USED FOR DATA, USED TO DETERMINE LENGTH OF SERIALIZED INFO
2727
} SerializedInfoIndex;
2828

29-
c10::optional<RTDevice> get_most_compatible_device(const RTDevice& target_device);
29+
c10::optional<RTDevice> get_most_compatible_device(
30+
const RTDevice& target_device,
31+
const RTDevice& curr_device = RTDevice());
3032
std::vector<RTDevice> find_compatible_devices(const RTDevice& target_device);
3133

3234
std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine);

core/util/trt_util.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,7 @@ nvinfer1::Dims squeezeDims(const nvinfer1::Dims& d, int pos, bool use_zeros, boo
216216
// Replace all instances of -1, indicating dynamic dimension
217217
// with 0, indicating copy the dimension from another tensor
218218
// (Generally used for reshape operations)
219-
if (use_zeros && d.d[i] == -1) {
219+
if (use_zeros && d.d[i] == -1 && i < pos) {
220220
dims.d[j] = 0;
221221
// If zeros already exist in the dimensions (empty tensor),
222222
// Replace all instances of 0, indicating empty dimension

cpp/include/torch_tensorrt/torch_tensorrt.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ class DataType {
6060
enum Value : int8_t {
6161
/// INT64
6262
kLong,
63+
/// FP64
64+
kDouble,
6365
/// FP32
6466
kFloat,
6567
/// FP16

cpp/src/types.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,8 @@ at::ScalarType toAtenDataType(DataType value) {
9797
return at::kInt;
9898
case DataType::kLong:
9999
return at::kLong;
100+
case DataType::kDouble:
101+
return at::kDouble;
100102
case DataType::kBool:
101103
return at::kBool;
102104
case DataType::kFloat:
@@ -119,7 +121,8 @@ nvinfer1::TensorFormat toTRTTensorFormat(TensorFormat value) {
119121

120122
DataType::DataType(c10::ScalarType t) {
121123
TORCHTRT_CHECK(
122-
t == at::kHalf || t == at::kFloat || t == at::kChar || t == at::kLong || t == at::kInt || t == at::kBool,
124+
t == at::kHalf || t == at::kFloat || t == at::kChar || t == at::kLong || t == at::kDouble || t == at::kInt ||
125+
t == at::kBool,
123126
"Data type is unsupported (" << t << ")");
124127
switch (t) {
125128
case at::kHalf:
@@ -134,6 +137,9 @@ DataType::DataType(c10::ScalarType t) {
134137
case at::kLong:
135138
value = DataType::kLong;
136139
break;
140+
case at::kDouble:
141+
value = DataType::kDouble;
142+
break;
137143
case at::kBool:
138144
value = DataType::kBool;
139145
break;

0 commit comments

Comments (0)