diff --git a/backends/xnnpack/test/ops/test_check_quant_params.py b/backends/xnnpack/test/ops/test_check_quant_params.py
index b76935a9f72..a04751ca5f1 100644
--- a/backends/xnnpack/test/ops/test_check_quant_params.py
+++ b/backends/xnnpack/test/ops/test_check_quant_params.py
@@ -52,7 +52,7 @@ def _test_check_quant_message(self, ep_modifier, expected_message):
         torch._dynamo.reset()
         mod = torch.nn.Linear(10, 10)
         quantizer = XNNPACKQuantizer()
-        captured = export_for_training(mod, (torch.randn(1, 10),)).module()
+        captured = export_for_training(mod, (torch.randn(1, 10),), strict=True).module()
         quantizer.set_global(get_symmetric_quantization_config(is_per_channel=True))
 
         prepared = prepare_pt2e(captured, quantizer)
@@ -68,7 +68,6 @@ def _test_check_quant_message(self, ep_modifier, expected_message):
         self.assertEquals(str(context.exception), expected_message)
 
     def test_in_per_tensor_quant(self):
-
         for invalid_scale in [
             float("nan"),
             float("inf"),
diff --git a/examples/llm_manual/export_nanogpt.py b/examples/llm_manual/export_nanogpt.py
index 9de2e831e25..8c948479f2a 100644
--- a/examples/llm_manual/export_nanogpt.py
+++ b/examples/llm_manual/export_nanogpt.py
@@ -28,7 +28,7 @@
 # The torch.no_grad() call tells PyTorch to exclude training-specific logic.
 with sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
     m = export_for_training(
-        model, example_inputs, dynamic_shapes=dynamic_shape
+        model, example_inputs, dynamic_shapes=dynamic_shape, strict=True
     ).module()
 
 traced_model = export(m, example_inputs, dynamic_shapes=dynamic_shape, strict=True)
diff --git a/examples/mediatek/aot_utils/oss_utils/utils.py b/examples/mediatek/aot_utils/oss_utils/utils.py
index 2246b8eeb15..25362788e31 100755
--- a/examples/mediatek/aot_utils/oss_utils/utils.py
+++ b/examples/mediatek/aot_utils/oss_utils/utils.py
@@ -30,7 +30,9 @@ def build_executorch_binary(
         if quant_dtype not in Precision:
             raise AssertionError(f"No support for Precision {quant_dtype}.")
 
-        captured_model = torch.export.export_for_training(model, inputs).module()
+        captured_model = torch.export.export_for_training(
+            model, inputs, strict=True
+        ).module()
         annotated_model = prepare_pt2e(captured_model, quantizer)
         print("Quantizing the model...")
         # calibration
diff --git a/examples/mediatek/model_export_scripts/llama.py b/examples/mediatek/model_export_scripts/llama.py
index 5da17727075..413df21d5cc 100644
--- a/examples/mediatek/model_export_scripts/llama.py
+++ b/examples/mediatek/model_export_scripts/llama.py
@@ -319,7 +319,7 @@ def export_to_et_ir(
     )
     print("Getting pre autograd ATen Dialect Graph")
     pre_autograd_aten_dialect = torch.export.export_for_training(
-        model, example_inputs, dynamic_shapes=dynamic_shapes
+        model, example_inputs, dynamic_shapes=dynamic_shapes, strict=True
     ).module()  # NOTE: Will be replaced with export
     quantizer = NeuropilotQuantizer()
     quantizer.setup_precision(getattr(Precision, precision))
diff --git a/examples/models/phi-3-mini/export_phi-3-mini.py b/examples/models/phi-3-mini/export_phi-3-mini.py
index 8fa948e7dc7..11c2f3834eb 100644
--- a/examples/models/phi-3-mini/export_phi-3-mini.py
+++ b/examples/models/phi-3-mini/export_phi-3-mini.py
@@ -65,7 +65,7 @@ def export(args) -> None:
     xnnpack_quantizer.set_global(xnnpack_quant_config)
 
     model = export_for_training(
-        model, example_inputs, dynamic_shapes=dynamic_shapes
+        model, example_inputs, dynamic_shapes=dynamic_shapes, strict=True
     ).module()
     model = prepare_pt2e(model, xnnpack_quantizer)  # pyre-fixme[6]
     model(*example_inputs)
diff --git a/examples/models/test/test_export.py b/examples/models/test/test_export.py
index 9a4ff7a35ed..306f54c0e89 100644
--- a/examples/models/test/test_export.py
+++ b/examples/models/test/test_export.py
@@ -29,7 +29,9 @@ def collect_executorch_and_eager_outputs(
         Returns a tuple containing the outputs of the eager mode model and the executorch mode model.
         """
         eager_model = eager_model.eval()
-        model = torch.export.export_for_training(eager_model, example_inputs).module()
+        model = torch.export.export_for_training(
+            eager_model, example_inputs, strict=True
+        ).module()
         edge_model = export_to_edge(model, example_inputs)
 
         executorch_prog = edge_model.to_executorch()
diff --git a/examples/portable/scripts/export_and_delegate.py b/examples/portable/scripts/export_and_delegate.py
index 6a8a28d5338..1c2adf67688 100644
--- a/examples/portable/scripts/export_and_delegate.py
+++ b/examples/portable/scripts/export_and_delegate.py
@@ -61,7 +61,7 @@ def export_composite_module_with_lower_graph():
     m_compile_spec = m.get_compile_spec()
 
     # pre-autograd export. eventually this will become torch.export
-    m = torch.export.export_for_training(m, m_inputs).module()
+    m = torch.export.export_for_training(m, m_inputs, strict=True).module()
     edge = export_to_edge(m, m_inputs)
     logging.info(f"Exported graph:\n{edge.exported_program().graph}")
 
@@ -84,7 +84,7 @@ def forward(self, *args):
     m = CompositeModule()
     m = m.eval()
     # pre-autograd export. eventually this will become torch.export
-    m = torch.export.export_for_training(m, m_inputs).module()
+    m = torch.export.export_for_training(m, m_inputs, strict=True).module()
     composited_edge = export_to_edge(m, m_inputs)
 
     # The graph module is still runnerable
@@ -134,7 +134,7 @@ def get_example_inputs(self):
     m = Model()
     m_inputs = m.get_example_inputs()
     # pre-autograd export. eventually this will become torch.export
-    m = torch.export.export_for_training(m, m_inputs).module()
+    m = torch.export.export_for_training(m, m_inputs, strict=True).module()
     edge = export_to_edge(m, m_inputs)
     logging.info(f"Exported graph:\n{edge.exported_program().graph}")
 
@@ -171,7 +171,7 @@ def export_and_lower_the_whole_graph():
     m_inputs = m.get_example_inputs()
 
     # pre-autograd export. eventually this will become torch.export
-    m = torch.export.export_for_training(m, m_inputs).module()
+    m = torch.export.export_for_training(m, m_inputs, strict=True).module()
     edge = export_to_edge(m, m_inputs)
     logging.info(f"Exported graph:\n{edge.exported_program().graph}")
 
diff --git a/examples/xnnpack/aot_compiler.py b/examples/xnnpack/aot_compiler.py
index 6db0d82a274..f67150169dc 100644
--- a/examples/xnnpack/aot_compiler.py
+++ b/examples/xnnpack/aot_compiler.py
@@ -87,14 +87,14 @@
 
     model = model.eval()
     # pre-autograd export. eventually this will become torch.export
-    ep = torch.export.export_for_training(model, example_inputs)
+    ep = torch.export.export_for_training(model, example_inputs, strict=True)
     model = ep.module()
 
     if args.quantize:
         logging.info("Quantizing Model...")
         # TODO(T165162973): This pass shall eventually be folded into quantizer
         model = quantize(model, example_inputs, quant_type)
-        ep = torch.export.export_for_training(model, example_inputs)
+        ep = torch.export.export_for_training(model, example_inputs, strict=True)
 
     edge = to_edge_transform_and_lower(
         ep,
diff --git a/examples/xnnpack/quantization/example.py b/examples/xnnpack/quantization/example.py
index 3e30c239215..90a6b94d02b 100644
--- a/examples/xnnpack/quantization/example.py
+++ b/examples/xnnpack/quantization/example.py
@@ -60,7 +60,9 @@ def verify_xnnpack_quantizer_matching_fx_quant_model(model_name, model, example_
     m = model
 
     # 1. pytorch 2.0 export quantization flow (recommended/default flow)
-    m = torch.export.export_for_training(m, copy.deepcopy(example_inputs)).module()
+    m = torch.export.export_for_training(
+        m, copy.deepcopy(example_inputs), strict=True
+    ).module()
     quantizer = XNNPACKQuantizer()
     quantization_config = get_symmetric_quantization_config(is_per_channel=True)
     quantizer.set_global(quantization_config)
@@ -177,7 +179,9 @@ def main() -> None:
 
     model = model.eval()
     # pre-autograd export. eventually this will become torch.export
-    model = torch.export.export_for_training(model, example_inputs).module()
+    model = torch.export.export_for_training(
+        model, example_inputs, strict=True
+    ).module()
     start = time.perf_counter()
     quantized_model = quantize(model, example_inputs)
     end = time.perf_counter()
diff --git a/exir/backend/test/test_partitioner.py b/exir/backend/test/test_partitioner.py
index 917dae32d74..e9320cf415d 100644
--- a/exir/backend/test/test_partitioner.py
+++ b/exir/backend/test/test_partitioner.py
@@ -76,7 +76,7 @@ def partition(
 
         mlp = MLP()
        example_inputs = mlp.get_random_inputs()
-        model = export_for_training(mlp, example_inputs).module()
+        model = export_for_training(mlp, example_inputs, strict=True).module()
         aten = export(model, example_inputs, strict=True)
         spec_key = "path"
         spec_value = "/a/b/c/d"
@@ -137,7 +137,7 @@ def partition(
 
         mlp = MLP()
         example_inputs = mlp.get_random_inputs()
-        model = export_for_training(mlp, example_inputs).module()
+        model = export_for_training(mlp, example_inputs, strict=True).module()
         aten = export(model, example_inputs, strict=True)
         edge = exir.to_edge(aten)
 
@@ -177,7 +177,7 @@ def partition(
 
         mlp = MLP()
         example_inputs = mlp.get_random_inputs()
-        model = export_for_training(mlp, example_inputs).module()
+        model = export_for_training(mlp, example_inputs, strict=True).module()
         edge = exir.to_edge(export(model, example_inputs, strict=True))
 
         with self.assertRaisesRegex(
@@ -229,7 +229,9 @@ def partition(
                     partition_tags=partition_tags,
                 )
 
-        model = export_for_training(self.AddConst(), (torch.ones(2, 2),)).module()
+        model = export_for_training(
+            self.AddConst(), (torch.ones(2, 2),), strict=True
+        ).module()
         edge = exir.to_edge(export(model, (torch.ones(2, 2),), strict=True))
         delegated = edge.to_backend(PartitionerNoTagData())
 
@@ -308,7 +310,9 @@ def partition(
                     partition_tags=partition_tags,
                 )
 
-        model = export_for_training(self.AddConst(), (torch.ones(2, 2),)).module()
+        model = export_for_training(
+            self.AddConst(), (torch.ones(2, 2),), strict=True
+        ).module()
         edge = exir.to_edge(export(model, (torch.ones(2, 2),), strict=True))
         delegated = edge.to_backend(PartitionerTagData())
 
@@ -383,7 +387,9 @@ def partition(
                     partition_tags=partition_tags,
                 )
 
-        model = export_for_training(self.AddConst(), (torch.ones(2, 2),)).module()
+        model = export_for_training(
+            self.AddConst(), (torch.ones(2, 2),), strict=True
+        ).module()
         edge = exir.to_edge(export(model, (torch.ones(2, 2),), strict=True))
         delegated = edge.to_backend(PartitionerTagData())
 
@@ -471,7 +477,9 @@ def partition(
             )
 
         inputs = (torch.ones(2, 2),)
-        model = export_for_training(ReuseConstData(), (torch.ones(2, 2),)).module()
+        model = export_for_training(
+            ReuseConstData(), (torch.ones(2, 2),), strict=True
+        ).module()
         edge = exir.to_edge(export(model, (torch.ones(2, 2),), strict=True))
         exec_prog = edge.to_backend(PartitionerTagData()).to_executorch()
         executorch_module = _load_for_executorch_from_buffer(exec_prog.buffer)
@@ -531,7 +539,9 @@ def partition(
                     partition_tags=partition_tags,
                 )
 
-        model = export_for_training(ReuseConstData(), (torch.ones(2, 2),)).module()
+        model = export_for_training(
+            ReuseConstData(), (torch.ones(2, 2),), strict=True
+        ).module()
         edge = exir.to_edge(export(model, (torch.ones(2, 2),), strict=True))
         with self.assertRaises(RuntimeError) as error:
             _ = edge.to_backend(PartitionerTagData())
diff --git a/exir/backend/test/test_passes.py b/exir/backend/test/test_passes.py
index bc18f090238..1cdf494fa01 100644
--- a/exir/backend/test/test_passes.py
+++ b/exir/backend/test/test_passes.py
@@ -28,7 +28,9 @@ def forward(self, x):
                 z = x - self.const
                 return y, z
 
-        model = export_for_training(ReuseConstData(), (torch.ones(2, 2),)).module()
+        model = export_for_training(
+            ReuseConstData(), (torch.ones(2, 2),), strict=True
+        ).module()
         edge = exir.to_edge(
             torch.export.export(model, (torch.ones(2, 2),), strict=True)
         )
diff --git a/exir/emit/test/test_emit.py b/exir/emit/test/test_emit.py
index 362796146ee..d5ebf1ffae9 100644
--- a/exir/emit/test/test_emit.py
+++ b/exir/emit/test/test_emit.py
@@ -1751,8 +1751,8 @@ def forward(self, x):
         module_1(*example_inputs)
         module_2(*example_inputs)
 
-        ep1 = export_for_training(module_1, example_inputs)
-        ep2 = export_for_training(module_2, example_inputs)
+        ep1 = export_for_training(module_1, example_inputs, strict=True)
+        ep2 = export_for_training(module_2, example_inputs, strict=True)
 
         edge_program_manager = exir.to_edge(
             {"forward1": ep1, "forward2": ep2},
diff --git a/exir/tests/test_passes.py b/exir/tests/test_passes.py
index 39dbd3f51d3..887ca39864a 100644
--- a/exir/tests/test_passes.py
+++ b/exir/tests/test_passes.py
@@ -1164,7 +1164,9 @@ def forward(self, query, key, value):
         value = torch.randn(32, 32, 32, 32)
 
         # Capture the model
-        m = torch.export.export_for_training(M(32), (query, key, value)).module()
+        m = torch.export.export_for_training(
+            M(32), (query, key, value), strict=True
+        ).module()
 
         # 8w16a quantization
         from torch.ao.quantization.observer import (
@@ -1405,8 +1407,7 @@ def quantize_model(
         ) -> Tuple[EdgeProgramManager, int, int]:
             # program capture
             m = torch.export.export_for_training(
-                m_eager,
-                example_inputs,
+                m_eager, example_inputs, strict=True
             ).module()
 
             quantizer = XNNPACKQuantizer()
diff --git a/exir/tests/test_quantization.py b/exir/tests/test_quantization.py
index 61e3410186e..0a0a85077bb 100644
--- a/exir/tests/test_quantization.py
+++ b/exir/tests/test_quantization.py
@@ -52,7 +52,7 @@ def test_resnet(self) -> None:
         m_copy = copy.deepcopy(m)
         # program capture
         m = torch.export.export_for_training(
-            m, copy.deepcopy(example_inputs)
+            m, copy.deepcopy(example_inputs), strict=True
         ).module()
 
         quantizer = XNNPACKQuantizer()
diff --git a/exir/tests/test_quantize_io_pass.py b/exir/tests/test_quantize_io_pass.py
index aab941b538c..ddc0294ba68 100644
--- a/exir/tests/test_quantize_io_pass.py
+++ b/exir/tests/test_quantize_io_pass.py
@@ -39,12 +39,14 @@ def _quantize(self, mod, example_inputs):
         operator_config = get_symmetric_quantization_config()
         quantizer.set_global(operator_config)
         m = torch.export.export_for_training(
-            mod, copy.deepcopy(example_inputs)
+            mod, copy.deepcopy(example_inputs), strict=True
         ).module()
         m = prepare_pt2e(m, quantizer)
         _ = m(*example_inputs)
         m = convert_pt2e(m)
-        exported_program = torch.export.export_for_training(m, example_inputs)
+        exported_program = torch.export.export_for_training(
+            m, example_inputs, strict=True
+        )
         return exported_program
 
     def _check_count(self, op, count, epm):
diff --git a/extension/export_util/utils.py b/extension/export_util/utils.py
index 2679930178a..aa3a736af3c 100644
--- a/extension/export_util/utils.py
+++ b/extension/export_util/utils.py
@@ -108,7 +108,7 @@ def export_to_exec_prog(
 ) -> ExecutorchProgramManager:
     m = model.eval()
     # pre-autograd export. eventually this will become torch.export
-    m = export_for_training(m, example_inputs).module()
+    m = export_for_training(m, example_inputs, strict=True).module()
 
     core_aten_ep = _to_core_aten(
         m,
diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index cf3a1087cfb..8fcb5798327 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -234,6 +234,7 @@ def _export(self, module: Optional[torch.nn.Module] = None) -> ExportedProgram:
                 self.example_inputs,
                 kwargs=self.example_kwarg_inputs,
                 dynamic_shapes=dynamic_shape,
+                strict=True,
             )
 
         return exported_module
diff --git a/extension/llm/export/test_export_passes.py b/extension/llm/export/test_export_passes.py
index 12ce18ebb79..b0c5af7e65f 100644
--- a/extension/llm/export/test_export_passes.py
+++ b/extension/llm/export/test_export_passes.py
@@ -10,10 +10,7 @@ class RemoveRedundantTransposesPassTest(unittest.TestCase):
     def _export(self, model, example_inputs):
-        exported_module = export_for_training(
-            model,
-            example_inputs,
-        )
+        exported_module = export_for_training(model, example_inputs, strict=True)
         return exported_module.module()
 
     def _check(self, model, example_inputs, key, before_count, after_count):