Added test cases and try/except

cehongwang · cehongwang · commit 33ca588f7965 · 2025-09-30T21:58:19.000Z
diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
@@ -864,7 +864,9 @@ def preserve_module_specs(
 
     # Here we delete the frozen parameters from the graph module. Note this does not affect the submodules. We are going to delete the frozen parameters from the submodules in the convert_module function.
     # This is done to release CPU memory.
-    [delattr(gm, attr) for attr in dir(gm) if attr.startswith("_frozen_param")]
+    for attr in dir(gm):
+        if attr.startswith("_frozen_param"):
+            delattr(gm, attr)
     for name, _ in partitioned_module.named_children():
         submodule = getattr(partitioned_module, name)
         # filter on the GraphModule
@@ -1238,7 +1240,7 @@ def convert_exported_program_to_serialized_trt_engine(
 
     # Prepare torch_trt inputs
     trt_arg_inputs: Sequence[Input] = prepare_inputs(arg_inputs)
-    trt_kwarg_inputs: Optional[dict[Any, Any]] = prepare_inputs(kwarg_inputs)
+    trt_kwarg_inputs: Optional[dict[str, Any]] = prepare_inputs(kwarg_inputs)
     device = to_torch_tensorrt_device(device)
     enabled_precisions = {dtype._from(p) for p in enabled_precisions}
 
diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py
@@ -17,7 +17,7 @@
 from torch_tensorrt.dynamo.utils import (
     get_cpu_memory_usage,
     get_output_dtypes,
-    trim_memory,
+    release_memory,
 )
 
 logger = logging.getLogger(__name__)
@@ -108,8 +108,10 @@ def convert_module(
         )
 
     # Delete the frozen parameters from the module to release CPU memory
-    [delattr(module, attr) for attr in dir(module) if attr.startswith("_frozen_param")]
-    trim_memory()
+    for attr in dir(module):
+        if attr.startswith("_frozen_param"):
+            delattr(module, attr)
+    release_memory()
     logger.debug(
         f"CPU memory usage after clearing frozen parameters and building memory: {get_cpu_memory_usage()} MB"
     )
diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py
@@ -3,6 +3,7 @@
 import ctypes
 import gc
 import logging
+import platform
 import warnings
 from dataclasses import fields, replace
 from enum import Enum
@@ -866,6 +867,28 @@ def get_cpu_memory_usage() -> Any:
     return psutil.Process().memory_info().rss / 1024 / 1024
 
 
-def trim_memory() -> Any:
-    libc = ctypes.CDLL("libc.so.6")
-    return libc.malloc_trim(0)
+def release_memory() -> None:
+    """Best-effort release of cached GPU memory and free CPU heap memory.
+
+    Empties the CUDA caching allocator when CUDA is available and, on Linux,
+    calls ``malloc_trim(0)`` from ``libc.so.6``. Failing to load libc or to
+    find ``malloc_trim`` is logged as a warning instead of being raised.
+    """
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+        torch.cuda.empty_cache()
+        torch.cuda.ipc_collect()
+        torch.cuda.synchronize()
+
+    if platform.system() == "Linux":
+        try:
+            libc = ctypes.CDLL("libc.so.6")
+            released = libc.malloc_trim(0)
+        except (OSError, AttributeError) as e:
+            # libc.so.6 could not be loaded or does not export malloc_trim.
+            logger.warning(f"Failed to release CPU memory: {e}")
+        else:
+            # malloc_trim returns 0 when no memory could be handed back; that
+            # is not an error, so it is only reported at debug level.
+            if released != 1:
+                logger.debug("malloc_trim(0) did not release any CPU memory.")
diff --git a/tests/py/dynamo/models/test_models.py b/tests/py/dynamo/models/test_models.py
@@ -54,6 +54,64 @@ def test_resnet18(ir):
     torch._dynamo.reset()
 
 
+def compile_one(idx: int, ir: str):
+    """Compile ResNet18 with Torch-TensorRT and compare it against PyTorch.
+
+    The batch size is ``idx + 1`` so each caller compiles for a different input shape.
+    """
+    model = models.resnet18(pretrained=True).eval().to("cuda")
+    input = torch.randn((idx + 1, 3, 224, 224)).to("cuda")
+
+    compile_spec = {
+        "inputs": [
+            torchtrt.Input(
+                input.shape, dtype=torch.float, format=torch.contiguous_format
+            )
+        ],
+        "device": torchtrt.Device("cuda:0"),
+        "enabled_precisions": {torch.float},
+        "ir": ir,
+        "pass_through_build_failures": True,
+        "optimization_level": 1,
+        "cache_built_engines": False,
+        "reuse_cached_engines": False,
+    }
+
+    trt_mod = torchtrt.compile(model, **compile_spec)
+    cos_sim = cosine_similarity(model(input), trt_mod(input))
+    assertions.assertTrue(
+        cos_sim > COSINE_THRESHOLD,
+        msg=f"In multiprocess compilation test, process {idx} failed: Resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+    )
+
+
+@pytest.mark.unit
+@unittest.skipIf(
+    not importlib.util.find_spec("torchvision"),
+    "torchvision is not installed",
+)
+def test_resnet18_multiprocess(ir):
+    import torch.multiprocessing as mp
+
+    # Use a private spawn context so the global start method is left untouched.
+    ctx = mp.get_context("spawn")
+    procs = []
+    for i in range(3):
+        p = ctx.Process(target=compile_one, args=(i, ir))
+        p.start()
+        procs.append(p)
+    for p in procs:
+        p.join()
+    torch._dynamo.reset()
+    # A failed assertion or crash in a child only shows up as a non-zero exit code.
+    for i, p in enumerate(procs):
+        assertions.assertEqual(
+            p.exitcode,
+            0,
+            msg=f"In multiprocess compilation test, process {i} exited with code {p.exitcode}",
+        )
+
+
 @pytest.mark.unit
 @unittest.skipIf(
     not importlib.util.find_spec("torchvision"),

Original file line number	Diff line number	Diff line change
`@@ -17,7 +17,7 @@`
`17`	`17`	`from torch_tensorrt.dynamo.utils import (`
`18`	`18`	`get_cpu_memory_usage,`
`19`	`19`	`get_output_dtypes,`
`20`		`- trim_memory,`
	`20`	`+ release_memory,`
`21`	`21`	`)`
`22`	`22`
`23`	`23`	`logger = logging.getLogger(__name__)`
`@@ -108,8 +108,10 @@ def convert_module(`
`108`	`108`	`)`
`109`	`109`
`110`	`110`	`# Delete the frozen parameters from the module to release CPU memory`
`111`		`- [delattr(module, attr) for attr in dir(module) if attr.startswith("_frozen_param")]`
`112`		`- trim_memory()`
	`111`	`+ for attr in dir(module):`
	`112`	`+ if attr.startswith("_frozen_param"):`
	`113`	`+ delattr(module, attr)`
	`114`	`+ release_memory()`
`113`	`115`	`logger.debug(`
`114`	`116`	`f"CPU memory usage after clearing frozen parameters and building memory: {get_cpu_memory_usage()} MB"`
`115`	`117`	`)`