
Commit 14762e0

[None][fix] Replace PYTORCH_CUDA_ALLOC_CONF with PYTORCH_ALLOC_CONF to fix deprecation warning (#9294)
Signed-off-by: Jiagan Cheng <jiaganc@nvidia.com>
1 parent 03331bc commit 14762e0

File tree

5 files changed: +9 −9 lines changed

docker/Dockerfile.multi

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ FROM base AS devel
 
 #
 # NB: PyTorch requires this to be < 1.0
-ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999"
+ENV PYTORCH_ALLOC_CONF="garbage_collection_threshold:0.99999"
 
 # Copy all installation scripts at once to reduce layers
 COPY docker/common/install.sh \
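The renamed variable keeps the same `key:value[,key:value]` syntax, so existing settings carry over unchanged. A minimal sketch of how such a config string breaks down into key/value pairs — `parse_alloc_conf` is a hypothetical helper for illustration only; PyTorch parses the variable internally when its CUDA caching allocator initializes:

```python
import os


def parse_alloc_conf(conf: str) -> dict:
    """Split a 'key:value,key:value' allocator config string into a dict.

    Hypothetical helper, not part of PyTorch's API.
    """
    pairs = {}
    for entry in conf.split(","):
        if not entry:
            continue
        key, _, value = entry.partition(":")
        pairs[key.strip()] = value.strip()
    return pairs


# The value set by the Dockerfile above:
os.environ["PYTORCH_ALLOC_CONF"] = "garbage_collection_threshold:0.99999"
conf = parse_alloc_conf(os.environ["PYTORCH_ALLOC_CONF"])
print(conf["garbage_collection_threshold"])  # -> 0.99999
```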

docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md

Lines changed: 1 addition & 1 deletion
@@ -250,7 +250,7 @@ Here is an example response, showing that the TensorRT LLM server returns “New
 ### Troubleshooting Tips
 
 * If you encounter CUDA out-of-memory errors, try reducing `max_batch_size` or `max_seq_len`.
-* For running input/output sequence lengths of 8K/1K on H200, there is a known CUDA Out-Of-Memory issue caused by the PyTorch CUDA Caching Allocator fragmenting memory. As a workaround, you can set the environment variable `PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:8192`. For more details, please refer to the [PyTorch documentation on optimizing memory usage](https://docs.pytorch.org/docs/stable/notes/cuda.html#optimizing-memory-usage-with-pytorch-cuda-alloc-conf).
+* For running input/output sequence lengths of 8K/1K on H200, there is a known CUDA Out-Of-Memory issue caused by the PyTorch CUDA Caching Allocator fragmenting memory. As a workaround, you can set the environment variable `PYTORCH_ALLOC_CONF=max_split_size_mb:8192`. For more details, please refer to the [PyTorch documentation on optimizing memory usage](https://docs.pytorch.org/docs/stable/notes/cuda.html#optimizing-memory-usage-with-pytorch-cuda-alloc-conf).
 * Ensure your model checkpoints are compatible with the expected format.
 * For performance issues, check GPU utilization with nvidia-smi while the server is running.
 * If the container fails to start, verify that the NVIDIA Container Toolkit is properly installed.
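The workaround in the troubleshooting tip above only takes effect if the variable is in the process environment before PyTorch initializes its CUDA allocator. A minimal sketch, assuming the server is launched as a child process from a hypothetical `serve.py` entry point:

```python
import os

# Set the allocator config before any CUDA allocation happens; in practice,
# set it before `import torch` (or export it in the launching shell).
os.environ["PYTORCH_ALLOC_CONF"] = "max_split_size_mb:8192"

# Hypothetical launch: a child process inherits the variable.
# import subprocess
# subprocess.run(["python", "serve.py"])  # serve.py is an assumed entry point

print(os.environ["PYTORCH_ALLOC_CONF"])  # -> max_split_size_mb:8192
```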

enroot/Makefile

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ run_sqsh:
 		--container-image "$(SQSH_PATH)" \
 		--container-mounts "$(SOURCE_DIR):$(CODE_DIR)" --container-workdir $(CODE_DIR) \
 		--container-mount-home --container-remap-root \
-		--export PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.99999 \
+		--export PYTORCH_ALLOC_CONF=garbage_collection_threshold:0.99999 \
 		$(RUN_CMD)
 
 endif

jenkins/current_image_tags.properties

Lines changed: 4 additions & 4 deletions
@@ -13,7 +13,7 @@
 # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
 IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm
 
-LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511200955-9055
-LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511200955-9055
-LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202511200955-9055
-LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202511200955-9055
+LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511271125-9294
+LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511271125-9294
+LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202511271125-9294
+LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202511271125-9294

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 2 additions & 2 deletions
@@ -965,15 +965,15 @@ def _adjust_torch_mem_fraction():
     # torch.cuda._set_allocator_settings (added in PyTorch 2.8.0-rc1)
     # or a similar API is available, the warning below should be removed
     # and the allocator GC threshold be set via the new API instead.
-    torch_allocator_config = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
+    torch_allocator_config = os.environ.get("PYTORCH_ALLOC_CONF", "")
     torch_mem_threshold_advised = (
         torch.cuda.get_allocator_backend() == "native"
         and "expandable_segments:True" not in torch_allocator_config)
     torch_mem_threshold_set = "garbage_collection_threshold:" in torch_allocator_config
     if torch_mem_threshold_advised and not torch_mem_threshold_set:
         logger.warning(
             "It is recommended to incl. 'garbage_collection_threshold:0.???' or 'backend:cudaMallocAsync'"
-            " or 'expandable_segments:True' in PYTORCH_CUDA_ALLOC_CONF.")
+            " or 'expandable_segments:True' in PYTORCH_ALLOC_CONF.")
 
     # NOTE: Even if a memory threshold was not set (cf. warning above), setting a memory
     # fraction < 1.0 is beneficial, because
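The advisory in `_adjust_torch_mem_fraction` reduces to a pure string check on the config value. A standalone sketch of that logic, with the allocator backend passed in as a parameter instead of queried via `torch.cuda.get_allocator_backend()` so it runs without a GPU:

```python
def gc_threshold_advice(allocator_backend: str, alloc_conf: str) -> bool:
    """Return True when the PYTORCH_ALLOC_CONF warning should fire.

    Mirrors the check above: a GC threshold is advised only for the
    native caching allocator without expandable segments, and only
    when no threshold is configured yet.
    """
    advised = (allocator_backend == "native"
               and "expandable_segments:True" not in alloc_conf)
    already_set = "garbage_collection_threshold:" in alloc_conf
    return advised and not already_set


# No config set: the warning fires for the native allocator.
print(gc_threshold_advice("native", ""))  # -> True
# The Dockerfile default already sets a threshold: no warning.
print(gc_threshold_advice("native", "garbage_collection_threshold:0.99999"))  # -> False
# cudaMallocAsync backend: no warning either.
print(gc_threshold_advice("cudaMallocAsync", ""))  # -> False
```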
