-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathDockerfile.rocm
More file actions
31 lines (26 loc) · 1.19 KB
/
Dockerfile.rocm
File metadata and controls
31 lines (26 loc) · 1.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# Start from a ROCm-capable PyTorch training image (tag pinned for reproducibility).
FROM rocm/pytorch-training:v25.4

# Remove the tb-nightly package before installing the project requirements.
# NOTE(review): presumably it conflicts with a package pulled in by
# requirements.txt -- confirm the exact reason.
RUN pip3 uninstall -y tb-nightly

# Copy the AMDiffusionBenchmark sources into the image and install requirements.
# WORKDIR creates the directory and makes it the cwd for the following steps,
# so no `cd` is needed inside RUN.
WORKDIR /workspace/AMDiffusionBenchmark
COPY . .
RUN pip3 install --no-cache-dir --upgrade pip packaging && \
    pip3 install --no-cache-dir -r requirements.txt
# Point the Hugging Face home directory (HF_HOME) at a path inside the workspace.
ENV HF_HOME=/workspace/huggingface

# Raise the TorchDynamo cache size limit to 64 to avoid torch.compile hitting it.
# NOTE(review): confirm which component reads this variable (the benchmark code
# or the PyTorch build in the base image).
ENV TORCH_DYNAMO_CACHE_SIZE_LIMIT=64

# Environment variables that maximize performance:

# Enable hipBLASLt as the backend for optimized GEMM (General Matrix Multiply)
# operations in rocBLAS.
ENV ROCBLAS_USE_HIPBLASLT=1

# Keep PyTorch's Lt (cuBLASLt/hipBLASLt) path for addmm enabled.
# This is a "disable" switch, so 0 means "do not disable".
ENV DISABLE_ADDMM_CUDA_LT=0

# Force the HIP runtime to place kernel arguments directly in device memory
# instead of host memory.
ENV HIP_FORCE_DEV_KERNARG=1

# Run NCCL (RCCL on ROCm) operations on high-priority streams.
ENV TORCH_NCCL_HIGH_PRIORITY=1

# Configure the maximum number of hardware queues per GPU in the HIP runtime,
# enabling better parallelism.
ENV GPU_MAX_HW_QUEUES=6

# Make the AMDiffusionBenchmark checkout the default working directory of the image.
WORKDIR /workspace/AMDiffusionBenchmark