# AMDiffusionBenchmark/Dockerfile.rocm (ROCm/AMDiffusionBenchmark, main branch)

# Base image: ROCm-enabled PyTorch training image, pinned to tag v25.4.
FROM rocm/pytorch-training:v25.4
# Remove the tb-nightly package from the Python environment (non-interactive).
# NOTE(review): the reason for removing it is not recorded here - presumably a
# conflict with the project requirements; confirm.
RUN pip3 uninstall --yes tb-nightly
# Relative paths in the following instructions resolve against /workspace.
WORKDIR /workspace

# Copy the build context into /workspace/AMDiffusionBenchmark and install the
# project's Python requirements from inside that directory.
COPY . /workspace/AMDiffusionBenchmark
# Use WORKDIR rather than `cd` inside RUN so the directory change is explicit;
# requirements.txt is resolved relative to this directory.
WORKDIR /workspace/AMDiffusionBenchmark
# Upgrade pip and packaging first, then install the requirements;
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip3 install --no-cache-dir --upgrade pip packaging && \
    pip3 install --no-cache-dir -r requirements.txt

# Hugging Face home directory (HF_HOME) lives under /workspace.
ENV HF_HOME=/workspace/huggingface

# Work around torch.compile issues by raising the TorchDynamo cache size limit.
# NOTE(review): confirm which component reads this variable and applies it to
# torch._dynamo's cache size limit.
ENV TORCH_DYNAMO_CACHE_SIZE_LIMIT=64

# Environment variables intended to maximize performance:
# Enable hipBLASLt as the backend for optimized GEMM (General Matrix Multiply) operations in rocBLAS.
ENV ROCBLAS_USE_HIPBLASLT=1
# Keep the cuBLASLt/hipBLASLt path for the addmm operation in PyTorch enabled (0 = not disabled).
ENV DISABLE_ADDMM_CUDA_LT=0
# Force the HIP runtime to place kernel arguments directly in device memory instead of host memory.
ENV HIP_FORCE_DEV_KERNARG=1
# Run NCCL operations on high-priority streams.
ENV TORCH_NCCL_HIGH_PRIORITY=1
# Configure the maximum number of hardware queues per GPU in the HIP runtime.
ENV GPU_MAX_HW_QUEUES=6

# Final working directory: containers start inside the copied
# AMDiffusionBenchmark project directory under /workspace.
WORKDIR /workspace/AMDiffusionBenchmark