Skip to content

Commit 125b386

Browse files
committed
Detect Nvidia arch automatically also for multiple GPUs and compile with right flags and forward compatibility
1 parent 1cce307 commit 125b386

File tree

1 file changed

+23
-6
lines changed
  • epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu

1 file changed

+23
-6
lines changed

epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -180,15 +180,32 @@ ifeq ($(BACKEND),cuda)
180180
# NVidia CUDA architecture flags
181181
# See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
182182
# See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
183-
# Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster).
184-
# This will embed device code for 70, and PTX for 70+.
183+
# Default: detect the compute capabilities of all GPUs (e.g., "8.0", "8.6", "9.0"), de-duplicated and sorted from lowest to highest,
184+
# then embed device code for each compute capability, plus PTX for the highest one only (forward compatibility).
185+
# Query nvidia-smi and validate its output with grep before using it.
186+
# DETECTED_CC: space-separated list of compute capabilities with the dot removed (e.g. "80 86 90"), or empty.
# Pipeline: nvidia-smi prints the compute_cap field as CSV without a header (stderr is discarded);
# grep keeps only lines of the exact form <digits>.<digits> ('$$' is make's escape for the regex anchor '$');
# tr deletes the dot; sort -un sorts numerically and drops duplicates.
# Evaluated once at parse time because of ':='.
DETECTED_CC := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+\.[0-9]+$$' | tr -d '.' | sort -un)
185187
# One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533).
186188
# Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity).
187-
MADGRAPH_CUDA_ARCHITECTURE ?= 70
188-
###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533
189-
###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533
190189
comma:=,
191-
GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch))
190+
# Default value: the detected compute capabilities joined by commas, e.g. "80,86,90".
# A plain $(foreach ...,$(arch)$(comma)) would yield "80, 86, 90," (embedded spaces and a trailing comma),
# which is not the comma-separated format that users are asked to pass on the command line.
# $(strip) collapses whitespace to single spaces, then every space is replaced by a comma.
# '?=' keeps any value supplied by the user (command line or environment); the result is empty if detection found nothing.
cudacpp_empty :=
cudacpp_space := $(cudacpp_empty) $(cudacpp_empty)
MADGRAPH_CUDA_ARCHITECTURE ?= $(subst $(cudacpp_space),$(comma),$(strip $(DETECTED_CC)))
191+
# Convert to space-separated list for looping
192+
MADGRAPH_CUDA_ARCH_LIST ?= $(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE))
193+
194+
# Fallback if detection failed (the CUDA backend is selected but the nvidia-smi probe returned no usable compute capability)
195+
ifeq ($(strip $(MADGRAPH_CUDA_ARCH_LIST)),)
196+
# Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster)
197+
# This will embed device code for 70, and PTX for 70+
198+
MADGRAPH_CUDA_ARCHITECTURE := 70
199+
MADGRAPH_CUDA_ARCH_LIST := 70
200+
$(info Automatic compute capability detection failed; defaulting to $(MADGRAPH_CUDA_ARCHITECTURE))
201+
$(info Override with: make MADGRAPH_CUDA_ARCHITECTURE=<comma-separated list of architectures>)
202+
endif
203+
204+
# Build for every detected SM, and add one PTX for the highest SM (forward-compatibility)
205+
# HIGHEST_SM: the numerically largest architecture in MADGRAPH_CUDA_ARCH_LIST; PTX is embedded for this one only.
# $(lastword) would only be correct for a sorted list: the auto-detected list is sorted, but a user-supplied
# one (e.g. MADGRAPH_CUDA_ARCHITECTURE=90,70) need not be. make's own $(sort) is lexical ("100" < "70"),
# so the numeric maximum is taken with 'sort -n' in the shell (evaluated once, at parse time).
HIGHEST_SM := $(shell printf '%s\n' $(MADGRAPH_CUDA_ARCH_LIST) | sort -n | tail -n 1)
206+
GENCODE_FLAGS := $(foreach arch,$(MADGRAPH_CUDA_ARCH_LIST),-gencode arch=compute_$(arch),code=sm_$(arch))
207+
GENCODE_PTX := -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
208+
GPUARCHFLAGS := $(GENCODE_FLAGS) $(GENCODE_PTX)
192209
GPUFLAGS += $(GPUARCHFLAGS)
193210

194211
# Other NVidia-specific flags

0 commit comments

Comments
 (0)