Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/pull_request_template.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,5 +54,5 @@ To make sure the code is performing as expected on GPU devices, I have:
- [ ] Ran the code on MI200+ GPUs and ensured the new features performed as expected (the GPU results match the CPU results)
- [ ] Enclosed the new feature via `nvtx` ranges so that they can be identified in profiles
- [ ] Ran an Nsight Systems profile using `./mfc.sh run XXXX --gpu -t simulation --nsys`, and have attached the output file (`.nsys-rep`) and plain text results to this PR
- [ ] Ran an Omniperf profile using `./mfc.sh run XXXX --gpu -t simulation --omniperf`, and have attached the output file and plain text results to this PR.
- [ ] Ran a Rocprof Systems profile using `./mfc.sh run XXXX --gpu -t simulation --rsys --hip-trace`, and have attached the output file and plain text results to this PR.
- [ ] Ran my code using various numbers of different GPUs (1, 2, and 8, for example) in parallel and made sure that the results scale similarly to what happens if you run without the new code/feature
6 changes: 3 additions & 3 deletions docs/documentation/running.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,13 +98,13 @@ Learn more about NVIDIA Nsight Compute [here](https://docs.nvidia.com/nsight-com


#### AMD GPUs
- Rocprof (ROC): `./mfc.sh run ... -t simulation --roc --hip-trace [rocprof flags]` allows one to visualize MFC's system-wide performance with [Perfetto UI](https://ui.perfetto.dev/).
- Rocprof Systems (RSYS): `./mfc.sh run ... -t simulation --rsys --hip-trace [rocprof flags]` allows one to visualize MFC's system-wide performance with [Perfetto UI](https://ui.perfetto.dev/).
When used, `--rsys` will run the simulation and generate files in the case directory for all targets.
`results.json` can then be imported in [Perfetto's UI](https://ui.perfetto.dev/).
Learn more about AMD Rocprof [here](https://rocm.docs.amd.com/projects/rocprofiler/en/docs-5.5.1/rocprof.html).
It is best to run case files with few timesteps to keep the report file sizes manageable.
- Omniperf (OMNI): `./mfc.sh run ... -t simulation --omni [omniperf flags]` allows one to conduct kernel-level profiling with [AMD's Omniperf](https://rocm.docs.amd.com/projects/omniperf/en/latest/index.html).
When used, `--omni` will output profiling information for all subroutines, including rooflines, cache usage, register usage, and more, after the simulation is run.
- Rocprof Compute (RCU): `./mfc.sh run ... -t simulation --rcu -n <name> [rocprof-compute flags]` allows one to conduct kernel-level profiling with [ROCm Compute Profiler](https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/what-is-rocprof-compute.html).
When used, `--rcu` will output profiling information for all subroutines, including rooflines, cache usage, register usage, and more, after the simulation is run.
Adding this argument will moderately slow down the simulation and run the MFC executable several times.
For this reason, it should only be used with case files with few timesteps.

Expand Down
4 changes: 2 additions & 2 deletions toolchain/mfc/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,8 @@ def add_common_arguments(p, mask = None):
run.add_argument("--clean", action="store_true", default=False, help="Clean the case before running.")
run.add_argument("--ncu", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Compute.")
run.add_argument("--nsys", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Systems.")
run.add_argument("--omni", nargs=argparse.REMAINDER, type=str, help="Profile with ROCM omniperf.")
run.add_argument("--roc", nargs=argparse.REMAINDER, type=str, help="Profile with ROCM rocprof.")
run.add_argument("--rcu", nargs=argparse.REMAINDER, type=str, help="Profile with ROCM rocprof-compute.")
run.add_argument("--rsys", nargs=argparse.REMAINDER, type=str, help="Profile with ROCM rocprof-systems.")

# BENCH
add_common_arguments(bench)
Expand Down
14 changes: 7 additions & 7 deletions toolchain/mfc/run/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,17 @@ def __profiler_prepend() -> typing.List[str]:

return ["nsys", "profile", "--stats=true", "--trace=mpi,nvtx,openacc"] + ARG("nsys")

if ARG("omni") is not None:
if not does_command_exist("omniperf"):
raise MFCException("Failed to locate [bold red]ROCM Omniperf[/bold red] (omniperf).")
if ARG("rcu") is not None:
if not does_command_exist("rocprof-compute"):
raise MFCException("Failed to locate [bold red]ROCM rocprof-compute[/bold red] (rocprof-compute).")

return ["omniperf", "profile"] + ARG("omni") + ["--"]
return ["rocprof-compute", "profile", "-n", ARG("name").replace('-', '_').replace('.', '_')] + ARG("rcu") + ["--"]

if ARG("roc") is not None:
if ARG("rsys") is not None:
if not does_command_exist("rocprof"):
raise MFCException("Failed to locate [bold red]ROCM rocprof[/bold red] (rocprof).")
raise MFCException("Failed to locate [bold red]ROCM rocprof-systems[/bold red] (rocprof-systems).")

return ["rocprof"] + ARG("roc")
return ["rocprof"] + ARG("rsys")

return []

Expand Down
6 changes: 3 additions & 3 deletions toolchain/modules
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ p-gpu nvhpc/24.5 hpcx/2.19-cuda cuda/12.1.1
p-gpu MFC_CUDA_CC=70,75,80,89,90 NVHPC_CUDA_HOME=$CUDA_HOME CC=nvc CXX=nvc++ FC=nvfortran

f OLCF Frontier
f-all cce/18.0.0 cpe/24.07 rocm/6.1.3 cray-mpich/8.1.28
f-all cray-fftw cray-hdf5 cray-python omniperf
f-gpu craype-accel-amd-gfx90a
f-all cpe/25.03 rocm/6.3.1
f-all cray-fftw cray-hdf5 cray-python
f-gpu craype-accel-amd-gfx90a rocprofiler-compute/3.0.0

d NCSA Delta
d-all python/3.11.6
Expand Down
14 changes: 13 additions & 1 deletion toolchain/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,19 @@ dependencies = [

# Chemistry
"cantera==3.1.0",
"pyrometheus == 1.0.3"
"pyrometheus == 1.0.3",

# Frontier Profiling
"astunparse==1.6.2",
"colorlover",
"dash>=1.12.0",
"pymongo",
"tabulate",
"tqdm",
"dash-svg",
"dash-bootstrap-components",
"kaleido",
"plotille"
]

[tool.hatch.metadata]
Expand Down
11 changes: 9 additions & 2 deletions toolchain/templates/frontier.mako
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,11 @@ export MPICH_GPU_SUPPORT_ENABLED=1
${helpers.run_prologue(target)}

% if not mpi:
(set -x; ${profiler} "${target.get_install_binpath(case)}")
(set -x; \
% if target.name == 'simulation':
${profiler} \
% endif
"${target.get_install_binpath(case)}")
% else:
(set -x; srun \
% if engine == 'interactive':
Expand All @@ -54,7 +58,10 @@ export MPICH_GPU_SUPPORT_ENABLED=1
--gpus-per-task 1 --gpu-bind closest \
% endif
% endif
${profiler} "${target.get_install_binpath(case)}")
% if target.name == 'simulation':
${profiler} \
% endif
"${target.get_install_binpath(case)}")
% endif

${helpers.run_epilogue(target)}
Expand Down
Loading