diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index b5e89a2b5e..63c14ad7ad 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -97,10 +97,11 @@ jobs:
       run: |
         cat pr/bench-${{ matrix.device }}.* 2>/dev/null || true
         cat master/bench-${{ matrix.device }}.* 2>/dev/null || true
-
-    - name: Archive Logs
+
+    # All other runners (non-Phoenix) just run without special env
+    - name: Archive Logs (Frontier)
+      if: always() && matrix.cluster != 'phoenix'
       uses: actions/upload-artifact@v4
-      if: always()
       with:
        name: ${{ matrix.cluster }}-${{ matrix.device }}
        path: |
diff --git a/.github/workflows/phoenix/bench.sh b/.github/workflows/phoenix/bench.sh
index f58ef44721..a0e93f9052 100644
--- a/.github/workflows/phoenix/bench.sh
+++ b/.github/workflows/phoenix/bench.sh
@@ -2,7 +2,8 @@
 
 n_ranks=12
 
-if [ "$job_device" = "gpu" ]; then
+echo "My benchmarking device is:" $device
+if [ "$device" = "gpu" ]; then
     n_ranks=$(nvidia-smi -L | wc -l)        # number of GPUs on node
     gpu_ids=$(seq -s ' ' 0 $(($n_ranks-1))) # 0,1,2,...,gpu_count-1
     device_opts="--gpu -g $gpu_ids"
@@ -15,7 +16,7 @@ mkdir -p $currentdir
 
 export TMPDIR=$currentdir
 
-if [ "$job_device" = "gpu" ]; then
+if [ "$device" = "gpu" ]; then
     ./mfc.sh bench --mem 12 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
 else
     ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
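Editorial note on the `bench.sh` hunk above (not part of the patch): the script now reads `$device` instead of `$job_device`, so it only behaves as intended when the submitting side exports that variable into the batch job's environment, which the rewritten submit scripts below do inside their heredocs. A minimal sketch of that hand-off, with illustrative values:

```bash
# Sketch only; "gpu" and the slug are example values, not taken from the patch.
export device="gpu"          # normally exported inside the generated SBATCH script
export job_slug="bench-gpu"  # hypothetical slug derived from the benchmark script name
bash .github/workflows/phoenix/bench.sh   # picks rank count and GPU flags from $device
```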
diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh
index e8b6dd3484..91160dd73a 100644
--- a/.github/workflows/phoenix/submit-bench.sh
+++ b/.github/workflows/phoenix/submit-bench.sh
@@ -1,64 +1,107 @@
-#!/bin/bash
-
-set -e
+#!/usr/bin/env bash
+set -euo pipefail
 
 usage() {
     echo "Usage: $0 [script.sh] [cpu|gpu]"
+    exit 1
 }
 
-if [ ! -z "$1" ]; then
-    sbatch_script_contents=`cat $1`
-else
-    usage
-    exit 1
-fi
+[[ $# -eq 2 ]] || usage
 
-sbatch_cpu_opts="\
+sbatch_script="$1"
+
+device="$2"
+job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
+
+# read the body of the user script
+sbatch_body=$(<"$sbatch_script")
+
+# common SBATCH directives
+sbatch_common_opts="\
+#SBATCH -J shb-${sbatch_script%%.sh}-$device  # job name
+#SBATCH --account=gts-sbryngelson3            # account
+#SBATCH -N1                                   # nodes
+#SBATCH -t 02:00:00                           # walltime
+#SBATCH -q embers                             # QOS
+#SBATCH -o $job_slug.out                      # stdout+stderr
+#SBATCH --mem-per-cpu=2G                      # default mem (overridden below)
+"
+
+# CPU vs GPU overrides
+if [[ "$device" == "cpu" ]]; then
+    sbatch_device_opts="\
 #SBATCH -p cpu-small                     # partition
 #SBATCH --ntasks-per-node=24             # Number of cores per node required
 #SBATCH --mem-per-cpu=2G                 # Memory per core\
 "
-
-sbatch_gpu_opts="\
+elif [[ "$device" == "gpu" ]]; then
+    sbatch_device_opts="\
 #SBATCH -CL40S
 #SBATCH --ntasks-per-node=4              # Number of cores per node required
 #SBATCH -G2\
 "
-
-if [ "$2" = "cpu" ]; then
-    sbatch_device_opts="$sbatch_cpu_opts"
-elif [ "$2" = "gpu" ]; then
-    sbatch_device_opts="$sbatch_gpu_opts"
 else
-    usage
-    exit 1
+    usage
 fi
 
-job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
+# submit and capture the JobID
+JOBID=$(sbatch <<-EOT | awk '{print $4}'
+    #!/usr/bin/env bash
+    ${sbatch_common_opts}
+    ${sbatch_device_opts}
+
+    export job_slug="${job_slug}"
+    export device="${device}"
-sbatch </dev/null 2>&1 || :' EXIT
-EOT
+# ────────── Poll until SLURM job finishes ──────────
+while :; do
+    # Try sacct first
+    STATE=$(sacct -j "$JOBID" --format=State --noheader --parsable2 | head -n1)
+
+    # Fallback to squeue if sacct is empty
+    if [[ -z "$STATE" ]]; then
+        STATE=$(squeue -j "$JOBID" -h -o "%T" || echo "")
+    fi
+
+    # If it's one of SLURM's terminal states, break immediately
+    case "$STATE" in
+        COMPLETED|FAILED|CANCELLED|TIMEOUT)
+            echo "✅ SLURM job $JOBID reached terminal state: $STATE"
+            break
+            ;;
+        "")
+            echo "✅ SLURM job $JOBID no longer in queue; assuming finished"
+            break
+            ;;
+        *)
+            echo "⏳ SLURM job $JOBID state: $STATE"
+            sleep 10
+            ;;
+    esac
+done
+# Now retrieve the exit code and exit with it
+EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1)
+echo "🔚 SLURM job $JOBID exit code: $EXIT_CODE"
+exit "$EXIT_CODE"
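A usage sketch for the rewritten submit wrapper above (illustrative, not part of the patch). The two positional arguments follow the script's own `Usage: $0 [script.sh] [cpu|gpu]` message, and the exit status is the SLURM job's exit code propagated from `sacct`:

```bash
# Submit the benchmark driver to SLURM on the GPU configuration, then block
# until the job reaches a terminal state; the wrapper exits with the job's code.
.github/workflows/phoenix/submit-bench.sh .github/workflows/phoenix/bench.sh gpu
echo "propagated SLURM exit code: $?"
```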
diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh
index 6700e38c50..182d7a2532 100644
--- a/.github/workflows/phoenix/submit.sh
+++ b/.github/workflows/phoenix/submit.sh
@@ -1,64 +1,100 @@
-#!/bin/bash
-
-set -e
+#!/usr/bin/env bash
+set -euo pipefail
 
 usage() {
     echo "Usage: $0 [script.sh] [cpu|gpu]"
+    exit 1
 }
 
-if [ ! -z "$1" ]; then
-    sbatch_script_contents=`cat $1`
-else
-    usage
-    exit 1
-fi
+[[ $# -eq 2 ]] || usage
+
+sbatch_script="$1"
+device="$2"
+
+job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
+
+# read the body of the user script
+sbatch_body=$(<"$sbatch_script")
 
-sbatch_cpu_opts="\
-#SBATCH -p cpu-small                     # partition
-#SBATCH --ntasks-per-node=24             # Number of cores per node required
-#SBATCH --mem-per-cpu=2G                 # Memory per core\
+# common SBATCH directives
+sbatch_common_opts="\
+#SBATCH -J shb-${sbatch_script%%.sh}-$device  # job name
+#SBATCH --account=gts-sbryngelson3            # account
+#SBATCH -N1                                   # nodes
+#SBATCH -t 03:00:00                           # walltime
+#SBATCH -q embers                             # QOS
+#SBATCH -o $job_slug.out                      # stdout+stderr
+#SBATCH --mem-per-cpu=2G                      # default mem (overridden below)
 "
 
-sbatch_gpu_opts="\
+# CPU vs GPU overrides
+if [[ "$device" == "cpu" ]]; then
+    sbatch_device_opts="\
+#SBATCH -p cpu-small
+#SBATCH --ntasks-per-node=24
+"
+elif [[ "$device" == "gpu" ]]; then
+    sbatch_device_opts="\
 #SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s
-#SBATCH --ntasks-per-node=4              # Number of cores per node required
-#SBATCH -G2\
+#SBATCH --ntasks-per-node=4
+#SBATCH -G2
 "
-
-if [ "$2" = "cpu" ]; then
-    sbatch_device_opts="$sbatch_cpu_opts"
-elif [ "$2" = "gpu" ]; then
-    sbatch_device_opts="$sbatch_gpu_opts"
 else
-    usage
-    exit 1
+    usage
 fi
 
-job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
+# submit and capture the JobID
+JOBID=$(sbatch <<-EOT | awk '{print $4}'
+    #!/usr/bin/env bash
+    ${sbatch_common_opts}
+    ${sbatch_device_opts}
-sbatch </dev/null 2>&1 || :' EXIT
-EOT
+# ────────── Poll until SLURM job finishes ──────────
+while :; do
+    # Try sacct first
+    STATE=$(sacct -j "$JOBID" --format=State --noheader --parsable2 | head -n1)
+
+    # Fallback to squeue if sacct is empty
+    if [[ -z "$STATE" ]]; then
+        STATE=$(squeue -j "$JOBID" -h -o "%T" || echo "")
+    fi
+
+    # If it's one of SLURM's terminal states, break immediately
+    case "$STATE" in
+        COMPLETED|FAILED|CANCELLED|TIMEOUT)
+            echo "✅ SLURM job $JOBID reached terminal state: $STATE"
+            break
+            ;;
+        "")
+            echo "✅ SLURM job $JOBID no longer in queue; assuming finished"
+            break
+            ;;
+        *)
+            echo "⏳ SLURM job $JOBID state: $STATE"
+            sleep 10
+            ;;
+    esac
+done
+# Now retrieve the exit code and exit with it
+EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1)
+echo "🔚 SLURM job $JOBID exit code: $EXIT_CODE"
+exit "$EXIT_CODE"
diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh
index 5582e9f6d5..60b9920f51 100644
--- a/.github/workflows/phoenix/test.sh
+++ b/.github/workflows/phoenix/test.sh
@@ -1,13 +1,19 @@
 #!/bin/bash
 
+tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build
+currentdir=$tmpbuild/run-$(( RANDOM % 900 ))
+mkdir -p $tmpbuild
+mkdir -p $currentdir
+export TMPDIR=$currentdir
+
+n_test_threads=8
+
 build_opts=""
 if [ "$job_device" = "gpu" ]; then
     build_opts="--gpu"
 fi
 
-./mfc.sh test --dry-run -j 8 $build_opts
-
-n_test_threads=8
+./mfc.sh test --dry-run -j $n_test_threads $build_opts
 
 if [ "$job_device" = "gpu" ]; then
     gpu_count=$(nvidia-smi -L | wc -l) # number of GPUs on node
@@ -18,4 +24,7 @@
 fi
 
 ./mfc.sh test --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix
 
+sleep 10
+rm -rf "$currentdir" || true
+unset TMPDIR
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 2948b11333..db618bea46 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -101,6 +101,7 @@ jobs:
       group: phoenix
       labels: ${{ matrix.lbl }}
     env:
+      NODE_OPTIONS: ${{ matrix.lbl == 'gt' && '--max-old-space-size=2048' || '' }}
       ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
       ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     steps:
@@ -125,7 +126,7 @@
     - name: Archive Logs
       uses: actions/upload-artifact@v4
-      if: always()
+      if: matrix.lbl == 'frontier'
      with:
        name: logs-${{ strategy.job-index }}-${{ matrix.device }}
        path: test-${{ matrix.device }}.out
diff --git a/misc/starting-phoenix-runners.md b/misc/starting-phoenix-runners.md
new file mode 100644
index 0000000000..5e77fbd189
--- /dev/null
+++ b/misc/starting-phoenix-runners.md
@@ -0,0 +1,110 @@
+# Launching Phoenix Runners
+
+The Phoenix runners were repeatedly failing due to a network error.
+Spencer managed to fix it via [this PR](https://github.com/MFlowCode/MFC/pull/933) and by running things through a socks5 proxy on each login node that holds a runner.
+These steps are documented for Spencer or his next of kin.
+
+__The runners are started via the following process__
+
+1. Log in to the login node via `ssh login-phoenix-rh9-<N>.pace.gatech.edu`. `<N>` can be `1` through `6` on Phoenix.
+    * Detour: Make sure no stray `ssh` daemons are sitting around: `pkill -9 sshd`.
+    * You can probably keep your terminal alive via `fuser -k -9 ~/nohup.out`, which kills (signal 9) whatever process is writing to that no-hangup file (the daemon we care about).
+2. Log back into the same login node, because you may have just nuked your session.
+    * Detour: Make sure stray runners on that login node are dead (one-liner): `pkill -9 -f -E 'run.sh|Runner.listener|Runner.helper'`
+    * If cautious, check that no runner processes are left over: `top`, followed by `u`, then `<username>` and return.
+3. Execute from your home directory: `nohup ssh -N -D 1080 -vvv login-phoenix-rh9-<N>.pace.gatech.edu &`, replacing `<N>` with the login node number.
+    * This starts a proxy to tunnel a new ssh session through.
+4. Navigate to your runner's directory (or create a runner directory if you need one).
+    * Right now they are in Spencer's `scratch/mfc-runners/action-runner-<N>`.
+5. Run the alias `start_runner`, which dumps output to `~/runner.out`.
+    * If one doesn't have this alias yet, create and source it in your `.bashrc` or similar:
+```bash
+alias start_runner=' \
+  http_proxy="socks5://localhost:1080" \
+  https_proxy="socks5://localhost:1080" \
+  no_proxy="localhost,127.0.0.1,github.com,api.github.com,pipelines.actions.githubusercontent.com,alive.github.com,pypi.org,files.pythonhosted.org,fftw.org,www.fftw.org" \
+  NO_PROXY="localhost,127.0.0.1,github.com,api.github.com,pipelines.actions.githubusercontent.com,alive.github.com,pypi.org,files.pythonhosted.org,fftw.org,www.fftw.org" \
+  RUNNER_DEBUG=1 \
+  ACTIONS_STEP_DEBUG=1 \
+  GITHUB_ACTIONS_RUNNER_PREFER_IP_FAMILY=ipv4 \
+  DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_TIME=00:01:00 \
+  DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_INTERVAL=00:00:20 \
+  DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_RETRYCOUNT=5 \
+  nohup ./run.sh > ~/runner.out 2>&1 &'
+```
+6. You're done.
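Editorial aside (not part of the patch or the file above): before launching the runner in step 5, one might want to confirm the SOCKS tunnel from step 3 is actually up and usable. The exact check below is an assumption, not something the notes prescribe; `ss` and `curl` are standard on the Phoenix login nodes' Linux environment.

```bash
# Sketch of a tunnel sanity check, assuming the proxy from step 3 is on port 1080.
ss -tlnp | grep 1080                  # is anything listening on localhost:1080?
curl -s --socks5-hostname localhost:1080 https://api.github.com/zen \
    && echo "proxy path to GitHub works"
```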
+
+
+### For inquisitive minds
+
+__Why the `start_runner` alias?__
+
+1. `alias start_runner='…'`
+   Defines a new shell alias named `start_runner`. Whenever you run `start_runner`, the shell will execute everything between the single quotes as if you'd typed it at the prompt.
+
+2. `http_proxy="socks5://localhost:1080"`
+   Sets the `http_proxy` environment variable so that any HTTP traffic from the runner is sent through a SOCKS5 proxy listening on `localhost:1080`.
+
+3. `https_proxy="socks5://localhost:1080"`
+   Tells HTTPS-aware tools to use that same local SOCKS5 proxy for HTTPS requests.
+
+4. `no_proxy="localhost,127.0.0.1,github.com,api.github.com,pipelines.actions.githubusercontent.com,alive.github.com,pypi.org,files.pythonhosted.org,fftw.org,www.fftw.org"`
+   Lists hosts and domains that should bypass the proxy entirely. Commonly used for internal or high-volume endpoints where you don't want proxy overhead.
+
+5. `NO_PROXY="localhost,127.0.0.1,github.com,api.github.com,pipelines.actions.githubusercontent.com,alive.github.com,pypi.org,files.pythonhosted.org,fftw.org,www.fftw.org"`
+   Same list as `no_proxy`; some programs only check the uppercase `NO_PROXY` variable.
+
+6. `RUNNER_DEBUG=1`
+   Enables debug-level logging in the GitHub Actions runner itself, so you'll see more verbose internal messages in its logs.
+
+7. `ACTIONS_STEP_DEBUG=1`
+   Turns on step-level debug logging for actions you invoke; handy if you need to trace exactly what each action is doing under the hood.
+
+8. `GITHUB_ACTIONS_RUNNER_PREFER_IP_FAMILY=ipv4`
+   Forces the runner to resolve DNS names to IPv4 addresses only. Useful if your proxy or network has spotty IPv6 support.
+
+9. `DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_TIME=00:01:00`
+   For .NET-based tasks: sets the initial TCP keepalive timeout to 1 minute (after 1 minute of idle, a keepalive probe is sent).
+
+10. `DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_INTERVAL=00:00:20`
+    If the first keepalive probe gets no response, wait 20 seconds between subsequent probes.
+
+11. `DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_RETRYCOUNT=5`
+    If probes continue to go unanswered, retry up to 5 times before declaring the connection dead.
+
+12. `nohup ./run.sh > ~/runner.out 2>&1 &`
+    - `nohup … &` runs `./run.sh` in the background and makes it immune to hangups (so it keeps running if you log out).
+    - `> ~/runner.out` redirects **stdout** to the file `runner.out` in your home directory.
+    - `2>&1` redirects **stderr** into the same file, so you get a combined log of everything the script prints.
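Editorial aside (not in the original notes): after starting the runner, one can check that the proxy and keepalive variables from the alias actually reached the runner process. The pattern below targets the `run.sh` process the notes launch; the grep expression is an illustrative assumption.

```bash
# Confirm the alias's environment made it into the running runner process.
pid=$(pgrep -n -f run.sh)                          # newest process matching run.sh
tr '\0' '\n' < "/proc/$pid/environ" | grep -Ei 'proxy|keepalive'
```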
+
+__Why the extra ssh command?__
+
+1. `http_proxy="socks5://localhost:1080"`
+   Routes all HTTP traffic through a local SOCKS5 proxy on port 1080.
+
+2. `https_proxy="socks5://localhost:1080"`
+   Routes all HTTPS traffic through the same proxy.
+
+3. `no_proxy="localhost,127.0.0.1,github.com,api.github.com,pipelines.actions.githubusercontent.com,alive.github.com,pypi.org,files.pythonhosted.org,fftw.org,www.fftw.org"`
+   Specifies hosts and domains that bypass the proxy entirely. Includes specific things that MFC's CMake will try to `wget` (e.g., `fftw`) or fetch with some other non-`git` command. Allows `git clone` to work.
+
+4. `NO_PROXY="localhost,127.0.0.1,github.com,api.github.com,pipelines.actions.githubusercontent.com,alive.github.com,pypi.org,files.pythonhosted.org,fftw.org,www.fftw.org"`
+   Same bypass list for applications that only check the uppercase variable.
+
+5. `RUNNER_DEBUG=1`
+   Enables verbose internal logging in the GitHub Actions runner.
+
+6. `GITHUB_ACTIONS_RUNNER_PREFER_IP_FAMILY=ipv4`
+   Forces DNS resolution to IPv4 to avoid IPv6 issues.
+
+7. `DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_TIME=00:01:00`
+   (For .NET tasks) sends the first TCP keepalive probe after 1 minute of idle.
+
+8. `DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_INTERVAL=00:00:20`
+   Waits 20 seconds between subsequent TCP keepalive probes.
+
+9. `DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_RETRYCOUNT=5`
+   Retries keepalive probes up to 5 times before closing the connection.
+
+10. `nohup ./run.sh > ~/runner.out 2>&1 &`
+    Runs `run.sh` in the background, immune to hangups, redirecting both stdout and stderr to `~/runner.out`.
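Editorial footnote to the new `misc/starting-phoenix-runners.md` file: the three moving parts (SOCKS tunnel, proxy environment, `run.sh`) can be read as one sequence. The sketch below is hypothetical glue combining steps 3 to 5 of the notes, not a script that exists in the repository; the node number, runner path, and trimmed `no_proxy` list are placeholders taken from the notes above.

```bash
#!/usr/bin/env bash
# Hypothetical helper combining steps 3-5 of the notes above; not part of MFC.
set -euo pipefail

N=1                                                  # login node number, 1..6
runner_dir=~/scratch/mfc-runners/action-runner-$N    # placeholder path from the notes

# Step 3: background SOCKS5 tunnel through the login node
nohup ssh -N -D 1080 -vvv "login-phoenix-rh9-$N.pace.gatech.edu" >/dev/null 2>&1 &
sleep 5   # give the tunnel a moment to come up

# Steps 4-5: launch the runner behind the proxy (variable set trimmed for brevity)
cd "$runner_dir"
http_proxy="socks5://localhost:1080" \
https_proxy="socks5://localhost:1080" \
no_proxy="localhost,127.0.0.1,github.com,api.github.com" \
nohup ./run.sh > ~/runner.out 2>&1 &
```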