Commit bfc8cbe
playing with async
1 parent 67b0925 commit bfc8cbe

10 files changed: +491 -0 lines changed
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
from mpi4py import MPI
import numpy as np
import time


def sleep(n):
    # stand-in for computation: generating n random numbers keeps the CPU busy
    tmp = np.random.rand(n)


comm = MPI.COMM_WORLD
rank = comm.Get_rank()

comm.Barrier()

t0 = time.time()

if rank == 0:
    sbuf = np.empty(40000000)
    sbuf[0] = 0
    sbuf[1:4] = np.random.rand(3)
    # non-blocking send: the transfer can overlap with the busy-work below
    req = comm.Isend(sbuf[:], dest=1, tag=99)
    sleep(100000000)
    req.wait()
    print("[%02d] Original data %s" % (rank, sbuf))
else:
    rbuf = np.empty(40000000)
    sleep(10000000)
    comm.Recv(rbuf[:], source=0, tag=99)
    print("[%02d] Received data %s" % (rank, rbuf))

t1 = time.time()

print(f'Rank: {rank} -- Time: {t1-t0}')
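For comparison (not part of this commit), the receive side can overlap its busy-work with the transfer as well by posting a non-blocking Irecv before doing local work; a minimal sketch of that variant, assuming the same two-rank setup and buffer sizes:

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

if rank == 0:
    sbuf = np.random.rand(40000000)
    req = comm.Isend(sbuf, dest=1, tag=99)
    # ... local work here while the message is (possibly) in flight ...
    req.Wait()
else:
    rbuf = np.empty(40000000)
    req = comm.Irecv(rbuf, source=0, tag=99)
    # ... local work here, too ...
    req.Wait()   # rbuf is only valid after the wait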
Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
#!/bin/bash -x
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=24
#SBATCH --cpus-per-task=1
#SBATCH --output=run.out
#SBATCH --error=run.err
#SBATCH --time=00:05:00
#SBATCH --partition=devel

export HWT=1
export PIN=`./correct_pinning.sh`


#export SCOREP_ENABLE_TRACING=1
#export PATH=/p/project/ccstma/scorep/6.0-trunk-mrobefix_intel-parastation-papi/bin:$PATH
#export PATH=/p/project/ccstma/scorep/6.0-trunk-mprobefix_intel-impi-papi/bin:$PATH

#srun python -m scorep --mpp=mpi rma.py
#srun python -m scorep --mpp=mpi isend.py
#srun python -m scorep --mpp=mpi thread.py

#srun python rma.py
#srun python isend.py
#srun --cpu_bind=sockets python thread.py -n 12
#srun --cpu_bind=sockets --hint=multithread python thread.py -n 12

echo -e "\n\nDEFAULT PINNING\n---------------------------\n"
srun --label python thread.py -n 24
echo -e "\n\nSOCKET PINNING\n---------------------------\n"
srun --cpu_bind=sockets python thread.py -n 24
echo -e "\n\nBROEMMEL PINNING\n---------------------------\n"
srun $PIN --label python thread.py -n 24
#srun $PIN --label ./show_affinity_jureca.x

touch ready
Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
import mpi4py
mpi4py.rc.threaded = True
mpi4py.rc.thread_level = "funneled"
# mpi4py.rc.profile('vt-hyb', logfile='threads')

from mpi4py import MPI
from threading import Thread

MPI.COMM_WORLD.Barrier()

# Understanding the Python GIL
# David Beazley, http://www.dabeaz.com
# PyCon 2010, Atlanta, Georgia
# http://www.dabeaz.com/python/UnderstandingGIL.pdf

# Consider this trivial CPU-bound function
def countdown(n):
    while n > 0:
        n -= 1

# Run it once with a lot of work
COUNT = 10000000  # 10 million
tic = MPI.Wtime()
countdown(COUNT)
toc = MPI.Wtime()
print("sequential: %f seconds" % (toc - tic))

# Now, subdivide the work across two threads
t1 = Thread(target=countdown, args=(COUNT//2,))
t2 = Thread(target=countdown, args=(COUNT//2,))
tic = MPI.Wtime()
for t in (t1, t2): t.start()
for t in (t1, t2): t.join()
toc = MPI.Wtime()
print("threaded: %f seconds" % (toc - tic))

pySDC/playgrounds/parallel/rma.py

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
from mpi4py import MPI
import numpy as np
import time


def sleep(n):
    tmp = np.random.rand(n)


comm = MPI.COMM_WORLD
rank = comm.Get_rank()

t0 = time.time()

if rank == 0:
    # rank 0 exposes its send buffer as an RMA window
    sbuf = np.empty(40000000)
    win = MPI.Win.Create(sbuf, comm=comm)
    win.Lock(0, MPI.LOCK_EXCLUSIVE)
    sbuf[0] = 0
    sbuf[1:4] = np.random.rand(3)
    win.Unlock(0)
    sleep(100000000)
    print("[%02d] Original data %s" % (rank, sbuf))
else:
    # rank 1 exposes no memory and reads rank 0's window one-sidedly
    rbuf = np.empty(40000000)
    win = MPI.Win.Create(None, comm=comm)
    sleep(1000000)
    win.Lock(0, MPI.LOCK_EXCLUSIVE)
    win.Get(rbuf, 0)
    win.Unlock(0)
    print("[%02d] Received data %s" % (rank, rbuf))

t1 = time.time()

win.Free()

print(f'Rank: {rank} -- Time: {t1-t0}')
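For comparison (not part of this commit), the same one-sided exchange can be written with collective Win.Fence epochs instead of passive-target Lock/Unlock; a minimal sketch assuming two ranks and a small buffer:

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

buf = np.zeros(4)
# only rank 0 exposes memory, as in rma.py
win = MPI.Win.Create(buf if rank == 0 else None, comm=comm)

win.Fence()                      # open the first epoch (collective)
if rank == 0:
    buf[1:] = np.random.rand(3)  # local store into the exposed buffer
win.Fence()                      # separate the store from the remote read
if rank == 1:
    rbuf = np.empty(4)
    win.Get(rbuf, 0)
win.Fence()                      # the Get is complete after this fence
if rank == 1:
    print("[%02d] Received data %s" % (rank, rbuf))
win.Free()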
Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
from mpi4py import MPI
import numpy as np
import time

def sleep(n):
    tmp = np.random.rand(n)

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

if rank == 0:
    sbuf = np.empty(4)
    win = MPI.Win.Create(sbuf, comm=comm)
else:
    rbuf = np.empty(4)
    win = MPI.Win.Create(None, comm=comm)
    # tmp = np.random.rand(int(10000000/2))

group = win.Get_group()

t0 = time.time()

if rank == 0:
    sleep(10000000)
    # tmp = np.random.rand(100000000)
    for i in range(3):
        if i > 0:
            sleep(100000000)
            win.Wait()
        sbuf[0] = i
        sbuf[1:] = np.random.rand(3)
        print("[%02d] Original data %s" % (rank, sbuf))
        win.Post(group.Incl([1]))
    win.Wait()
else:
    # tmp = np.random.rand(10000)
    # tmp = np.random.rand(10000000)
    # tmp = np.random.rand(1)
    for i in range(3):
        win.Start(group.Excl([1]))
        win.Get(rbuf, 0)
        win.Complete()
        sleep(70000000)
        print("[%02d] Received data %s" % (rank, rbuf))

t1 = time.time()
group.Free()
win.Free()

print(f'Rank: {rank} -- Time: {t1-t0}')
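The file above interleaves the general active target synchronization with busy-work and timing. Stripped of that, the bare Post/Start/Complete/Wait handshake it uses looks roughly like this (a sketch, not part of this commit, assuming two ranks):

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

sbuf = np.arange(4, dtype='d')
win = MPI.Win.Create(sbuf if rank == 0 else None, comm=comm)
group = win.Get_group()

if rank == 0:
    win.Post(group.Incl([1]))   # expose the window to rank 1
    win.Wait()                  # block until rank 1 has completed its access epoch
else:
    rbuf = np.empty(4)
    win.Start(group.Incl([0]))  # open an access epoch towards rank 0
    win.Get(rbuf, 0)
    win.Complete()              # rbuf is valid from here on
    print("[%02d] Received data %s" % (rank, rbuf))

group.Free()
win.Free()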
Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
#!/usr/bin/env bash
# Script for a correct pinning w/ regard to hardware threads shared by a process
# using SLURM variables. Also needs an additional HWT multiplicity set in the
# environment.
# To share two hardware threads of each core for a process:
# ...
# export HWT=2
# export PIN=`correct_pinning.sh`
# ...
# srun $PIN ...
# ...

# echo settings? This will break srun integration...
ECHO=${VERBOSE:-false}
# to fix the integration, try
# export PIN=`correct_pinning.sh | grep cpu_bind`
#

# run for JURECA cluster or booster?
# JUWELS should be covered anyway.
MCA="CLS"
echo "$SLURM_JOB_PARTITION" | grep -q booster && MCA="BOO"

function print_config() {
    `$ECHO` && echo -e "\nHARDWARE CONFIG:"
    `$ECHO` && echo "cores per node: $PHYS_CORES_NODE"
    `$ECHO` && echo "CPUs per node: $SOCKETS"
    `$ECHO` && echo "cores per CPU: $PHYS_CORES_CPU"
    `$ECHO` && echo "hardware threads per core: $SMT"
    `$ECHO` && echo "hardware threads per node: $SLURM_CPUS_ON_NODE"
    `$ECHO` && echo -e "\nJOB CONFIG:"
    `$ECHO` && echo "tasks per node: $SLURM_NTASKS_PER_NODE"
    `$ECHO` && echo "hardware threads per task: $SLURM_CPUS_PER_TASK"
    `$ECHO` && echo "shared hardware threads per process: $HWT"
}

function pin_cluster() {
    SOCKETS=2
    SMT=2
    PHYS_CORES_NODE=$(($SLURM_CPUS_ON_NODE/$SMT))
    PHYS_CORES_CPU=$(($PHYS_CORES_NODE/$SOCKETS))

    print_config

    # exit straight away if we can't evenly distribute threads
    if [ $(($(($SLURM_CPUS_PER_TASK/$HWT))*$HWT)) != $SLURM_CPUS_PER_TASK ]
    then
        `$ECHO` && echo "No nice distribution of threads possible"
        exit 1
    fi

    CPUid=0
    MASK="--cpu_bind=mask_cpu:"
    # loop per process on each node
    for PROC in `seq 1 $SLURM_NTASKS_PER_NODE`
    do
        MAP=""
        `$ECHO` && echo "process $PROC"
        for CORE in `seq 1 $(($SLURM_CPUS_PER_TASK/$HWT))`
        do
            CPUid_=$CPUid
            for HW in `seq 1 $HWT`
            do
                MAP="$MAP,$CPUid_"
                ((CPUid_+=$PHYS_CORES_NODE))
            done
            ((CPUid++))
        done
        MAP_=`echo $MAP | sed 's/,/2^/' | sed 's/,/+2^/g'`
        MAP=`echo $MAP | sed 's/,//'`
        `$ECHO` && printf "map for process $PROC: %s\n" $MAP
        MASK="$MASK,0x"`echo "obase=16; $MAP_" | bc`
    done
    MASK=`echo $MASK | sed 's/:,/:/'`
    echo $MASK
}

function pin_booster() {
    SOCKETS=1
    SMT=4
    PHYS_CORES_NODE=$(($SLURM_CPUS_ON_NODE/$SMT))
    PHYS_CORES_CPU=$(($PHYS_CORES_NODE/$SOCKETS))

    print_config

    # exit straight away if we can't evenly distribute threads
    if [ $(($(($SLURM_CPUS_PER_TASK/$HWT))*$HWT)) != $SLURM_CPUS_PER_TASK ]
    then
        `$ECHO` && echo "No nice distribution of threads possible"
        exit 1
    fi

    CPUid=0
    MASK="--cpu_bind=mask_cpu:"
    # loop per process on each node
    for PROC in `seq 1 $SLURM_NTASKS_PER_NODE`
    do
        MAP=""
        `$ECHO` && echo "process $PROC"
        for CORE in `seq 1 $(($SLURM_CPUS_PER_TASK/$HWT))`
        do
            CPUid_=$CPUid
            for HW in `seq 1 $HWT`
            do
                MAP="$MAP,$CPUid_"
                ((CPUid_+=$PHYS_CORES_CPU))
            done
            ((CPUid++))
            if [ $CPUid -eq $PHYS_CORES_CPU ] && [ $HWT -eq 2 ]
            then
                ((CPUid+=$PHYS_CORES_CPU))
            fi
        done
        MAP_=`echo $MAP | sed 's/,/2^/' | sed 's/,/+2^/g'`
        MAP=`echo $MAP | sed 's/,//'`
        `$ECHO` && printf "map for process $PROC: %s\n" $MAP
        MASK="$MASK,0x"`echo "obase=16; $MAP_" | bc`
    done
    MASK=`echo $MASK | sed 's/:,/:/'`
    echo $MASK
}

if [ $MCA == "CLS" ]
then
    pin_cluster
elif [ $MCA == "BOO" ]
then
    pin_booster
fi

exit 0
Binary file (12.6 KB) not shown.
