Merge pull request #10 from gjbex/development

gjbex · web-flow · commit 7ce10fb68e18 · 2020-06-24T12:23:59.000+02:00
Various additions
diff --git a/docs/README.md b/docs/README.md
@@ -12,7 +12,7 @@ When you complete this training you will
     computing such as numpy, numexpr and numba;
   * be able to use Cython to improve your code's performance;
   * be able to wrap C, C++ and Fortran code to use it from Python;
-  * understand the opportunities and pitfalls of multithreaded
+  * understand the opportunities and pitfalls of multi-threaded
     programming with Python;
   * be able to write distributed application using MPI;
   * have an understanding of how frameworks for distributed
@@ -31,7 +31,7 @@ Total duration: 4 hours.
   | Cython                                      | 60 min.  |
   | coffee break                                | 10 min.  |
   | interfacing with C/C++/Fortran              | 30 min.  |
-  | multithreaded programming                   | 10 min.  |
+  | multi-threaded programming                  | 10 min.  |
   | MPI                                         | 45 min.  |
   | dask                                        | 15 min.  |
   | pyspark                                     | 20 min.  |
@@ -45,6 +45,12 @@ Slides are available in the
 as well as example code and hands-on material.
 
 
+## Software environment
+
+Instructions on [how to create the required software environment](software_stack.md)
+are available.
+
+
 ## Target audience
 
 This training is for you if you need to use Python for computationally
diff --git a/docs/software_stack.md b/docs/software_stack.md
@@ -0,0 +1,56 @@
+# Software stack
+
+This training requires a non-trivial software stack so using the conda package
+manager will simplify your life considerably.
+
+
+## git version control
+
+The repository for this training session is available on GitHub, and cloning this
+repository on your own machine will give you access to all training material.
+
+If you don't have a git client installed, consult the following [web page on how to
+install](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) one on
+your platform of choice.
+
+
+## Python & conda
+
+The most convenient way to install the required software is using the conda
+environment manager.  conda is available on Linux, MacOS and Windows.  You can use
+conda from the command line when you install miniconda, which is available for
+[download](https://docs.conda.io/en/latest/miniconda.html).  The website provides
+installation instructions for each platform.
+
+Remember to install miniconda on a file system with enough free space since conda
+environments quickly take multiple gigabytes of disk space.
+
+Alternatively, you can install Anaconda, a GUI application to manage Python
+environments.  For Windows, this may be the most convenient option.  Anaconda is
+available for Windows, MacOS and Linux and can be downloaded from the
+[Anaconda website](https://www.anaconda.com/products/individual).
+
+
+## Training environment
+
+To create and use the conda environment for this training, open a terminal window and
+follow the steps below.
+
+1. Clone the GitHub repository:
+   ```bash
+   $ git clone git@github.com:gjbex/Python-for-HPC.git
+   ```
+2. Change into the newly created directory:
+   ```bash
+   $ cd Python-for-HPC
+   ```
+3. Create the conda environment for this training session:
+   ```bash
+   $ conda env create -f environment.yml
+   ```
+4. Activate the environment:
+   ```bash
+   $ conda activate python_for_hpc
+   ```
+
+Now you can run Python scripts in this terminal, or start a Jupyter notebook.
diff --git a/source-code/mpi4py/analyze_mpifitness_data.py b/source-code/mpi4py/analyze_mpifitness_data.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+
+from argparse import ArgumentParser, FileType
+import numpy as np
+import re
+import sys
+
+
+def accumulate(data_file):
+    """Collect the timing samples from mpifitness output, grouped per test.
+
+    Lines are expected to look like ``<test> ... <start time>: <duration>``,
+    e.g. ``pingpong 0 -> 1 1593000000.25: 1.2e-05``.  Lines that do not match
+    this pattern, such as the ``#`` header lines, are skipped.
+
+    Parameters
+    ----------
+    data_file : iterable of str
+        Lines of mpifitness output, e.g. an open text file.
+
+    Returns
+    -------
+    dict
+        Maps each test name to a NumPy array with one row per sample;
+        column 0 holds the start time, column 1 the duration.
+    """
+    # The fractional part of the duration is optional: Python prints values
+    # such as 1e-05 without a decimal point, and those samples must not be
+    # silently dropped.
+    regex = re.compile(r'^([a-z]+).+?(\d+\.\d+):\s+'
+                       r'(\d+(?:\.\d+)?(?:e[+-]\d+)?)')
+    samples = dict()
+    for line in data_file:
+        match = regex.search(line)
+        if match:
+            test, time, duration = match.groups()
+            rows = samples.setdefault(test, list())
+            rows.append((float(time), float(duration)))
+    return {test: np.array(rows) for test, rows in samples.items()}
+
+
+def print_stats(test, data):
+    """Write a summary of the values in `data` to standard output.
+
+    The first line is the name of the test, followed by one indented line
+    each for the minimum, median, mean, maximum, standard deviation and
+    number of values.
+    """
+    def report_lines():
+        # lazy, so each statistic is computed right before it is printed
+        yield f'{test}:'
+        yield f'    min:    {data.min()}'
+        yield f'    median: {np.median(data)}'
+        yield f'    mean:   {data.mean()}'
+        yield f'    max:    {data.max()}'
+        yield f'    stddev: {np.std(data)}'
+        yield f'    n:      {len(data)}'
+
+    for report_line in report_lines():
+        print(report_line)
+
+
+def main():
+    """Print summary statistics of the durations of each test in a data file.
+
+    The file to analyse is taken from the command line.
+
+    Returns
+    -------
+    int
+        0, used as the exit status of the script.
+    """
+    arg_parser = ArgumentParser(description='analyze mpifitness data')
+    arg_parser.add_argument('file', type=FileType('r'), help='file to analyse')
+    options = arg_parser.parse_args()
+    # argparse opens the file but never closes it, so close it explicitly as
+    # soon as its contents have been parsed
+    with options.file as data_file:
+        timings = accumulate(data_file)
+    for test, data_list in timings.items():
+        # column 1 holds the durations, column 0 the start times
+        print_stats(test, data_list[:, 1])
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/source-code/mpi4py/mpifitness.py b/source-code/mpi4py/mpifitness.py
@@ -18,11 +18,11 @@ def acknowledge(comm):
     comm.barrier()
     rank = comm.Get_rank()
     size = comm.Get_size()
-    print(f'process {rank} out of {size}')
+    print(f'acknowledge {rank} out of {size}')
     comm.barrier()
 
 
-def pingpong(comm, nr_iters, msg_size):
+def pingpong(comm, nr_iters, msg_size, file):
     comm.barrier()
     rank = comm.Get_rank()
     size = comm.Get_size()
@@ -39,7 +39,8 @@ def pingpong(comm, nr_iters, msg_size):
                             print(f'{rank} received {msg}, expected {dest}',
                                   file=sys.stderr)
                             comm.Abort(1)
-                        print(f'{rank} -> {dest} pingpong: {end_time - start_time}')
+                        print(f'pingpong {rank} -> {dest} {start_time}: '
+                              f'{end_time - start_time}', file=file)
                     if rank == dest:
                         start_time = time.time()
                         msg = comm.recv(source=source)
@@ -49,11 +50,12 @@ def pingpong(comm, nr_iters, msg_size):
                             print(f'{rank} received {msg}, expected {source}',
                                   file=sys.stderr)
                             comm.Abort(1)
-                        print(f'{rank} -> {source} pingpong: {end_time - start_time}')
+                        print(f'pingpong {rank} -> {source} {start_time}: '
+                              f'{end_time - start_time}', file=file)
     comm.barrier()
                     
 
-def broadcast(comm, nr_iters, msg_size):
+def broadcast(comm, nr_iters, msg_size, file):
     comm.barrier()
     rank = comm.Get_rank()
     size = comm.Get_size()
@@ -65,14 +67,15 @@ def broadcast(comm, nr_iters, msg_size):
             start_time = time.time()
             msg = comm.bcast(msg, root=root)
             end_time = time.time()
-            print(f'{root} -> {rank} bcast: {end_time - start_time}')
+            print(f'bcast {root} -> {rank} {start_time}: {end_time - start_time}',
+                  file=file)
             if msg != make_msg(root, msg_size):
                 print(f'{rank} received unexpected bcast message')
                 comm.Abort(2)
     comm.barrier()
 
 
-def scatter(comm, nr_iters, msg_size):
+def scatter(comm, nr_iters, msg_size, file):
     comm.barrier()
     rank = comm.Get_rank()
     size = comm.Get_size()
@@ -84,14 +87,15 @@ def scatter(comm, nr_iters, msg_size):
             start_time = time.time()
             msg = comm.scatter(msg, root=root)
             end_time = time.time()
-            print(f'{root} -> {rank} scatter: {end_time - start_time}')
+            print(f'scatter {root} -> {rank} {start_time}: {end_time - start_time}',
+                  file=file)
             if msg != make_msg(rank, msg_size):
                 print(f'{rank} received unexpected scatter message')
                 comm.Abort(2)
     comm.barrier()
 
 
-def gather(comm, nr_iters, msg_size):
+def gather(comm, nr_iters, msg_size, file):
     comm.barrier()
     rank = comm.Get_rank()
     size = comm.Get_size()
@@ -101,7 +105,8 @@ def gather(comm, nr_iters, msg_size):
             start_time = time.time()
             msg = comm.gather(msg, root=root)
             end_time = time.time()
-            print(f'{root} -> {rank} gather: {end_time - start_time}')
+            print(f'gather {root} -> {rank} {start_time}: {end_time - start_time}',
+                  file=file)
             if (rank == root):
                 if len(msg) != size:
                     print(f'{rank} received unexpected gather message')
@@ -113,7 +118,7 @@ def gather(comm, nr_iters, msg_size):
     comm.barrier()
 
 
-def alltoall(comm, nr_iters, msg_size):
+def alltoall(comm, nr_iters, msg_size, file):
     comm.barrier()
     rank = comm.Get_rank()
     size = comm.Get_size()
@@ -122,7 +127,7 @@ def alltoall(comm, nr_iters, msg_size):
         start_time = time.time()
         msg = comm.alltoall(msg)
         end_time = time.time()
-        print(f'{rank} alltoall: {end_time - start_time}')
+        print(f'alltoall {rank} {start_time}: {end_time - start_time}', file=file)
         if len(msg) != size:
             print(f'{rank} received unexpected alltoall message')
             comm.Abort(2)
@@ -133,7 +138,7 @@ def alltoall(comm, nr_iters, msg_size):
     comm.barrier()
 
 
-def reduce(comm, nr_iters, msg_size):
+def reduce(comm, nr_iters, msg_size, file):
     comm.barrier()
     rank = comm.Get_rank()
     size = comm.Get_size()
@@ -143,7 +148,8 @@ def reduce(comm, nr_iters, msg_size):
             start_time = time.time()
             msg = comm.reduce(msg, op=MPI.SUM, root=root)
             end_time = time.time()
-            print(f'{root} -> {rank} reduce: {end_time - start_time}')
+            print(f'reduce {root} -> {rank} {start_time}: {end_time - start_time}',
+                  file=file)
     comm.barrier()
 
 
@@ -155,6 +161,7 @@ def main():
         print(f'# acknowledgment')
     acknowledge(comm)
     arg_parser = ArgumentParser(description='MPI performance benchmark')
+    arg_parser.add_argument('file_base', help='base file name for performance info')
     arg_parser.add_argument('--nr_pingpongs', type=int, default=10,
                             help='number of ping-pong iterations to perform')
     arg_parser.add_argument('--pingpong_size', type=int, default=8,
@@ -180,43 +187,44 @@ def main():
     arg_parser.add_argument('--reduce_size', type=int, default=8,
                             help='number of bytes for reduce message')
     options = arg_parser.parse_args()
-    comm.barrier()
-    if (rank == root):
-        print(f'# {options.nr_pingpongs} ping-pong iterations, '
-              f'size {options.pingpong_size}')
-    comm.barrier()
-    pingpong(comm, options.nr_pingpongs, options.pingpong_size)
-    comm.barrier()
-    if (rank == root):
-        print(f'# {options.nr_bcasts} broadcast iterations, '
-              f'size {options.bcast_size}')
-    comm.barrier()
-    broadcast(comm, options.nr_bcasts, options.bcast_size)
-    comm.barrier()
-    if (rank == root):
-        print(f'# {options.nr_scatters} scatter iterations, '
-              f'size {options.scatter_size}')
-    comm.barrier()
-    scatter(comm, options.nr_scatters, options.scatter_size)
-    comm.barrier()
-    if (rank == root):
-        print(f'# {options.nr_gathers} gather iterations, '
-              f'size {options.gather_size}')
-    comm.barrier()
-    gather(comm, options.nr_gathers, options.gather_size)
-    comm.barrier()
-    if (rank == root):
-        print(f'# {options.nr_alltoalls} alltoall iterations, '
-              f'size {options.alltoall_size}')
-    comm.barrier()
-    alltoall(comm, options.nr_alltoalls, options.alltoall_size)
-    comm.barrier()
-    if (rank == root):
-        print(f'# {options.nr_reduces} reduce iterations, '
-              f'size {options.reduce_size}')
-    comm.barrier()
-    reduce(comm, options.nr_reduces, options.reduce_size)
-    comm.barrier()
+    with open(f'{options.file_base}_{rank:04d}.txt', 'w') as file:
+        comm.barrier()
+        if (rank == root):
+            print(f'# {options.nr_pingpongs} ping-pong iterations, '
+                  f'size {options.pingpong_size}', file=file)
+        comm.barrier()
+        pingpong(comm, options.nr_pingpongs, options.pingpong_size, file)
+        comm.barrier()
+        if (rank == root):
+            print(f'# {options.nr_bcasts} broadcast iterations, '
+                  f'size {options.bcast_size}', file=file)
+        comm.barrier()
+        broadcast(comm, options.nr_bcasts, options.bcast_size, file)
+        comm.barrier()
+        if (rank == root):
+            print(f'# {options.nr_scatters} scatter iterations, '
+                  f'size {options.scatter_size}', file=file)
+        comm.barrier()
+        scatter(comm, options.nr_scatters, options.scatter_size, file)
+        comm.barrier()
+        if (rank == root):
+            print(f'# {options.nr_gathers} gather iterations, '
+                  f'size {options.gather_size}', file=file)
+        comm.barrier()
+        gather(comm, options.nr_gathers, options.gather_size, file)
+        comm.barrier()
+        if (rank == root):
+            print(f'# {options.nr_alltoalls} alltoall iterations, '
+                  f'size {options.alltoall_size}', file=file)
+        comm.barrier()
+        alltoall(comm, options.nr_alltoalls, options.alltoall_size, file)
+        comm.barrier()
+        if (rank == root):
+            print(f'# {options.nr_reduces} reduce iterations, '
+                  f'size {options.reduce_size}', file=file)
+        comm.barrier()
+        reduce(comm, options.nr_reduces, options.reduce_size, file)
+        comm.barrier()
     return 0
 
 
diff --git a/source-code/mpi4py/plot_time_distribution.py b/source-code/mpi4py/plot_time_distribution.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+
+from argparse import ArgumentParser, FileType
+import matplotlib.pyplot as plt
+import seaborn as sns
+import sys
+from analyze_mpifitness_data import accumulate
+
+
+def main():
+    """Plot the distribution of the durations measured for a single test.
+
+    The data file, the test to visualize and the plot options are taken from
+    the command line; the figure is displayed with ``plt.show()``.
+
+    Returns
+    -------
+    int
+        0, used as the exit status of the script.
+    """
+    arg_parser = ArgumentParser(description='plot MPI time distribution')
+    arg_parser.add_argument('--file', required=True, type=FileType('r'),
+                            help='file to plot data from')
+    arg_parser.add_argument('--test', required=True,
+                            choices=['pingpong', 'bcast', 'scatter', 'gather',
+                                     'alltoall', 'reduce'],
+                            help='test to visualize')
+    arg_parser.add_argument('--bins', type=int, default=5,
+                            help='number of bins in histogram')
+    arg_parser.add_argument('--rug', action='store_true', help='show rug')
+    arg_parser.add_argument('--log', action='store_true', help='use log x-axis')
+    options = arg_parser.parse_args()
+    # argparse opens the file but never closes it, so close it explicitly as
+    # soon as its contents have been parsed
+    with options.file as data_file:
+        timings = accumulate(data_file)
+    if options.test not in timings:
+        # a valid test name may still be absent from this particular file;
+        # report that as a usage error rather than failing with a KeyError
+        arg_parser.error(f'no {options.test} timings found in '
+                         f'{options.file.name}')
+    # column 1 holds the durations, column 0 the start times
+    data = timings[options.test][:, 1]
+    # NOTE(review): distplot is deprecated in newer seaborn releases; check
+    # the seaborn version of the training environment before upgrading
+    if options.rug:
+        grid = sns.distplot(data, rug=True, hist=False)
+    else:
+        grid = sns.distplot(data, bins=options.bins)
+    if options.log:
+        grid.set(xscale='log')
+    grid.set(title=options.test)
+    plt.show()
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/source-code/mpi4py/plot_timeline.py b/source-code/mpi4py/plot_timeline.py