Add profiling docs with perf, gperftools, VTune

robertodr · robertodr · commit d840bc514dbc · 2018-03-27T10:17:00.000-04:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -34,6 +34,7 @@ include(autocmake_ccache)
 include(windows)
 include(autocmake_definitions)
 include(code_coverage)
+include(gperftools)
 include(autocmake_int64)
 include(autocmake_omp)
 include(autocmake_safeguards)
diff --git a/cmake/autocmake.yml b/cmake/autocmake.yml
@@ -28,6 +28,7 @@ modules:
       - 'custom/windows.cmake'
       - '%(url_root)modules/definitions.cmake'
       - 'custom/code_coverage.cmake'
+      - 'custom/gperftools.cmake'
       - '%(url_root)modules/int64.cmake'
       - '%(url_root)modules/omp.cmake'
       - '%(url_root)modules/safeguards.cmake'
diff --git a/cmake/custom/FindLibunwind.cmake b/cmake/custom/FindLibunwind.cmake
@@ -0,0 +1,58 @@
+# This file is part of MADNESS
+# https://github.com/m-a-d-n-e-s-s/madness/blob/master/cmake/modules/FindLibunwind.cmake
+#
+# - Try to find Libunwind
+# Input variables:
+#  LIBUNWIND_ROOT_DIR     - The libunwind install directory
+#  LIBUNWIND_INCLUDE_DIR  - The libunwind include directory
+#  LIBUNWIND_LIBRARY      - The libunwind library directory
+# Output variables:
+#  LIBUNWIND_FOUND        - System has libunwind
+#  LIBUNWIND_INCLUDE_DIRS - The libunwind include directories
+#  LIBUNWIND_LIBRARIES    - The libraries needed to use libunwind
+#  LIBUNWIND_VERSION      - The version string for libunwind
+
+include(FindPackageHandleStandardArgs)
+
+if(NOT DEFINED LIBUNWIND_FOUND)
+
+  # Derive default search paths for libunwind from LIBUNWIND_ROOT_DIR, when it is set
+  if(LIBUNWIND_ROOT_DIR)
+    set(LIBUNWIND_INCLUDE_DIR ${LIBUNWIND_ROOT_DIR}/include CACHE PATH "The include directory for libunwind")
+    if(CMAKE_SIZEOF_VOID_P EQUAL 8 AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
+      set(LIBUNWIND_LIBRARY ${LIBUNWIND_ROOT_DIR}/lib64;${LIBUNWIND_ROOT_DIR}/lib CACHE PATH "The library directory for libunwind")
+    else()
+      set(LIBUNWIND_LIBRARY ${LIBUNWIND_ROOT_DIR}/lib CACHE PATH "The library directory for libunwind")
+    endif()
+  endif()
+
+  find_path(LIBUNWIND_INCLUDE_DIRS NAMES libunwind.h
+      HINTS ${LIBUNWIND_INCLUDE_DIR})
+
+  find_library(LIBUNWIND_LIBRARIES unwind
+      HINTS ${LIBUNWIND_LIBRARY})
+
+  # Assemble LIBUNWIND_VERSION from the UNW_VERSION_{MAJOR,MINOR,EXTRA} macros in libunwind-common.h
+  if(EXISTS "${LIBUNWIND_INCLUDE_DIRS}/libunwind-common.h")
+    file(READ "${LIBUNWIND_INCLUDE_DIRS}/libunwind-common.h" _libunwind_version_header)
+    string(REGEX REPLACE ".*define[ \t]+UNW_VERSION_MAJOR[ \t]+([0-9]+).*" "\\1"
+        LIBUNWIND_MAJOR_VERSION "${_libunwind_version_header}")
+    string(REGEX REPLACE ".*define[ \t]+UNW_VERSION_MINOR[ \t]+([0-9]+).*" "\\1"
+        LIBUNWIND_MINOR_VERSION "${_libunwind_version_header}")
+    string(REGEX REPLACE ".*define[ \t]+UNW_VERSION_EXTRA[ \t]+([0-9]+).*" "\\1"
+        LIBUNWIND_MICRO_VERSION "${_libunwind_version_header}")
+    set(LIBUNWIND_VERSION "${LIBUNWIND_MAJOR_VERSION}.${LIBUNWIND_MINOR_VERSION}.${LIBUNWIND_MICRO_VERSION}")
+    unset(_libunwind_version_header)
+  endif()
+
+  # Handle the QUIETLY and REQUIRED arguments and set LIBUNWIND_FOUND to TRUE
+  # if all listed variables are TRUE
+  find_package_handle_standard_args(Libunwind
+      FOUND_VAR LIBUNWIND_FOUND
+      VERSION_VAR LIBUNWIND_VERSION
+      REQUIRED_VARS LIBUNWIND_LIBRARIES LIBUNWIND_INCLUDE_DIRS)
+
+  mark_as_advanced(LIBUNWIND_INCLUDE_DIR LIBUNWIND_LIBRARY
+      LIBUNWIND_INCLUDE_DIRS LIBUNWIND_LIBRARIES)
+
+endif()
diff --git a/cmake/custom/gperftools.cmake b/cmake/custom/gperftools.cmake
@@ -1,17 +1,19 @@
-if(ENABLE_GPERFTOOLS OR ENABLE_TCMALLOC_MINIMAL)
+#.rst:
+#
+# Enable profiling with gperftools.
+#
+# Variables used::
+#
+#   ENABLE_GPERFTOOLS
+#
+# autocmake.yml configuration::
+#
+#   docopt: "--gperf Enable profiling with gperftools [default: False]."
+#   define: "'-DENABLE_GPERFTOOLS={0}'.format(arguments['--gperf'])"
 
-  if(ENABLE_GPERFTOOLS)
-    find_package(Gperftools COMPONENTS tcmalloc OPTIONAL_COMPONENTS profiler)
-  else()
-    find_package(Gperftools REQUIRED COMPONENTS tcmalloc_minimal)
-  endif()
-
-  # Set the config.h variables
-  if(GPERFTOOLS_FOUND AND ENABLE_TCMALLOC_MINIMAL)
-    set(MADNESS_HAS_GOOGLE_PERF_MINIMAL 1)
-  endif()
-  if(LIBUNWIND_FOUND)
-    set(MADNESS_HAS_LIBUNWIND 1)
-  endif()
+option_with_print(ENABLE_GPERFTOOLS "Enable profiling with gperftools" OFF)
 
+if(ENABLE_GPERFTOOLS)
+  message(STATUS "Linking against gperftools libraries for profiling")
+  find_package(Gperftools REQUIRED COMPONENTS tcmalloc profiler)  # profiling was requested explicitly: stop configuring if the libraries are missing
 endif()
diff --git a/default.nix b/default.nix
@@ -34,9 +34,7 @@ in
       doxygen
       exa
       ffmpeg
-      flameGraph
       gfortran
-      gperftools
       graphviz
       lcov
       pipenv
diff --git a/doc/programmers/profiling.rst b/doc/programmers/profiling.rst
@@ -0,0 +1,173 @@
+Profiling
+---------
+
+You should obtain profiling information before attempting any optimization of
+the code. There are many ways of obtaining this information, but we have only
+experimented with the following:
+
+#. Using Linux ``perf`` and related `tools <http://www.brendangregg.com/perf.html>`_.
+#. Using ``gperftools``.
+#. Using Intel VTune.
+
+Profiling should be done using the standalone executable ``run_pcm`` and any of
+the input files gathered under the ``tests/benchmark`` directory. These files
+are copied to the build directory. If you are lazy, you can run the profiling
+from the build directory:
+
+.. code-block:: bash
+
+   >>> cd tests/benchmark
+
+   >>> env PYTHONPATH=<build_dir>/lib64/python:$PYTHONPATH \
+          python <build_dir>/bin/go_pcm.py --inp=standalone.pcm --exe=<build_dir>/bin
+
+Using ``perf``
+==============
+
+``perf`` is a tool available on Linux. Though part of the kernel tools, it is
+not usually preinstalled on most Linux distributions. For visualization
+purposes we also need `additional tools <https://github.com/brendangregg/perf-tools>`_,
+in particular the `flame graph generation scripts <https://github.com/brendangregg/FlameGraph>`_.
+Probably your distribution has them prepackaged already.
+``perf`` will trace all CPU events on your system, hence you might need to
+fiddle with some kernel setup files to get permissions to trace events.
+
+.. note::
+   ``perf`` **is NOT** available on ``stallo``. Even if it were, you would
+   probably not have permissions to record kernel traces.
+
+These are the instructions I used:
+
+1. Trace execution. This will save CPU stack traces to a ``perf.data`` file.
+   Successive runs do not overwrite this file.
+
+   .. code-block:: bash
+
+      >>> cd tests/benchmark
+
+      >>> perf record -F 99 -g -- env PYTHONPATH=<build_dir>/lib64/python:$PYTHONPATH python \
+                    <build_dir>/bin/go_pcm.py --inp=standalone.pcm --exe=<build_dir>/bin
+
+2. Get reports. There are different ways of getting a report from the
+   ``perf.data`` file. The following will generate a call tree.
+
+   .. code-block:: bash
+
+      >>> perf report --stdio
+
+3. Generate an interactive flame graph.
+
+   .. code-block:: bash
+
+      >>> perf script | stackcollapse-perf.pl > out.perf-folded
+
+      >>> cat out.perf-folded | flamegraph.pl > perf-run_pcm.svg
+
+Using ``gperftools``
+====================
+
+This set of tools was previously known as Google Performance Tools. The
+executable needs to be linked against the ``profiler``, ``tcmalloc``
+and ``unwind`` libraries.
+CMake will attempt to find them. If this fails, you will have to install them:
+either check whether they are available for your distribution or compile them
+from source.
+In principle, one could use the ``LD_PRELOAD`` mechanism to skip the *ad hoc*
+compilation of the executable.
+
+.. note::
+   ``gperftools`` **is** available on ``stallo``, but it's an ancient version.
+
+1. Configure the code with the ``--gperf`` option enabled. CPU and heap
+   profiling, together with heap checking, will be available.
+
+2. CPU profiling can be done with the following command:
+
+   .. code-block:: bash
+
+      >>> env CPUPROFILE=run_pcm.cpu.prof PYTHONPATH=<build_dir>/lib64/python:$PYTHONPATH \
+              python <build_dir>/bin/go_pcm.py --inp=standalone.pcm --exe=<build_dir>/bin
+
+   This will save the data to the ``run_pcm.cpu.prof`` file. To analyze the gathered
+   data we can use the ``pprof`` script:
+
+   .. code-block:: bash
+
+      >>> pprof --text <build_dir>/bin/run_pcm run_pcm.cpu.prof
+
+   This will print a table. Any row will look like the following:
+
+   .. code-block:: bash
+
+      2228   7.2%  24.8%    28872  93.4% pcm::utils::splineInterpolation
+
+   where the columns respectively report:
+
+   #. Number of profiling samples in this function.
+   #. Percentage of profiling samples in this function.
+   #. Percentage of profiling samples in the functions printed so far.
+   #. Number of profiling samples in this function and its callees.
+   #. Percentage of profiling samples in this function and its callees.
+   #. Function name.
+
+   For more details look `here <https://gperftools.github.io/gperftools/cpuprofile.html>`_.
+
+3. Heap profiling can be done with the following command:
+
+   .. code-block:: bash
+
+      >>> env HEAPPROFILE=run_pcm.hprof PYTHONPATH=<build_dir>/lib64/python:$PYTHONPATH \
+              python <build_dir>/bin/go_pcm.py --inp=standalone.pcm --exe=<build_dir>/bin
+
+   This will output a series of datafiles ``run_pcm.hprof.0000.heap``,
+   ``run_pcm.hprof.0001.heap`` and so forth. You will have to kill execution
+   when enough samples have been collected.
+   Analysis of the heap profiling data can be done using ``pprof``. `Read more
+   here <https://gperftools.github.io/gperftools/heapprofile.html>`_.
+
+
+Using Intel VTune
+=================
+
+This is probably the easiest way to profile the code.
+`VTune <https://software.intel.com/en-us/intel-vtune-amplifier-xe>`_ is Intel software; it might be possible to get a personal, free license.
+The instructions will hold on any machine where VTune is installed and you can
+look for more details in the `online documentation <https://software.intel.com/en-us/vtune-amplifier-help>`_.
+You can, in principle, use the GUI. I haven't managed to do that though.
+
+On ``stallo``, start an interactive job and load the following modules:
+
+.. code-block:: bash
+
+   >>> module load intel/2018a
+
+   >>> module load CMake
+
+   >>> module load VTune
+
+   >>> export BOOST_INCLUDEDIR=/home/roberto/Software/boost/include
+
+   >>> export BOOST_LIBRARYDIR=/home/roberto/Software/boost/lib
+
+You will need to compile with optimizations activated, *i.e.* release mode.
+It is better to first parse the input file and then call ``run_pcm``:
+
+.. code-block:: bash
+
+   >>> cd <build_dir>/tests/benchmark
+
+   >>> env PYTHONPATH=../../lib64/python:$PYTHONPATH \
+       python ../../bin/go_pcm.py --inp=standalone_bubble.pcm
+
+To start collecting hotspots:
+
+.. code-block:: bash
+
+   >>> amplxe-cl -collect hotspots ../../bin/run_pcm @standalone_bubble.pcm
+
+VTune will generate a folder ``r000hs`` with the collected results. A report
+for the hotspots can be generated with:
+
+.. code-block:: bash
+
+   >>> amplxe-cl -report hotspots -r r000hs > report
diff --git a/doc/programmers/programmers-manual.rst b/doc/programmers/programmers-manual.rst
@@ -10,5 +10,6 @@ PCMSolver Programmers' Manual
    cmake-usage
    versioning
    maintenance
+   profiling
    testing
    timer-class
diff --git a/setup.py b/setup.py
@@ -11,7 +11,6 @@
 from autocmake import configure
 from autocmake.external import docopt
 
-
 options = """
 Usage:
   ./setup.py [options] [<builddir>]
@@ -27,6 +26,7 @@
   --ccache=<USE_CCACHE>                  Toggle use of ccache <ON/OFF> [default: ON].
   --add-definitions=<STRING>             Add preprocesor definitions [default: ''].
   --coverage                             Enable code coverage [default: OFF].
+  --gperf                                Enable profiling with gperftools [default: False].
   --int64                                Enable 64bit integers [default: False].
   --omp                                  Enable OpenMP parallelization [default: False].
   --python=<PYTHON_INTERPRETER>          The Python interpreter (development version) to use. [default: ''].
@@ -53,12 +53,16 @@ def gen_cmake_command(options, arguments):
     """
     command = []
     command.append(arguments['--cmake-executable'])
-    command.append('-DCMAKE_Fortran_COMPILER={0} -DEXTRA_FCFLAGS="{1}"'.format(arguments['--fc'], arguments['--extra-fc-flags']))
-    command.append('-DCMAKE_C_COMPILER={0} -DEXTRA_CFLAGS="{1}"'.format(arguments['--cc'], arguments['--extra-cc-flags']))
-    command.append('-DCMAKE_CXX_COMPILER={0} -DEXTRA_CXXFLAGS="{1}"'.format(arguments['--cxx'], arguments['--extra-cxx-flags']))
+    command.append('-DCMAKE_Fortran_COMPILER={0} -DEXTRA_FCFLAGS="{1}"'.format(arguments['--fc'],
+                                                                               arguments['--extra-fc-flags']))
+    command.append('-DCMAKE_C_COMPILER={0} -DEXTRA_CFLAGS="{1}"'.format(arguments['--cc'],
+                                                                        arguments['--extra-cc-flags']))
+    command.append('-DCMAKE_CXX_COMPILER={0} -DEXTRA_CXXFLAGS="{1}"'.format(arguments['--cxx'],
+                                                                            arguments['--extra-cxx-flags']))
     command.append('-DUSE_CCACHE={0}'.format(arguments['--ccache']))
     command.append('-DPREPROCESSOR_DEFINITIONS="{0}"'.format(arguments['--add-definitions']))
     command.append('-DENABLE_CODE_COVERAGE={0}'.format(arguments['--coverage']))
+    command.append('-DENABLE_GPERFTOOLS={0}'.format(arguments['--gperf']))
     command.append('-DENABLE_64BIT_INTEGERS={0}'.format(arguments['--int64']))
     command.append('-DENABLE_OPENMP={0}'.format(arguments['--omp']))
     command.append('-DPYTHON_INTERPRETER="{0}"'.format(arguments['--python']))
@@ -88,22 +92,17 @@ def gen_cmake_command(options, arguments):
     sys.stderr.write(options)
     sys.exit(-1)
 
-
 # use extensions to validate/post-process args
 if configure.module_exists('extensions'):
     import extensions
     arguments = extensions.postprocess_args(sys.argv, arguments)
 
-
 root_directory = os.path.dirname(os.path.realpath(__file__))
 
-
 build_path = arguments['<builddir>']
 
-
 # create cmake command
 cmake_command = '{0} -H{1}'.format(gen_cmake_command(options, arguments), root_directory)
 
-
 # run cmake
 configure.configure(root_directory, build_path, cmake_command, arguments['--show'])
diff --git a/src/bin/CMakeLists.txt b/src/bin/CMakeLists.txt
@@ -12,6 +12,9 @@ if(STATIC_LIBRARY_ONLY)
 else()
   target_link_libraries(run_pcm pcm-shared)
 endif()
+if(ENABLE_GPERFTOOLS)
+  target_link_libraries(run_pcm ${GPERFTOOLS_LIBRARIES})
+endif()
 target_compile_options(run_pcm
   PRIVATE
     "$<$<CONFIG:DEBUG>:${EXDIAG_CXX_FLAGS}>"
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -20,6 +20,7 @@ add_subdirectory(bi_operators)
 add_subdirectory(cpcm)
 add_subdirectory(iefpcm)
 add_subdirectory(utils)
+add_subdirectory(benchmark)
 
 add_executable(unit_tests unit_tests.cpp
   $<TARGET_OBJECTS:bi_operators-tests>
diff --git a/tests/benchmark/CMakeLists.txt b/tests/benchmark/CMakeLists.txt