Skip to content

Commit d840bc5

Browse files
committed
Add profiling docs with perf, gperftools, VTune
1 parent 3ff20c9 commit d840bc5

File tree

11 files changed

+262
-74
lines changed

11 files changed

+262
-74
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ include(autocmake_ccache)
3434
include(windows)
3535
include(autocmake_definitions)
3636
include(code_coverage)
37+
include(gperftools)
3738
include(autocmake_int64)
3839
include(autocmake_omp)
3940
include(autocmake_safeguards)

cmake/autocmake.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ modules:
2828
- 'custom/windows.cmake'
2929
- '%(url_root)modules/definitions.cmake'
3030
- 'custom/code_coverage.cmake'
31+
- 'custom/gperftools.cmake'
3132
- '%(url_root)modules/int64.cmake'
3233
- '%(url_root)modules/omp.cmake'
3334
- '%(url_root)modules/safeguards.cmake'

cmake/custom/FindLibunwind.cmake

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# This file is part of MADNESS
2+
# https://github.com/m-a-d-n-e-s-s/madness/blob/master/cmake/modules/FindLibunwind.cmake
3+
#
4+
# - Try to find Libunwind
5+
# Input variables:
6+
# LIBUNWIND_ROOT_DIR - The libunwind install directory
7+
# LIBUNWIND_INCLUDE_DIR - The libunwind include directory
8+
# LIBUNWIND_LIBRARY - The libunwind library directory
9+
# Output variables:
10+
# LIBUNWIND_FOUND - System has libunwind
11+
# LIBUNWIND_INCLUDE_DIRS - The libunwind include directories
12+
# LIBUNWIND_LIBRARIES - The libraries needed to use libunwind
13+
# LIBUNWIND_VERSION - The version string for libunwind
14+
15+
include(FindPackageHandleStandardArgs)
16+
17+
if(NOT DEFINED LIBUNWIND_FOUND)
18+
19+
# Set default search paths for libunwind
20+
if(LIBUNWIND_ROOT_DIR)
21+
set(LIBUNWIND_INCLUDE_DIR ${LIBUNWIND_ROOT_DIR}/include CACHE PATH "The include directory for libunwind")
22+
if(CMAKE_SIZEOF_VOID_P EQUAL 8 AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
23+
set(LIBUNWIND_LIBRARY ${LIBUNWIND_ROOT_DIR}/lib64;${LIBUNWIND_ROOT_DIR}/lib CACHE PATH "The library directory for libunwind")
24+
else()
25+
set(LIBUNWIND_LIBRARY ${LIBUNWIND_ROOT_DIR}/lib CACHE PATH "The library directory for libunwind")
26+
endif()
27+
endif()
28+
29+
find_path(LIBUNWIND_INCLUDE_DIRS NAMES libunwind.h
30+
HINTS ${LIBUNWIND_INCLUDE_DIR})
31+
32+
find_library(LIBUNWIND_LIBRARIES unwind
33+
HINTS ${LIBUNWIND_LIBRARY})
34+
35+
# Get libunwind version
36+
if(EXISTS "${LIBUNWIND_INCLUDE_DIRS}/libunwind-common.h")
37+
file(READ "${LIBUNWIND_INCLUDE_DIRS}/libunwind-common.h" _libunwind_version_header)
38+
string(REGEX REPLACE ".*define[ \t]+UNW_VERSION_MAJOR[ \t]+([0-9]+).*" "\\1"
39+
LIBUNWIND_MAJOR_VERSION "${_libunwind_version_header}")
40+
string(REGEX REPLACE ".*define[ \t]+UNW_VERSION_MINOR[ \t]+([0-9]+).*" "\\1"
41+
LIBUNWIND_MINOR_VERSION "${_libunwind_version_header}")
42+
string(REGEX REPLACE ".*define[ \t]+UNW_VERSION_EXTRA[ \t]+([0-9]+).*" "\\1"
43+
LIBUNWIND_MICRO_VERSION "${_libunwind_version_header}")
44+
set(LIBUNWIND_VERSION "${LIBUNWIND_MAJOR_VERSION}.${LIBUNWIND_MINOR_VERSION}.${LIBUNWIND_MICRO_VERSION}")
45+
unset(_libunwind_version_header)
46+
endif()
47+
48+
# handle the QUIETLY and REQUIRED arguments and set LIBUNWIND_FOUND to TRUE
49+
# if all listed variables are TRUE
50+
find_package_handle_standard_args(Libunwind
51+
FOUND_VAR LIBUNWIND_FOUND
52+
VERSION_VAR LIBUNWIND_VERSION
53+
REQUIRED_VARS LIBUNWIND_LIBRARIES LIBUNWIND_INCLUDE_DIRS)
54+
55+
mark_as_advanced(LIBUNWIND_INCLUDE_DIR LIBUNWIND_LIBRARY
56+
LIBUNWIND_INCLUDE_DIRS LIBUNWIND_LIBRARIES)
57+
58+
endif()

cmake/custom/gperftools.cmake

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,19 @@
1-
if(ENABLE_GPERFTOOLS OR ENABLE_TCMALLOC_MINIMAL)
1+
#.rst:
2+
#
3+
# Enable profiling with gperftools.
4+
#
5+
# Variables used::
6+
#
7+
# ENABLE_GPERFTOOLS
8+
#
9+
# autocmake.yml configuration::
10+
#
11+
# docopt: "--gperf Enable profiling with gperftools [default: False]."
12+
# define: "'-DENABLE_GPERFTOOLS={0}'.format(arguments['--gperf'])"
213

3-
if(ENABLE_GPERFTOOLS)
4-
find_package(Gperftools COMPONENTS tcmalloc OPTIONAL_COMPONENTS profiler)
5-
else()
6-
find_package(Gperftools REQUIRED COMPONENTS tcmalloc_minimal)
7-
endif()
8-
9-
# Set the config.h variables
10-
if(GPERFTOOLS_FOUND AND ENABLE_TCMALLOC_MINIMAL)
11-
set(MADNESS_HAS_GOOGLE_PERF_MINIMAL 1)
12-
endif()
13-
if(LIBUNWIND_FOUND)
14-
set(MADNESS_HAS_LIBUNWIND 1)
15-
endif()
14+
option_with_print(ENABLE_GPERFTOOLS "Enable profiling with gperftools" OFF)
1615

16+
if(ENABLE_GPERFTOOLS)
17+
message(STATUS "Linking against gperftools libraries for profiling")
18+
find_package(Gperftools COMPONENTS tcmalloc profiler)
1719
endif()

default.nix

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,7 @@ in
3434
doxygen
3535
exa
3636
ffmpeg
37-
flameGraph
3837
gfortran
39-
gperftools
4038
graphviz
4139
lcov
4240
pipenv

doc/programmers/profiling.rst

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
Profiling
2+
---------
3+
4+
You should obtain profiling information before attempting any optimization of
5+
the code. There are many ways of obtaining this information, but we have only
6+
experimented with the following:
7+
8+
#. Using Linux ``perf`` and related `tools <http://www.brendangregg.com/perf.html>`_.
9+
#. Using ``gperftools``.
10+
#. Using Intel VTune.
11+
12+
Profiling should be done using the standalone executable ``run_pcm`` and any of
13+
the input files gathered under the ``tests/benchmark`` directory. These files
14+
are copied to the build directory. If you are lazy, you can run the profiling
15+
from the build directory:
16+
17+
.. code-block:: bash
18+
19+
>>> cd tests/benchmark
20+
21+
>>> env PYTHONPATH=<build_dir>/lib64/python:$PYTHONPATH
22+
python <build_dir>/bin/go_pcm.py --inp=standalone.pcm --exe=<build_dir>/bin
23+
24+
Using ``perf``
25+
==============
26+
27+
``perf`` is a tool available on Linux. Though part of the kernel tools, it is
28+
not usually preinstalled on most Linux distributions. For visualization
29+
purposes we also need `additional tools <https://github.com/brendangregg/perf-tools>`_,
30+
in particular the `flame graph generation scripts <https://github.com/brendangregg/FlameGraph>`_.
31+
Probably your distribution has them prepackaged already.
32+
``perf`` will trace all CPU events on your system, hence you might need to
33+
fiddle with some kernel set up files to get permissions to trace events.
34+
35+
.. note::
36+
``perf`` **is NOT** available on ``stallo``. Even if it were, you would
37+
probably not have permissions to record kernel traces.
38+
39+
These are the instructions I used:
40+
41+
1. Trace execution. This will save CPU stack traces to a ``perf.data`` file.
42+
Successive runs do not overwrite this file.
43+
44+
.. code-block:: bash
45+
46+
>>> cd tests/benchmark
47+
48+
>>> perf record -F 99 -g -- env PYTHONPATH=<build_dir>/lib64/python:$PYTHONPATH python
49+
<build_dir>/bin/go_pcm.py --inp=standalone.pcm --exe=<build_dir>/bin
50+
51+
2. Get reports. There are different ways of getting a report from the
52+
``perf.data`` file. The following will generate a call tree.
53+
54+
.. code-block:: bash
55+
56+
>>> perf report --stdio
57+
58+
3. Generate an interactive flame graph.
59+
60+
.. code-block:: bash
61+
62+
>>> perf script | stackcollapse-perf.pl > out.perf-folded
63+
64+
>>> cat out.perf-folded | flamegraph.pl > perf-run_pcm.svg
65+
66+
Using ``gperftools``
67+
====================
68+
69+
This set of tools was previously known as Google Performance Tools. The
70+
executable needs to be linked against the ``profiler``, ``tcmalloc``
71+
and ``unwind`` libraries.
72+
CMake will attempt to find them. If this fails, you will have to install them;
73+
you should either check if they are available for your distribution or compile
74+
from source.
75+
In principle, one could use the ``LD_PRELOAD`` mechanism to skip the *ad hoc*
76+
compilation of the executable.
77+
78+
.. note::
79+
``gperftools`` **is** available on ``stallo``, but it's an ancient version.
80+
81+
1. Configure the code with the ``--gperf`` option enabled. CPU and heap
82+
profiling, together with heap-checking, will be available.
83+
84+
2. CPU profiling can be done with the following command:
85+
86+
.. code-block:: bash
87+
88+
>>> env CPUPROFILE=run_pcm.cpu.prof PYTHONPATH=<build_dir>/lib64/python:$PYTHONPATH
89+
python <build_dir>/bin/go_pcm.py --inp=standalone.pcm --exe=<build_dir>/bin
90+
91+
This will save the data to the ``run_pcm.cpu.prof`` file. To analyze the gathered
92+
data we can use the ``pprof`` script:
93+
94+
.. code-block:: bash
95+
96+
>>> pprof --text <build_dir>/bin/run_pcm run_pcm.cpu.prof
97+
98+
This will print a table. Any row will look like the following:
99+
100+
.. code-block:: bash
101+
102+
2228 7.2% 24.8% 28872 93.4% pcm::utils::splineInterpolation
103+
104+
where the columns respectively report:
105+
106+
#. Number of profiling samples in this function.
107+
#. Percentage of profiling samples in this function.
108+
#. Percentage of profiling samples in the functions printed so far.
109+
#. Number of profiling samples in this function and its callees.
110+
#. Percentage of profiling samples in this function and its callees.
111+
#. Function name.
112+
113+
For more details look `here <https://gperftools.github.io/gperftools/cpuprofile.html>`_.
114+
115+
3. Heap profiling can be done with the following command:
116+
117+
.. code-block:: bash
118+
119+
>>> env HEAPPROFILE=run_pcm.hprof PYTHONPATH=<build_dir>/lib64/python:$PYTHONPATH
120+
python <build_dir>/bin/go_pcm.py --inp=standalone.pcm --exe=<build_dir>/bin
121+
122+
This will output a series of datafiles ``run_pcm.hprof.0000.heap``,
123+
``run_pcm.hprof.0001.heap`` and so forth. You will have to kill execution
124+
when enough samples have been collected.
125+
Analysis of the heap profiling data can be done using ``pprof``. `Read more
126+
here <https://gperftools.github.io/gperftools/heapprofile.html>`_.
127+
128+
129+
Using Intel VTune
130+
=================
131+
132+
This is probably the easiest way to profile the code.
133+
`VTune <https://software.intel.com/en-us/intel-vtune-amplifier-xe>`_ is Intel software; it might be possible to get a personal, free license.
134+
The instructions will hold on any machine where VTune is installed and you can
135+
look for more details in the `online documentation <https://software.intel.com/en-us/vtune-amplifier-help>`_.
136+
You can, in principle, use the GUI. I haven't managed to do that though.
137+
138+
On ``stallo``, start an interactive job and load the following modules:
139+
140+
.. code-block:: bash
141+
142+
>>> module load intel/2018a
143+
144+
>>> module load CMake
145+
146+
>>> module load VTune
147+
148+
>>> export BOOST_INCLUDEDIR=/home/roberto/Software/boost/include
149+
150+
>>> export BOOST_LIBRARYDIR=/home/roberto/Software/boost/lib
151+
152+
You will need to compile with optimizations activated, *i.e.* release mode.
153+
It is better to first parse the input file and then call ``run_pcm``:
154+
155+
.. code-block:: bash
156+
157+
>>> cd <build_dir>/tests/benchmark
158+
159+
>>> env PYTHONPATH=../../lib64/python:$PYTHONPATH
160+
python ../../bin/go_pcm.py --inp=standalone_bubble.pcm
161+
162+
To start collecting hotspots:
163+
164+
.. code-block:: bash
165+
166+
>>> amplxe-cl -collect hotspots ../../bin/run_pcm @standalone_bubble.pcm
167+
168+
VTune will generate a folder ``r000hs`` with the collected results. A report
169+
for the hotspots can be generated with:
170+
171+
.. code-block:: bash
172+
173+
>>> amplxe-cl -report hotspots -r r000hs > report

doc/programmers/programmers-manual.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,6 @@ PCMSolver Programmers' Manual
1010
cmake-usage
1111
versioning
1212
maintenance
13+
profiling
1314
testing
1415
timer-class

setup.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from autocmake import configure
1212
from autocmake.external import docopt
1313

14-
1514
options = """
1615
Usage:
1716
./setup.py [options] [<builddir>]
@@ -27,6 +26,7 @@
2726
--ccache=<USE_CCACHE> Toggle use of ccache <ON/OFF> [default: ON].
2827
--add-definitions=<STRING> Add preprocesor definitions [default: ''].
2928
--coverage Enable code coverage [default: OFF].
29+
--gperf Enable profiling with gperftools [default: False].
3030
--int64 Enable 64bit integers [default: False].
3131
--omp Enable OpenMP parallelization [default: False].
3232
--python=<PYTHON_INTERPRETER> The Python interpreter (development version) to use. [default: ''].
@@ -53,12 +53,16 @@ def gen_cmake_command(options, arguments):
5353
"""
5454
command = []
5555
command.append(arguments['--cmake-executable'])
56-
command.append('-DCMAKE_Fortran_COMPILER={0} -DEXTRA_FCFLAGS="{1}"'.format(arguments['--fc'], arguments['--extra-fc-flags']))
57-
command.append('-DCMAKE_C_COMPILER={0} -DEXTRA_CFLAGS="{1}"'.format(arguments['--cc'], arguments['--extra-cc-flags']))
58-
command.append('-DCMAKE_CXX_COMPILER={0} -DEXTRA_CXXFLAGS="{1}"'.format(arguments['--cxx'], arguments['--extra-cxx-flags']))
56+
command.append('-DCMAKE_Fortran_COMPILER={0} -DEXTRA_FCFLAGS="{1}"'.format(arguments['--fc'],
57+
arguments['--extra-fc-flags']))
58+
command.append('-DCMAKE_C_COMPILER={0} -DEXTRA_CFLAGS="{1}"'.format(arguments['--cc'],
59+
arguments['--extra-cc-flags']))
60+
command.append('-DCMAKE_CXX_COMPILER={0} -DEXTRA_CXXFLAGS="{1}"'.format(arguments['--cxx'],
61+
arguments['--extra-cxx-flags']))
5962
command.append('-DUSE_CCACHE={0}'.format(arguments['--ccache']))
6063
command.append('-DPREPROCESSOR_DEFINITIONS="{0}"'.format(arguments['--add-definitions']))
6164
command.append('-DENABLE_CODE_COVERAGE={0}'.format(arguments['--coverage']))
65+
command.append('-DENABLE_GPERFTOOLS={0}'.format(arguments['--gperf']))
6266
command.append('-DENABLE_64BIT_INTEGERS={0}'.format(arguments['--int64']))
6367
command.append('-DENABLE_OPENMP={0}'.format(arguments['--omp']))
6468
command.append('-DPYTHON_INTERPRETER="{0}"'.format(arguments['--python']))
@@ -88,22 +92,17 @@ def gen_cmake_command(options, arguments):
8892
sys.stderr.write(options)
8993
sys.exit(-1)
9094

91-
9295
# use extensions to validate/post-process args
9396
if configure.module_exists('extensions'):
9497
import extensions
9598
arguments = extensions.postprocess_args(sys.argv, arguments)
9699

97-
98100
root_directory = os.path.dirname(os.path.realpath(__file__))
99101

100-
101102
build_path = arguments['<builddir>']
102103

103-
104104
# create cmake command
105105
cmake_command = '{0} -H{1}'.format(gen_cmake_command(options, arguments), root_directory)
106106

107-
108107
# run cmake
109108
configure.configure(root_directory, build_path, cmake_command, arguments['--show'])

src/bin/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ if(STATIC_LIBRARY_ONLY)
1212
else()
1313
target_link_libraries(run_pcm pcm-shared)
1414
endif()
15+
if(ENABLE_GPERFTOOLS)
16+
target_link_libraries(run_pcm ${GPERFTOOLS_LIBRARIES})
17+
endif()
1518
target_compile_options(run_pcm
1619
PRIVATE
1720
"$<$<CONFIG:DEBUG>:${EXDIAG_CXX_FLAGS}>"

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ add_subdirectory(bi_operators)
2020
add_subdirectory(cpcm)
2121
add_subdirectory(iefpcm)
2222
add_subdirectory(utils)
23+
add_subdirectory(benchmark)
2324

2425
add_executable(unit_tests unit_tests.cpp
2526
$<TARGET_OBJECTS:bi_operators-tests>

0 commit comments

Comments
 (0)