Skip to content

Commit 04bea56

Browse files
committed
Enable Papi high level stats within the iterate construct
1 parent 209a90d commit 04bea56

File tree

6 files changed

+112
-53
lines changed

6 files changed

+112
-53
lines changed

gibbon-compiler/src/Gibbon/Compiler.hs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,7 @@ compileRTS Config{verbosity,optc,dynflags} = do
377377
++ (if pointer then " POINTER=1 " else "")
378378
++ (if parallel then " PARALLEL=1 " else "")
379379
++ (if bumpAlloc then " BUMPALLOC=1 " else "")
380+
++ (if papi then " PAPI=1 " else "")
380381
++ (" USER_CFLAGS=\"" ++ optc ++ "\"")
381382
++ (" VERBOSITY=" ++ show verbosity)
382383
execCmd
@@ -392,6 +393,7 @@ compileRTS Config{verbosity,optc,dynflags} = do
392393
rts_debug = gopt Opt_RtsDebug dynflags
393394
print_gc_stats = gopt Opt_PrintGcStats dynflags
394395
genGC = gopt Opt_GenGc dynflags
396+
papi = gopt Opt_PapiInstrumentation dynflags
395397

396398

397399
-- | Compile and run the generated code if appropriate
@@ -426,6 +428,10 @@ compileAndRunExe cfg@Config{backend,arrayInput,benchInput,mode,cfile,exefile} fp
426428
links = if pointer
427429
then " -lgc -lm "
428430
else " -lm "
431+
papi = gopt Opt_PapiInstrumentation (dynflags cfg)
432+
links' = if papi
433+
then links ++ "-l:libpapi.a "
434+
else links
429435
compile_program = do
430436
compileRTS cfg
431437
lib_dir <- getRTSBuildDir
@@ -436,7 +442,7 @@ compileAndRunExe cfg@Config{backend,arrayInput,benchInput,mode,cfile,exefile} fp
436442
++" -L" ++ lib_dir
437443
++ " -Wl,-rpath=" ++ lib_dir ++ " "
438444
++ outfile ++ " " ++ rts_o_path
439-
++ links ++ " -lgibbon_rts_ng"
445+
++ links' ++ " -lgibbon_rts_ng"
440446

441447
execCmd
442448
Nothing
@@ -524,6 +530,7 @@ compilationCmd C config = (cc config) ++" -std=gnu11 "
524530
++ (if not genGC then " -D_GIBBON_GENGC=0 " else " -D_GIBBON_GENGC=1 ")
525531
++ (if simpleWriteBarrier then " -D_GIBBON_SIMPLE_WRITE_BARRIER=1 " else " -D_GIBBON_SIMPLE_WRITE_BARRIER=0 ")
526532
++ (if lazyPromote then " -D_GIBBON_EAGER_PROMOTION=0 " else " -D_GIBBON_EAGER_PROMOTION=1 ")
533+
++ (if papi then " -D_GIBBON_ENABLE_PAPI " else "")
527534
where dflags = dynflags config
528535
bumpAlloc = gopt Opt_BumpAlloc dflags
529536
pointer = gopt Opt_Pointer dflags
@@ -534,6 +541,7 @@ compilationCmd C config = (cc config) ++" -std=gnu11 "
534541
genGC = gopt Opt_GenGc dflags
535542
simpleWriteBarrier = gopt Opt_SimpleWriteBarrier dflags
536543
lazyPromote = gopt Opt_NoEagerPromote dflags
544+
papi = gopt Opt_PapiInstrumentation dflags
537545

538546
-- |
539547
isBench :: Mode -> Bool

gibbon-compiler/src/Gibbon/DynFlags.hs

Lines changed: 32 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -14,34 +14,36 @@ import Data.Set as S
1414
import Options.Applicative
1515

1616
data GeneralFlag
17-
= Opt_Gibbon1 -- ^ Set Opt_No_RemoveCopies & Opt_BigInfiniteRegions
18-
| Opt_Gibbon2 -- ^ Set Opt_RemoveCopies & Opt_InfiniteRegions
19-
| Opt_RemoveCopies -- ^ Calls to copy functions are converted to indirections
20-
| Opt_No_RemoveCopies -- ^ Unset Opt_RemoveCopies
21-
| Opt_InfiniteRegions -- ^ Use infinite regions
22-
| Opt_BigInfiniteRegions -- ^ Use big infinite regions
23-
| Opt_BenchPrint -- ^ Should the benchamrked function have its output printed?
24-
| Opt_Packed -- ^ Use packed representation
25-
| Opt_Pointer -- ^ Use pointer representation
26-
| Opt_BumpAlloc -- ^ Use bump-pointer allocation if using the non-packed backend
27-
| Opt_Warnc -- ^ Show warnings from the C compiler
28-
| Opt_DisableGC -- ^ Don't run the the garbage collector (used by Codegen).
29-
| Opt_No_PureAnnot -- ^ Don't use 'pure' annotations (a GCC optimization)
30-
| Opt_Fusion -- ^ Enable fusion.
31-
| Opt_Parallel -- ^ Fork/join parallelism.
32-
| Opt_RegionOnSpawn -- ^ Allocate into fresh regions for every spawn, not steal.
33-
| Opt_GhcTc -- ^ Typecheck with GHC before compiling with Gibbon.
34-
| Opt_RelativeOffsets -- ^ Enable relative offsets.
35-
| Opt_CountParRegions -- ^ Count and print the number of regions allocated for parallelism.
36-
| Opt_CountAllRegions -- ^ Count and print the number of all the regions allocated.
37-
| Opt_RtsDebug -- ^ Compile the RTS in debugging mode.
38-
| Opt_PrintGcStats -- ^ Record and print GC statistics.
39-
| Opt_GenGc -- ^ Use the new non-generational GC.
40-
| Opt_NoEagerPromote -- ^ Disable eager promotion.
41-
| Opt_SimpleWriteBarrier -- ^ Disables eliminate-indirection-chains optimization.
42-
| Opt_Layout_Local -- ^ Optimize the layout of Algebraic data types locally
43-
| Opt_Layout_Global -- ^ Optimize the layout of Algebraic data types globally
44-
| Opt_Layout_Use_Solver -- ^ Use the Solver to optimize the layout of the data types.
17+
= Opt_Gibbon1 -- ^ Set Opt_No_RemoveCopies & Opt_BigInfiniteRegions
18+
| Opt_Gibbon2 -- ^ Set Opt_RemoveCopies & Opt_InfiniteRegions
19+
| Opt_RemoveCopies -- ^ Calls to copy functions are converted to indirections
20+
| Opt_No_RemoveCopies -- ^ Unset Opt_RemoveCopies
21+
| Opt_InfiniteRegions -- ^ Use infinite regions
22+
| Opt_BigInfiniteRegions -- ^ Use big infinite regions
23+
| Opt_BenchPrint -- ^ Should the benchmarked function have its output printed?
24+
| Opt_Packed -- ^ Use packed representation
25+
| Opt_Pointer -- ^ Use pointer representation
26+
| Opt_BumpAlloc -- ^ Use bump-pointer allocation if using the non-packed backend
27+
| Opt_Warnc -- ^ Show warnings from the C compiler
28+
| Opt_DisableGC -- ^ Don't run the garbage collector (used by Codegen).
29+
| Opt_No_PureAnnot -- ^ Don't use 'pure' annotations (a GCC optimization)
30+
| Opt_Fusion -- ^ Enable fusion.
31+
| Opt_Parallel -- ^ Fork/join parallelism.
32+
| Opt_RegionOnSpawn -- ^ Allocate into fresh regions for every spawn, not steal.
33+
| Opt_GhcTc -- ^ Typecheck with GHC before compiling with Gibbon.
34+
| Opt_RelativeOffsets -- ^ Enable relative offsets.
35+
| Opt_CountParRegions -- ^ Count and print the number of regions allocated for parallelism.
36+
| Opt_CountAllRegions -- ^ Count and print the number of all the regions allocated.
37+
| Opt_RtsDebug -- ^ Compile the RTS in debugging mode.
38+
| Opt_PrintGcStats -- ^ Record and print GC statistics.
39+
| Opt_GenGc -- ^ Use the new generational GC.
40+
| Opt_NoEagerPromote -- ^ Disable eager promotion.
41+
| Opt_SimpleWriteBarrier -- ^ Disables eliminate-indirection-chains optimization.
42+
| Opt_Layout_Local -- ^ Optimize the layout of Algebraic data types locally
43+
| Opt_Layout_Global -- ^ Optimize the layout of Algebraic data types globally
44+
| Opt_Layout_Use_Solver -- ^ Use the Solver to optimize the layout of the data types.
45+
| Opt_PapiInstrumentation -- ^ Enable PAPI instrumentation while compiling the gibbon binary.
46+
4547
deriving (Show,Read,Eq,Ord)
4648

4749
-- | Exactly like GHC's ddump flags.
@@ -120,7 +122,8 @@ dynflagsParser = DynFlags <$> (S.fromList <$> many gflagsParser) <*> (S.fromList
120122
flag' Opt_SimpleWriteBarrier (long "simple-write-barrier" <> help "Disables eliminate-indirection-chains optimization.") <|>
121123
flag' Opt_Layout_Local (long "opt-layout-local" <> help "Optimizes the Layout of Algebraic data types locally") <|>
122124
flag' Opt_Layout_Global (long "opt-layout-global" <> help "Optimizes the Layout of Algebraic data types globally") <|>
123-
flag' Opt_Layout_Use_Solver (long "opt-layout-use-solver" <> help "Use the solver instead of a Greedy Heuristic")
125+
flag' Opt_Layout_Use_Solver (long "opt-layout-use-solver" <> help "Use the solver instead of a Greedy Heuristic") <|>
126+
flag' Opt_PapiInstrumentation (long "enable-papi" <> help "Enable instrumentation using papi, extends the iterate timing function." )
124127

125128

126129
dflagsParser :: Parser DebugFlag

gibbon-compiler/src/Gibbon/Passes/Codegen.hs

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,9 @@ codegenProg cfg prg@(Prog info_tbl sym_tbl funs mtal) =
326326
\#include <cilk/cilk.h>\n\
327327
\#include <cilk/cilk_api.h>\n\
328328
\#endif\n\n\
329+
\#ifdef _GIBBON_ENABLE_PAPI\n\
330+
\#include <papi.h>\n\
331+
\#endif\n\n\
329332
\/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\
330333
\ * Program starts here\n\
331334
\ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\
@@ -629,6 +632,8 @@ codegenTail venv fenv sort_fns (LetTimedT flg bnds rhs body) ty sync_deps =
629632
selftimed <- gensym "selftimed"
630633
times <- gensym "times"
631634
tmp <- gensym "tmp"
635+
papi_retval <- gensym "papi_retval"
636+
papi_region <- gensym "papi_region"
632637
let ident = case bnds of
633638
((v,_):_) -> v
634639
_ -> (toVar "")
@@ -659,13 +664,36 @@ codegenTail venv fenv sort_fns (LetTimedT flg bnds rhs body) ty sync_deps =
659664
, C.BlockStm [cstm| printf("itertime: %lf\n", $id:itertime); |]
660665
, C.BlockStm [cstm| gib_vector_inplace_update($id:times, $id:iters, &($id:itertime)); |]
661666
]
662-
in [ C.BlockStm [cstm| for (long long $id:iters = 0; $id:iters < gib_get_iters_param(); $id:iters ++) { $items:body } |]
663-
, C.BlockStm [cstm| gib_vector_inplace_sort($id:times, gib_compare_doubles); |]
664-
, C.BlockDecl [cdecl| double *$id:tmp = (double*) gib_vector_nth($id:times, (gib_get_iters_param() / 2)); |]
665-
, C.BlockDecl [cdecl| double $id:selftimed = *($id:tmp); |]
666-
, C.BlockDecl [cdecl| double $id:batchtime = gib_sum_timing_array($id:times); |]
667-
, C.BlockStm [cstm| gib_print_timing_array($id:times); |]
668-
, C.BlockStm [cstm| gib_vector_free($id:times); |]
667+
-- TODO: Find a better way to get a name for the region id.
668+
ifdef = "#ifdef _GIBBON_ENABLE_PAPI"
669+
endif = "#endif"
670+
-- Loop body for one timed iteration, bracketed by a PAPI high-level region.
-- Everything PAPI-specific is emitted between the `ifdef`/`endif` guard
-- strings bound just above, so the extra C only compiles when
-- _GIBBON_ENABLE_PAPI is defined. Per iteration the generated C:
--   1. writes the current region id (in decimal) into the region-name buffer,
--   2. opens a region with PAPI_hl_region_begin, exiting with status 1 on
--      any return value other than PAPI_OK,
--   3. runs the original timed `body`,
--   4. closes the region with PAPI_hl_region_end (same error handling),
--   5. bumps the region id via increment_papi_region_id().
body' = [ C.BlockStm [cstm| $escstm:ifdef |]
671+
, C.BlockStm [cstm| sprintf($id:papi_region, "%llu", (unsigned long long) get_papi_region_id());|] -- get_papi_region_id() returns uint64_t; "%d" would be a format/argument mismatch (undefined behaviour)
672+
, C.BlockDecl [cdecl| int $id:papi_retval = PAPI_hl_region_begin($id:papi_region);|]
673+
, C.BlockStm [cstm| if ( $id:papi_retval != PAPI_OK ) {
674+
exit(1);
675+
} |]
676+
, C.BlockStm [cstm| $escstm:endif |]
677+
] ++
678+
body ++
679+
[ C.BlockStm [cstm| $escstm:ifdef |]
680+
, C.BlockStm [cstm| $id:papi_retval = PAPI_hl_region_end($id:papi_region);|]
681+
, C.BlockStm [cstm| if ( $id:papi_retval != PAPI_OK ) {
682+
exit(1);
683+
} |]
684+
, C.BlockStm [cstm| increment_papi_region_id(); |]
685+
, C.BlockStm [cstm| $escstm:endif |]
686+
]
687+
in [ C.BlockStm [cstm| $escstm:ifdef |]
688+
, C.BlockDecl [cdecl| char $id:papi_region[128];|]
689+
, C.BlockStm [cstm| $escstm:endif |]
690+
, C.BlockStm [cstm| for (long long $id:iters = 0; $id:iters < gib_get_iters_param(); $id:iters ++) { $items:body' } |]
691+
, C.BlockStm [cstm| gib_vector_inplace_sort($id:times, gib_compare_doubles); |]
692+
, C.BlockDecl [cdecl| double *$id:tmp = (double*) gib_vector_nth($id:times, (gib_get_iters_param() / 2)); |]
693+
, C.BlockDecl [cdecl| double $id:selftimed = *($id:tmp); |]
694+
, C.BlockDecl [cdecl| double $id:batchtime = gib_sum_timing_array($id:times); |]
695+
, C.BlockStm [cstm| gib_print_timing_array($id:times); |]
696+
, C.BlockStm [cstm| gib_vector_free($id:times); |]
669697
])
670698

671699
-- else

gibbon-rts/Makefile

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# POINTER
1414
# PARALLEL
1515
# BUMPALLOC
16+
# PAPI
1617
#
1718
#
1819
# GC toggles:
@@ -69,6 +70,10 @@ ifeq ($(POINTER), 1)
6970
CFLAGS += -D_GIBBON_POINTER
7071
endif
7172

73+
ifeq ($(PAPI), 1)
74+
CFLAGS += -D_GIBBON_ENABLE_PAPI
75+
endif
76+
7277
ifeq ($(PARALLEL), 1)
7378
CFLAGS += -fcilkplus -D_GIBBON_PARALLEL
7479
endif
@@ -111,7 +116,6 @@ RUST_RTS_SO := libgibbon_rts_ng.so
111116
RUST_RTS_PATH := $(RUST_RTS_DIR)/target/$(MODE)/$(RUST_RTS_SO)
112117
RUST_SOURCES := $(shell find $(RUST_RTS_DIR) -type f -name *.rs)
113118

114-
115119
all: rts
116120

117121
rts: c_rts rs_rts
@@ -146,7 +150,7 @@ $(C_RTS_DIR)/%.o: $(C_RTS_DIR)/%.c
146150

147151
$(BUILD_DIR)/%.h: $(C_RTS_DIR)/%.h
148152
mkdir -p $(BUILD_DIR) && \
149-
ln -s $^ $@
153+
ln -s -f $^ $@
150154

151155
$(BUILD_DIR):
152156
mkdir -p $(BUILD_DIR)

gibbon-rts/rts-c/gibbon_rts.c

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,9 @@
3535
#include <cilk/cilk_api.h>
3636
#endif
3737

38-
39-
38+
#ifdef _GIBBON_ENABLE_PAPI
39+
#include <papi.h>
40+
#endif
4041

4142
/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4243
* Globals and their accessors
@@ -61,6 +62,8 @@ static int64_t gib_global_region_count = 0;
6162
// Invariant: should always be equal to max(sym_table_keys).
6263
static GibSym gib_global_gensym_counter = 0;
6364

65+
//PAPI: specify the region to instrument
66+
static uint64_t papi_region_id = 0;
6467

6568

6669
size_t gib_get_biginf_init_chunk_size(void)
@@ -128,6 +131,16 @@ GibSym gib_read_gensym_counter(void)
128131
return gib_global_gensym_counter;
129132
}
130133

134+
uint64_t get_papi_region_id(void)
135+
{
136+
return papi_region_id;
137+
}
138+
139+
void increment_papi_region_id(void)
140+
{
141+
papi_region_id++;
142+
}
143+
131144

132145
/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
133146
* Allocators

gibbon-rts/rts-c/gibbon_rts.h

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,18 +14,19 @@
1414
* CPP macros used in the RTS:
1515
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1616
*
17-
* _GIBBON_VERBOSITY=int verbosity level for debug output
18-
* _GIBBON_DEBUG enables various assertions if present
19-
* _GIBBON_GCSTATS collect GC statistics if present
20-
* _GIBBON_PRINT_GCSTATS print GC statistics if present
21-
* _GIBBON_GENGC only use old reference counted GC set to 0
22-
* _GIBBON_BOUNDSCHECK boundscheck vector accesses
23-
* _GIBBON_BUMPALLOC_LISTS bump allocated linked lists
24-
* _GIBBON_BUMPALLOC_HEAP bump allocated gib_alloc
25-
* _GIBBON_POINTER pointer mode gib_alloc
26-
* _GIBBON_PARALLEL parallel mode
27-
* _GIBBON_EAGER_PROMOTION disable eager promotion if set to 0
28-
* _GIBBON_SIMPLE_WRITE_BARRIER disable eliminate-indirection-chains optimization
17+
* _GIBBON_VERBOSITY=int verbosity level for debug output
18+
* _GIBBON_DEBUG enables various assertions if present
19+
* _GIBBON_GCSTATS collect GC statistics if present
20+
* _GIBBON_PRINT_GCSTATS print GC statistics if present
21+
* _GIBBON_GENGC only use old reference counted GC if set to 0
22+
* _GIBBON_BOUNDSCHECK boundscheck vector accesses
23+
* _GIBBON_BUMPALLOC_LISTS bump allocated linked lists
24+
* _GIBBON_BUMPALLOC_HEAP bump allocated gib_alloc
25+
* _GIBBON_POINTER pointer mode gib_alloc
26+
* _GIBBON_PARALLEL parallel mode
27+
* _GIBBON_EAGER_PROMOTION disable eager promotion if set to 0
28+
* _GIBBON_SIMPLE_WRITE_BARRIER disable eliminate-indirection-chains optimization
29+
* _GIBBON_ENABLE_PAPI enable instrumentation via papi
2930
*
3031
*/
3132

@@ -116,6 +117,8 @@ char *gib_read_bench_prog_param(void);
116117
char *gib_read_benchfile_param(void);
117118
char *gib_read_arrayfile_param(void);
118119
uint64_t gib_read_arrayfile_length_param(void);
120+
uint64_t get_papi_region_id(void);
121+
void increment_papi_region_id(void);
119122

120123
// Number of regions allocated.
121124
int64_t gib_read_region_count(void);

0 commit comments

Comments
 (0)