|
| 1 | +############################################################################## |
| 2 | +# (c) Crown copyright 2025 Met Office. All rights reserved. |
| 3 | +# The file LICENCE, distributed with this code, contains details of the terms |
| 4 | +# under which the code may be used. |
| 5 | +############################################################################## |
| 6 | +""" |
| 7 | +Optimisation script that replaces existing OpenMP parallelisation with |
| 8 | +PSyclone-generated directives to target loops over index i instead of |
| 9 | +index j. Trip count of j loops is 1 in LFRic, which prevents parallel |
| 10 | +execution. Private variables need to be declared explicitly as PSyclone |
| 11 | +analysis currently misses a scalar variable that a subroutine modifies in |
| 12 | +a parallel region. PSyclone thread safety checks need to be overridden; |
| 13 | +the subroutines can be safely parallelised. Compiler directives used in |
| 14 | +the original code are re-inserted for performance and consistency of output. |
| 15 | +""" |
| 16 | + |
| 17 | +import logging |
| 18 | +from psyclone.transformations import TransformationError |
| 19 | +from psyclone.psyir.nodes import (Loop, CodeBlock) |
| 20 | +from transmute_psytrans.transmute_functions import ( |
| 21 | + set_pure_subroutines, |
| 22 | + get_outer_loops, |
| 23 | + mark_explicit_privates, |
| 24 | + get_compiler, |
| 25 | + first_priv_red_init, |
| 26 | + match_lhs_assignments, |
| 27 | + OMP_PARALLEL_REGION_TRANS, |
| 28 | + OMP_DO_LOOP_TRANS_STATIC |
| 29 | +) |
| 30 | + |
# Names of the variables that are declared private explicitly for the
# last parallel region (handed to mark_explicit_privates in trans)
private_variables = [
    "alphal", "alx",
    "i", "j", "k", "km1", "kp1",
    "mux", "tmp", "frac_init",
    "kk", "kkm1", "kkp1",
    "qc", "qc_points", "qsl", "tlx", "qsi", "idx",
    "deltacl_c", "deltacf_c", "deltaql_c",
    "cf_c", "cfl_c", "cff_c",
]

# Names of the subroutines that are flagged as "pure"
# (handed to set_pure_subroutines in trans)
pure_subroutines = [
    "qsat",
    "qsat_mix",
    "qsat_wat",
    "qsat_wat_mix",
]

# Left-hand-side variables of assignments whose PSyclone dependency
# errors are treated as false positives and can be ignored
false_dep_vars = ["qc_points", "idx"]
| 48 | + |
| 49 | + |
class CompilerDirective:
    """
    Lightweight stand-in for fparser.two.Fortran2003.Directive.

    It exists to sidestep an issue with the fparser class that is
    expected to be resolved in an upcoming fparser release. Instances
    only carry the directive text and render it on request.
    """

    def __init__(self, directive):
        # Directive text without the leading "!DIR$ " marker
        self.directive = directive

    def tofortran(self):
        """
        Render the directive as a Fortran comment line, i.e. the
        stored text preceded by the "!DIR$ " prefix.
        """
        # join() keeps the original str-only contract: a non-string
        # directive raises TypeError, exactly as "+" concatenation does
        return "".join(("!DIR$ ", self.directive))
| 64 | + |
| 65 | + |
def _insert_directive_before(loop, directive):
    """
    Insert a compiler directive as a CodeBlock statement immediately
    before ``loop`` in the children of its parent node.

    :param loop: PSyIR loop node that the directive should precede.
    :param directive: directive text without the "!DIR$ " prefix.
    """
    cblock = CodeBlock([CompilerDirective(directive)],
                       CodeBlock.Structure.STATEMENT)
    siblings = loop.parent.children
    siblings.insert(siblings.index(loop), cblock)


def trans(psyir):
    """
    Apply OpenMP and Compiler Directives

    The first two outermost loop nests are enclosed in one parallel
    region, the third one in a second parallel region. Selected loops
    inside those nests receive static OpenMP DO directives and, when
    the compiler is CCE, NOFISSION and IVDEP compiler directives.

    A failure in either region is logged as a warning and leaves the
    remaining processing unaffected; it is never re-raised.

    :param psyir: PSyIR tree of the source file to be transformed.
    """

    # Declare subroutines as pure to enable parallelisation
    # of the encompassing loops
    set_pure_subroutines(psyir, pure_subroutines)

    # Identify outer loops for setting up parallel regions
    outer_loops = [loop for loop in get_outer_loops(psyir)
                   if not loop.ancestor(Loop)]

    # Check if first OpenMP region can be parallelised and
    # apply directives
    try:
        # Resolve every loop that is going to be transformed before
        # the tree is modified. A missing loop then raises IndexError
        # up front instead of leaving a parallel region behind in
        # which one of the loop nests has no worksharing directive.
        first_region = outer_loops[0:2]
        first_targets = [outer_loops[0], outer_loops[1].walk(Loop)[1]]

        OMP_PARALLEL_REGION_TRANS.validate(first_region)
        OMP_PARALLEL_REGION_TRANS.apply(first_region)
        for loop in first_targets:
            OMP_DO_LOOP_TRANS_STATIC.apply(loop)
    except (TransformationError, IndexError) as err:
        logging.warning("Parallelisation of the 1st region failed: %s", err)

    # The second region is built around the third outer loop nest.
    # Report its absence in the same way as any other failure of this
    # region rather than letting the IndexError escape from trans().
    try:
        last_nest = outer_loops[2]
    except IndexError as err:
        logging.warning("Parallelisation of the 2nd region failed: %s", err)
        return

    # Declare private symbols for the last loop nest explicitly,
    # PSyclone misses one
    mark_explicit_privates(last_nest, private_variables)

    # Parallelise the second region and insert compiler directives
    # Add redundant variable initialisation to work around a known
    # PSyclone issue when using CCE
    try:
        # Query the compiler once; both CCE-specific steps use it
        is_cce = get_compiler() == "cce"

        if is_cce:
            first_priv_red_init(last_nest, ["i", "j", "k"])

        OMP_PARALLEL_REGION_TRANS.validate([last_nest])
        OMP_PARALLEL_REGION_TRANS.apply(last_nest)

        # The slices below select loops by their position in the list
        # returned by walk(Loop) on the third loop nest.

        # Insert before OpenMP directives to avoid PSyclone errors
        if is_cce:
            for loop in last_nest.walk(Loop)[3:5]:
                _insert_directive_before(loop, "NOFISSION")

            for loop in last_nest.walk(Loop)[13:16]:
                _insert_directive_before(loop, "IVDEP")

        for loop in last_nest.walk(Loop)[2:7]:
            OMP_DO_LOOP_TRANS_STATIC.apply(loop)

        for loop in last_nest.walk(Loop)[8:13:2]:
            # Check if any eligible variables appear on the LHS of
            # assignment expressions; these lead to false dependency
            # errors in the parallel loop transformation that can be
            # ignored
            ignore_deps_vars = match_lhs_assignments(loop, false_dep_vars)
            options = {}
            if ignore_deps_vars:
                options["ignore_dependencies_for"] = ignore_deps_vars

            OMP_DO_LOOP_TRANS_STATIC.apply(loop, options)

    except (TransformationError, IndexError) as err:
        logging.warning("Parallelisation of the 2nd region failed: %s", err)
0 commit comments