diff --git a/tools/flang1/flang1exe/symtab.c b/tools/flang1/flang1exe/symtab.c index ad7cb62500..a423aef3a4 100644 --- a/tools/flang1/flang1exe/symtab.c +++ b/tools/flang1/flang1exe/symtab.c @@ -479,7 +479,8 @@ get_ieee_arith_intrin(char *nm) int getsymbol(const char *name) { - return getsym(name, strlen(name)); + int sym = getsym(name, strlen(name)); + return sym; } /** \brief Enter symbol with indicated name into symbol table, initialize diff --git a/tools/flang2/flang2exe/cgmain.cpp b/tools/flang2/flang2exe/cgmain.cpp index 797024be65..f89e9d2059 100644 --- a/tools/flang2/flang2exe/cgmain.cpp +++ b/tools/flang2/flang2exe/cgmain.cpp @@ -4301,7 +4301,7 @@ make_stmt(STMT_Type stmt_type, int ilix, bool deletable, SPTR next_bih_label, int alignment; INSTR_LIST *Curr_Instr; - DBGTRACEIN2(" type: %s ilix: %d", stmt_names[stmt_type], ilix) + DBGTRACEIN2(" type: %s ilix: %d", stmt_names[stmt_type], ilix); curr_stmt_type = stmt_type; if (last_stmt_is_branch && stmt_type != STMT_LABEL) { @@ -12267,7 +12267,7 @@ process_sptr_offset(SPTR sptr, ISZ_T off) } if ((flg.smp || (XBIT(34, 0x200) || gbl.usekmpc)) && (gbl.outlined || ISTASKDUPG(GBL_CURRFUNC))) { - if (sptr == ll_get_shared_arg(gbl.currsub)) { + if (sptr == ll_get_shared_arg(gbl.currsub) && !gbl.is_init_spmd_kernel) { LLTYPE(sptr) = make_ptr_lltype(make_lltype_from_dtype(DT_INT8)); } } @@ -14063,7 +14063,6 @@ process_formal_arguments(LL_ABI_Info *abi) /* Other by-value kinds. */ break; } - /* This op represents the real LLVM argument, not the local variable. 
*/ arg_op = make_operand(); arg_op->ot_type = OT_VAR; @@ -14407,7 +14406,8 @@ INLINE void static add_property_struct(char *func_name, print_token("@"); print_token(func_name); - if (is_SPMD_mode(mode)) { + if (mode >= mode_target_teams_distribute_parallel_for + && mode <= mode_target_parallel_for_simd) { print_token("__exec_mode = weak constant i8 2\n"); } else { diff --git a/tools/flang2/flang2exe/exp_ftn.cpp b/tools/flang2/flang2exe/exp_ftn.cpp index a2ad8d618b..9de771c26e 100644 --- a/tools/flang2/flang2exe/exp_ftn.cpp +++ b/tools/flang2/flang2exe/exp_ftn.cpp @@ -4147,7 +4147,7 @@ exp_bran(ILM_OP opc, ILM *ilmp, int curilm) /***************************************************************/ void -exp_misc(ILM_OP opc, ILM *ilmp, int curilm) +exp_misc(ILM_OP opc, ILM *ilmp, int curilm, bool process_expanded) { int tmp; int ilix, listilix; @@ -4289,11 +4289,11 @@ exp_misc(ILM_OP opc, ILM *ilmp, int curilm) break; case IM_ENDF: - exp_end(ilmp, curilm, true); + exp_end(ilmp, curilm, true, process_expanded); break; case IM_END: - exp_end(ilmp, curilm, false); + exp_end(ilmp, curilm, false, process_expanded); break; case IM_BYVAL: diff --git a/tools/flang2/flang2exe/exp_ftn.h b/tools/flang2/flang2exe/exp_ftn.h index 9857b6a9cb..4832002ca6 100644 --- a/tools/flang2/flang2exe/exp_ftn.h +++ b/tools/flang2/flang2exe/exp_ftn.h @@ -53,7 +53,7 @@ void exp_bran(ILM_OP opc, ILM *ilmp, int curilm); /** \brief ... */ -void exp_misc(ILM_OP opc, ILM *ilmp, int curilm); +void exp_misc(ILM_OP opc, ILM *ilmp, int curilm, bool process_expanded = false); /** \brief ... 
diff --git a/tools/flang2/flang2exe/exp_rte.cpp b/tools/flang2/flang2exe/exp_rte.cpp index 011e8047c4..744a8c28b0 100644 --- a/tools/flang2/flang2exe/exp_rte.cpp +++ b/tools/flang2/flang2exe/exp_rte.cpp @@ -2139,7 +2139,7 @@ exp_alloca(ILM *ilmp) static void gen_funcret(finfo_t *); void -exp_end(ILM *ilmp, int curilm, bool is_func) +exp_end(ILM *ilmp, int curilm, bool is_func, bool process_expanded) { int tmp; int op1; @@ -2158,10 +2158,12 @@ exp_end(ILM *ilmp, int curilm, bool is_func) int ilix; if (flg.omptarget && !is_func) { if (XBIT(232, 0x40) && gbl.ompaccel_intarget && !OMPACCFUNCDEVG(gbl.currsub) /*is_gpu_output_file() */ ) { - ilix = ll_make_kmpc_target_deinit( - ompaccel_tinfo_get(gbl.currsub)->mode); - iltb.callfg = 1; - chk_block(ilix); + OMP_TARGET_MODE mode = ompaccel_tinfo_get(gbl.currsub)->mode; + if (!is_SPMD_mode(mode) && !process_expanded) { + ilix = ll_make_kmpc_target_deinit(mode); + iltb.callfg = 1; + chk_block(ilix); + } } } #endif diff --git a/tools/flang2/flang2exe/exp_rte.h b/tools/flang2/flang2exe/exp_rte.h index b8a5c9ce16..7d8394c9da 100644 --- a/tools/flang2/flang2exe/exp_rte.h +++ b/tools/flang2/flang2exe/exp_rte.h @@ -100,7 +100,7 @@ void exp_cgoto(ILM *ilmp, int curilm); /** \brief ... */ -void exp_end(ILM *ilmp, int curilm, bool is_func); +void exp_end(ILM *ilmp, int curilm, bool is_func, bool process_expanded = false); /** \brief ... 
diff --git a/tools/flang2/flang2exe/expand.cpp b/tools/flang2/flang2exe/expand.cpp index 12dd61e1ac..c395a8d026 100644 --- a/tools/flang2/flang2exe/expand.cpp +++ b/tools/flang2/flang2exe/expand.cpp @@ -54,6 +54,8 @@ #ifdef OMP_OFFLOAD_LLVM #include "tgtutil.h" #include "kmpcutil.h" +#include +#include #endif extern int in_extract_inline; /* Bottom-up auto-inlining */ @@ -62,6 +64,8 @@ static int create_ref(SPTR sym, int *pnmex, int basenm, int baseilix, int *pclen, int *pmxlen, int *prestype); static int jsr2qjsr(int); +SPTR +eval_ilm_check_if_skip(int ilmx, int *skip_expand = nullptr, int *process_expanded = nullptr); #define DO_PFO ((XBIT(148, 0x1000) && !XBIT(148, 0x4000)) || XBIT(148, 1)) /***************************************************************/ @@ -215,7 +219,6 @@ parse_im_file(const ILM *ilmp, int *lineno_out, int *findex_out, int *ftag_out) } /***************************************************************/ - /** \brief Expand ILMs to ILIs */ int expand(void) @@ -229,12 +232,39 @@ expand(void) int last_ftag = 0; int nextftag = 0, nextfindex = 0; int last_cpp_branch = 0; - + static int skip_expand; + static int skip_expand_sptr; + static std::map process_expanded_map = std::map(); + auto it = process_expanded_map.find(gbl.currsub); + int process_expanded = 0; + + //we are at the beginning of pragma expansion + //make sure that mploop_counter equals to zero + reset_mploop_counter(); + // we reset flag because we do not know if we generate initialization + // function for SPMD kernel (the function with kmpc_parallel_51 call) + // or the proper kernel code (the function which is passed as an argument + // to kmpc_parallel_51 call or generic kernel + gbl.is_init_spmd_kernel = false; + if (it != process_expanded_map.end()) + { + process_expanded = it->second; + } + else + { + process_expanded = 0; + } /* * NOTE, for an ILM: ilmx is needed to access the ILM_AUX area, ilmp is * needed to access the ILM area */ exp_init(); + + //set current target info if 
given target region was already processed + if(ompaccel_tinfo_get(gbl.currsub)) + { + ompaccel_tinfo_current_set(ompaccel_tinfo_get(gbl.currsub)); + } /* During expand, we want to generate unique proc ili each time a * proc ILM is processed. The assumption is that the scheduler will * cse a proc ili if it appears multiple times in a block. E.g., @@ -299,7 +329,13 @@ expand(void) ilmp = (ILM *)(ilmb.ilm_base + ilmx); opc = ILM_OPC(ilmp); - + /* Do not expand map statements for helper function for kmpc_parallel_51 */ + if ((opc == IM_MP_MAP || opc == IM_MP_EMAP) && process_expanded) + continue; + if (process_expanded) + { + gbl.ompoutlinedfunc = gbl.currsub; + } if (opc == IM_BR) { last_cpp_branch = ILM_OPND(ilmp, 1); } else if (opc == IM_LABEL) { @@ -318,8 +354,17 @@ expand(void) * variable operands */ if (IM_TRM(opc)) { int cur_label = BIH_LABEL(expb.curbih); - eval_ilm(ilmx); - } + if (!skip_expand){ + SPTR sptr1 = eval_ilm_check_if_skip(ilmx, &skip_expand, &process_expanded); + if (skip_expand) { + skip_expand_sptr = sptr1; + process_expanded_map[skip_expand_sptr] = 1; + ll_write_ilm_header((int)sptr1, ilmx); + restartRewritingILM(ilmx); + } + } else { + ll_rewrite_ilms(-1, ilmx, len); + }} else if (flg.smp && len) { ll_rewrite_ilms(-1, ilmx, len); } @@ -366,7 +411,6 @@ expand(void) new_callee_scope = 0; } while (opc != IM_END && opc != IM_ENDF); - if (DBGBIT(10, 2) && (bihb.stg_avail != 1)) { int bih; for (bih = 1; bih != 0; bih = BIH_NEXT(bih)) { @@ -423,6 +467,13 @@ expand(void) } else { fihb.nextfindex = fihb.currfindex = 1; } + if (skip_expand && !process_expanded) + { + process_expanded = 1; + unsetRewritingILM(); + } + skip_expand = 0; + return expb.nilms; } @@ -451,16 +502,65 @@ eval_ilm_argument1(int opr, ILM *ilmpx, int ilmx) } } /* eval_ilm_argument1 */ -void -eval_ilm(int ilmx) +static std::vector get_allocated_symbols(OMPACCEL_TINFO *orig_symbols) +{ + int num_of_symbols = orig_symbols->n_symbols; + char allocated_symbol_name[128]; + SPTR 
allocated_symbol; + std::vector init_symbols{}; + int store_instr; + int load_instr; + for (unsigned i = 0; i < num_of_symbols; ++i) { + if (PASSBYVALG(orig_symbols->symbols[i].device_sym) && + !PASSBYREFG(orig_symbols->symbols[i].device_sym)) + continue; + if (!DT_ISSCALAR(DTYPEG(orig_symbols->symbols[i].device_sym)) + && STYPEG(orig_symbols->symbols[i].host_sym) != ST_STRUCT) { + continue; + } + snprintf(allocated_symbol_name, sizeof(allocated_symbol_name), + ".allocated_symbol_%d", i); + allocated_symbol = getsymbol(allocated_symbol_name); + STYPEP(allocated_symbol, ST_VAR); + if (STYPEG(orig_symbols->symbols[i].host_sym) == ST_STRUCT) + DTYPEP(allocated_symbol,DT_CPTR); + else + DTYPEP(allocated_symbol, + get_type(2,TY_PTR,DTYPEG(orig_symbols->symbols[i].device_sym))); + SCP(allocated_symbol, SC_AUTO); + store_instr = ad4ili(IL_ST, + ad_acon(orig_symbols->symbols[i].device_sym,0), + ad_acon(allocated_symbol,0), + addnme(NT_VAR, allocated_symbol, 0,0), + MSZ_I8); + chk_block(store_instr); + load_instr = mk_ompaccel_ldsptr(allocated_symbol); + chk_block(load_instr); + + init_symbols.push_back(load_instr); + } + return init_symbols; + +} +void eval_ilm(int ilmx) +{ + eval_ilm_check_if_skip(ilmx, nullptr, nullptr); +} + +SPTR +eval_ilm_check_if_skip(int ilmx, int *skip_expand, int *process_expanded) { + SPTR sptr1 = SPTR_NULL; ILM *ilmpx; int noprs, /* number of operands in the ILM */ ilix, /* ili index */ tmp, /* temporary */ op1; /* operand 1 */ ILM_OP opcx; /**< ILM opcode of the ILM */ + static int mp_loop_nest_level; + const int mp_loop_second_nest_level = 2; + static bool omit_loop_nesting; int first_op = 0; @@ -478,13 +578,31 @@ eval_ilm(int ilmx) /* Set line no for EPARx */ gbl.lineno = ILM_OPND(ilmpx, 1); } - return; + return sptr1; } } if (EXPDBG(8, 2)) fprintf(gbl.dbgfil, "---------- eval ilm %d\n", ilmx); + if (flg.omptarget && gbl.ompaccel_intarget && !ll_ilm_is_rewriting()) { + if (opcx == IM_MPLOOP) { + if (++mp_loop_nest_level == 
mp_loop_second_nest_level) { + omit_loop_nesting = true; + } + } + else if ((opcx == IM_MPLOOPFINI) && + (mp_loop_nest_level == mp_loop_second_nest_level)) { + if (omit_loop_nesting) { + omit_loop_nesting = false; + } + } + else if (omit_loop_nesting) + { + //Do not expand ilm instructions for 2nd level of parallelism + return sptr1; + } + } if (!ll_ilm_is_rewriting()) { #ifdef OMP_OFFLOAD_LLVM @@ -510,12 +628,12 @@ eval_ilm(int ilmx) } } else if (opcx == IM_MP_EREDUCTION) { ompaccel_notify_reduction(false); - return; + return sptr1; } } if (ompaccel_is_reduction_region()) - return; + return sptr1; } #endif /*- @@ -614,7 +732,7 @@ eval_ilm(int ilmx) if (IM_I8(opcx)) ILM_RESTYPE(ilmx) = ILM_ISI8; - return; + return sptr1; } switch (IM_TYPE(opcx)) { /* special-cased ILM */ @@ -645,7 +763,10 @@ eval_ilm(int ilmx) break; case IMTY_MISC: /* miscellaneous */ - exp_misc(opcx, ilmpx, ilmx); + if (process_expanded && *process_expanded) + exp_misc(opcx, ilmpx, ilmx, true); + else + exp_misc(opcx, ilmpx, ilmx); break; case IMTY_FSTR: /* fortran string */ @@ -687,7 +808,12 @@ eval_ilm(int ilmx) /* We do not initialize spmd kernel library since we do not use spmd data * sharing model. It does extra work and allocates device on-chip memory. 
* */ - if (XBIT(232, 0x40) && gbl.ompaccel_intarget) { + if (XBIT(232, 0x40) && gbl.ompaccel_intarget && !(process_expanded && *process_expanded)) { /* process_expanded is nullptr when reached through eval_ilm(); treat that as "not yet expanded" instead of dereferencing it */ + //TODO move initialization to separate function + std::vector<int> allocated_symbols; + if (is_SPMD_mode(ompaccel_tinfo_get(gbl.currsub)->mode)) { + allocated_symbols = get_allocated_symbols(ompaccel_tinfo_get(gbl.currsub)); + } ilix = ll_make_kmpc_target_init(ompaccel_tinfo_get(gbl.currsub)->mode); /* Generate new control flow for generic kernel */ @@ -714,9 +840,24 @@ eval_ilm(int ilmx) exp_label(target_code_lab); if (is_SPMD_mode(ompaccel_tinfo_get(gbl.currsub)->mode)) { - iltb.callfg = 1; ilix = ll_make_kmpc_global_thread_num(); + iltb.callfg = 1; chk_block(ilix); + gbl.is_init_spmd_kernel = true; + sptr1 = ll_make_helper_function_for_kmpc_parallel_51((SPTR)0, ompaccel_tinfo_get(gbl.currsub)); + ilix = ll_make_kmpc_parallel_51(ilix, allocated_symbols, sptr1); + iltb.callfg = 1; + chk_block(ilix); + ilix = ll_make_kmpc_target_deinit(ompaccel_tinfo_get(gbl.currsub)->mode); + iltb.callfg = 1; + chk_block(ilix); + expb.curilt = addilt(expb.curilt, ad1ili(IL_EXIT, gbl.currsub)); + BIH_XT(expb.curbih) = 1; + BIH_LAST(expb.curbih) = 1; + wr_block(); + if (skip_expand && process_expanded && (*process_expanded == 0)){ + *skip_expand = 1; + } } iltb.callfg = 1; @@ -727,6 +868,7 @@ eval_ilm(int ilmx) #endif if (IM_I8(opcx)) ILM_RESTYPE(ilmx) = ILM_ISI8; + return sptr1; } /***************************************************************/ diff --git a/tools/flang2/flang2exe/expsmp.cpp b/tools/flang2/flang2exe/expsmp.cpp index 8cc3d1b50f..4f340845ac 100644 --- a/tools/flang2/flang2exe/expsmp.cpp +++ b/tools/flang2/flang2exe/expsmp.cpp @@ -69,8 +69,6 @@ std::list targetVector; int HasRequiresUnifiedSharedMemory = false; // AOCC End -static int incrOutlinedCnt(void); -static int decrOutlinedCnt(void); static int getOutlinedTemp(char *, int); static int isUnnamedCs(int); static int addMpUnp(void); @@ -3270,7 +3268,7 @@ no_pad_func(char *fname) NOPADP(sptr, 
1); } -static int +int decrOutlinedCnt(void) { outlinedCnt--; @@ -3281,7 +3279,7 @@ decrOutlinedCnt(void) return outlinedCnt; } -static int +int incrOutlinedCnt(void) { parCnt++; diff --git a/tools/flang2/flang2exe/expsmp.h b/tools/flang2/flang2exe/expsmp.h index 3a17885011..923f737af2 100644 --- a/tools/flang2/flang2exe/expsmp.h +++ b/tools/flang2/flang2exe/expsmp.h @@ -114,4 +114,6 @@ void section_create_endblock(SPTR endLabel); /// \brief ... LLTask* llGetTask(int scope); +int incrOutlinedCnt(void); +int decrOutlinedCnt(void); #endif // EXPSMP_H_ diff --git a/tools/flang2/flang2exe/kmpcutil.cpp b/tools/flang2/flang2exe/kmpcutil.cpp index 72374f6c4d..0e3af9cf75 100644 --- a/tools/flang2/flang2exe/kmpcutil.cpp +++ b/tools/flang2/flang2exe/kmpcutil.cpp @@ -27,6 +27,7 @@ #define _GNU_SOURCE // for vasprintf() #include +#include #undef _GNU_SOURCE #include "kmpcutil.h" #include "error.h" @@ -188,10 +189,13 @@ static class ClassKmpcApiCalls break; case KMPC_API_SPMD_KERNEL_INIT: return {"__kmpc_spmd_kernel_init", IL_NONE, DT_VOID_NONE, 0}; + // AOCC Begin case KMPC_API_TARGET_INIT: return {"__kmpc_target_init_v1", IL_NONE, DT_INT, 0}; break; - // AOCC Begin + case KMPC_API_PARALLEL_51: + return {"__kmpc_parallel_51", IL_NONE, DT_INT, 0}; + break; #ifdef OMP_OFFLOAD_AMD case KMPC_API_TARGET_DEINIT: return {"__kmpc_target_deinit_v1", IL_NONE, DT_VOID_NONE, 0}; @@ -313,9 +317,11 @@ static const struct kmpc_api_entry_t kmpc_api_calls[] = { KMPC_FLAG_STR_FMT}, [KMPC_API_SPMD_KERNEL_INIT] = {"__kmpc_spmd_kernel_init", 0, DT_VOID_NONE, 0}, + // AOCC Begin [KMPC_API_TARGET_INIT] = {"__kmpc_target_init_v1", 0, DT_INT, 0}, - // AOCC Begin + [KMPC_API_PARALLEL_51] = {"__kmpc_parallel_51", 0, DT_INT, + 0}, #ifdef OMP_OFFLOAD_AMD [KMPC_API_TARGET_DEINIT] = {"__kmpc_target_deinit_v1", 0, DT_VOID_NONE, 0}, @@ -1730,10 +1736,19 @@ ll_make_kmpc_target_init(OMP_TARGET_MODE mode) int args[4]; args[3] = gen_null_arg(); /* ident */ - if (is_SPMD_mode(mode)) { + if (mode >= 
mode_target_teams_distribute_parallel_for && + mode <= mode_target_parallel_for_simd) { args[2] = ad_icon(2); /* SPMD Mode */ args[1] = ad_icon(0); /* UseGenericStateMachine */ - args[0] = ad_icon(0); /* RequiresFullRuntime */ + if (mode == mode_target_parallel) { + /* RequiresFullRuntime - kmpc_parallel_51 requires full runtime */ + args[0] = ad_icon(1); + } + else { + /* RequiresFullRuntime - Old Fortran OpenMP API does not require + * full runtime */ + args[0] = ad_icon(0); + } } else { args[2] = ad_icon(1); /* Generic mode */ args[1] = ad_icon(1); /* UseGenericStateMachine */ @@ -1742,9 +1757,129 @@ ll_make_kmpc_target_init(OMP_TARGET_MODE mode) return mk_kmpc_api_call(KMPC_API_TARGET_INIT, 4, arg_types, args); } +int get_n_symbols(OMPACCEL_TINFO *tinfo) +{ + int orig_n_symbols = tinfo->n_symbols; + int n_symbols = orig_n_symbols; + for (int i = 0; i < orig_n_symbols; ++i) { + //skip uninitialized symbols + if (DTYPEG(tinfo->symbols[i].device_sym) == 0) { + n_symbols--; + } + } + return n_symbols; +} + +bool check_if_skip_symbol(SPTR sym) +{ + if (DTYPEG(sym) == 0) + return true; + return false; +} + +int +ll_make_kmpc_parallel_51(int global_tid_sptr, + std::vector &symbols, + SPTR helper_func, + SPTR lower, + SPTR upper) +{ + static int id; + int n_symbols = get_n_symbols(ompaccel_tinfo_get(gbl.currsub)); + DTYPE arg_types[9]; + DTYPE void_ptr_t = DT_ADDR;//create_dtype_funcprototype(); + DTYPE void_ptr_ptr_t = get_type(2, TY_PTR, void_ptr_t); + DTYPE arr_dtype; + int args[9]; + + if (lower && upper) + n_symbols += 2; + + SPTR captured_vars = make_array_sptr(const_cast("captured_vars_addrs"), + void_ptr_t, + n_symbols); + int ilix; + int nme_args = add_arrnme(NT_ARR, + captured_vars, + addnme(NT_VAR, captured_vars, 0, 0), + 0, + ad_icon(0), + FALSE); + int j = 0; + int i = 0; + /* Store lower and upper bounds for loop distribution */ + if (lower && upper) { + ilix = mk_ompaccel_ldsptr(lower); + ilix = mk_ompaccel_store(ilix, + DT_INT8, + nme_args, + 
ad_acon(captured_vars, i * TARGET_PTRSIZE)); + chk_block(ilix); + i++; + ilix = mk_ompaccel_ldsptr(upper); + ilix = mk_ompaccel_store(ilix, + DT_INT8, + nme_args, + ad_acon(captured_vars, i * TARGET_PTRSIZE)); + chk_block(ilix); + i++; + } + for (; i < n_symbols; ++i) { + if (check_if_skip_symbol(ompaccel_tinfo_get(gbl.currsub)->symbols[i].device_sym)) + continue; + else if (PASSBYVALG(ompaccel_tinfo_get(gbl.currsub)->symbols[i].device_sym) && + !PASSBYREFG(ompaccel_tinfo_get(gbl.currsub)->symbols[i].device_sym)) { + ilix = mk_ompaccel_ldsptr(ompaccel_tinfo_get(gbl.currsub)->symbols[i].device_sym); + ilix = mk_ompaccel_store(ilix, + DT_INT8, + nme_args, + ad_acon(captured_vars, i * TARGET_PTRSIZE)); + } + else if (DT_ISSCALAR(DTYPEG(ompaccel_tinfo_get(gbl.currsub)->symbols[i].device_sym)) || + STYPEG(ompaccel_tinfo_get(gbl.currsub)->symbols[i].host_sym) == ST_STRUCT) { + ilix = mk_ompaccel_store(symbols[j++], + DT_INT8, + nme_args, + ad_acon(captured_vars, i * TARGET_PTRSIZE)); + } + else { + ilix = mk_ompaccel_ldsptr(ompaccel_tinfo_get(gbl.currsub)->symbols[i].device_sym); + ilix = mk_ompaccel_store(ilix, + DT_INT8, + nme_args, + ad_acon(captured_vars, i * TARGET_PTRSIZE)); + } + chk_block(ilix); + } + + arg_types[0] = DT_CPTR; /* ident */ + arg_types[1] = DT_INT; /* global_tid */ + arg_types[2] = DT_INT; /* if_expr */ + arg_types[3] = DT_INT; /* num_threads */ + arg_types[4] = DT_INT; /* proc_bind */ + arg_types[5] = void_ptr_t; /* fn */ + arg_types[6] = void_ptr_t; /* wrapper_fn */ + arg_types[7] = void_ptr_ptr_t; /* args */ + arg_types[8] = DT_INT8; /* n_args */ + + args[8] = gen_null_arg(); /* ident */ + args[7] = global_tid_sptr; /* global_tid */ + args[6] = ad_icon(1); /* if_expr */ + args[5] = ad_icon(-1); /* num_threads */ + args[4] = ad_icon(-1); /* proc_bind */ + if (helper_func) + args[3] = ad_acon(helper_func, 0); + else + args[3] = gen_null_arg(); + args[2] = gen_null_arg(); /* wrapper_fn */ + args[1] = ad_acon(captured_vars, 0); /* args */ + args[0] 
= ad_icon(n_symbols); /* n_args */ + + return mk_kmpc_api_call(KMPC_API_PARALLEL_51, 9, arg_types, args); +} + // AOCC Begin #ifdef OMP_OFFLOAD_AMD - int ll_make_kmpc_target_deinit(OMP_TARGET_MODE mode) { @@ -1752,9 +1887,18 @@ ll_make_kmpc_target_deinit(OMP_TARGET_MODE mode) int args[3]; args[2] = gen_null_arg(); /* ident */ - if (is_SPMD_mode(mode)) { + if (mode >= mode_target_teams_distribute_parallel_for && + mode <= mode_target_parallel_for_simd) { args[1] = ad_icon(2); /* SPMD Mode */ - args[0] = ad_icon(0); /* RequiresFullRuntime */ + if (mode == mode_target_parallel) { + /* RequiresFullRuntime - kmpc_parallel_51 requires full runtime */ + args[0] = ad_icon(1); + } + else { + /* RequiresFullRuntime - Old Fortran OpenMP API does not require + * full runtime */ + args[0] = ad_icon(0); + } } else { args[1] = ad_icon(1); /* Generic mode */ args[0] = ad_icon(1); /* RequiresFullRuntime */ diff --git a/tools/flang2/flang2exe/kmpcutil.h b/tools/flang2/flang2exe/kmpcutil.h index 696aaf026b..6619b1d3ec 100644 --- a/tools/flang2/flang2exe/kmpcutil.h +++ b/tools/flang2/flang2exe/kmpcutil.h @@ -28,7 +28,9 @@ #include "symtab.h" #include "ili.h" //AOCC Begin +typedef struct _OMPACCEL_TARGET OMPACCEL_TINFO; #include "llmputil.h" +#include //AOCC End /** \file * \brief Various definitions for the kmpc runtime @@ -175,6 +177,7 @@ enum { KMPC_API_TARGET_INIT, KMPC_API_SPMD_KERNEL_INIT, // AOCC Begin + KMPC_API_PARALLEL_51, #ifdef OMP_OFFLOAD_AMD KMPC_API_TARGET_DEINIT, KMPC_API_SPMD_KERNEL_DEINIT_V2, @@ -503,6 +506,15 @@ int ll_make_kmpc_for_static_init_simple_spmd(const loop_args_t *, int); int ll_make_kmpc_target_init(OMP_TARGET_MODE); // AOCC Begin +/** + \brief Generate kmpc_parallel_51 function call +*/ +int ll_make_kmpc_parallel_51(int global_tid_sptr, + std::vector &, + SPTR, + SPTR lower = (SPTR)0, + SPTR upper = (SPTR)0); + #ifdef OMP_OFFLOAD_AMD /** \brief kernel deinit @@ -526,6 +538,18 @@ int ll_make_kmpc_nvptx_parallel_reduce_nowait_simple_spmd(int, int, int, 
SPTR, S */ int ll_make_kmpc_nvptx_end_reduce_nowait(); +/** + \brief Get the number of initialized symbols of a target info, i.e. those whose device symbol has a non-zero DTYPE. +*/ +int get_n_symbols(OMPACCEL_TINFO *tinfo); + +/** + \brief Check if given symbol should be skipped + If DTYPE of symbol is 0 then the symbol should not be passed + as an argument to kmpc_parallel_51 function +*/ +bool check_if_skip_symbol(SPTR sym); + /* End OpenMP Accelerator RT - non standard */ #endif #endif /* KMPC_RUNTIME_H_ */ diff --git a/tools/flang2/flang2exe/llassem.cpp b/tools/flang2/flang2exe/llassem.cpp index fc1b204f71..02b1e2cab9 100644 --- a/tools/flang2/flang2exe/llassem.cpp +++ b/tools/flang2/flang2exe/llassem.cpp @@ -406,8 +406,9 @@ find_ag(const char *ag_name) int hashval = name_to_hash(ag_name, strlen(ag_name)); for (gblsym = agb.hashtb[hashval]; gblsym; gblsym = AG_HASHLK(gblsym)) - if (!strcmp(ag_name, AG_NAME(gblsym))) + if (!strcmp(ag_name, AG_NAME(gblsym))){ return gblsym; + } return SPTR_NULL; } diff --git a/tools/flang2/flang2exe/ompaccel.cpp b/tools/flang2/flang2exe/ompaccel.cpp index 9d266499c3..e669e72542 100644 --- a/tools/flang2/flang2exe/ompaccel.cpp +++ b/tools/flang2/flang2exe/ompaccel.cpp @@ -66,6 +66,9 @@ // Should be in sync with clang::GPU::AMDGPUGpuGridValues in clang int warp_size_log2; int warp_size_log2_mask; +// count if we expand the second MPLOOP instruction +// inside single OpenMP pragma +int mploop_counter; // AOCC End #include "../../flang1/flang1exe/global.h" @@ -2758,7 +2761,25 @@ exp_ompaccel_mploop(ILM *ilmp, int curilm) ili = ll_make_kmpc_for_static_init(&loop_args); // AOCC end } else { - ili = ll_make_kmpc_for_static_init_simple_spmd(&loop_args, sched); + mploop_counter++; + if (mploop_counter != 2) + ili = ll_make_kmpc_for_static_init_simple_spmd(&loop_args, sched); + else { + std::vector allocated_symbols; + SPTR func_ptr = ll_make_helper_function_for_kmpc_parallel_51 + ((SPTR)0, + ompaccel_tinfo_get(gbl.currsub), + loop_args.lower, + loop_args.upper); + int ilix = 
ll_make_kmpc_global_thread_num(); + ilix = ll_make_kmpc_parallel_51(ilix, + allocated_symbols, + func_ptr, + loop_args.lower, + loop_args.upper); + iltb.callfg = 1; + chk_block(ilix); + } } break; default: @@ -3726,13 +3747,17 @@ ompaccel_set_target_declare() { } bool is_SPMD_mode(OMP_TARGET_MODE mode) { - if (mode >= mode_target_teams_distribute_parallel_for - && mode <= mode_target_parallel_for_simd) { + if (mode == mode_target_parallel) { return true; } return false; } +void reset_mploop_counter() +{ + mploop_counter = 0; +} + // AOCC End #endif /* Expander - OpenMP Accelerator Model */ diff --git a/tools/flang2/flang2exe/ompaccel.h b/tools/flang2/flang2exe/ompaccel.h index f9173b7487..b270afe6b1 100644 --- a/tools/flang2/flang2exe/ompaccel.h +++ b/tools/flang2/flang2exe/ompaccel.h @@ -590,4 +590,8 @@ void ompaccel_set_target_declare(); */ bool is_SPMD_mode(OMP_TARGET_MODE mode); // AOCC End +/** + \brief Reset counts of MPLOOP instruction + */ +void reset_mploop_counter(); #endif diff --git a/tools/flang2/flang2exe/ompaccel_x86.cpp b/tools/flang2/flang2exe/ompaccel_x86.cpp index 8bafa7e6b3..d0359fac04 100644 --- a/tools/flang2/flang2exe/ompaccel_x86.cpp +++ b/tools/flang2/flang2exe/ompaccel_x86.cpp @@ -170,7 +170,7 @@ void ompaccel_x86_fix_arg_types(SPTR func_sptr) { } OMPACCEL_TINFO *tinfo = ompaccel_tinfo_get(func_sptr); - + if (!tinfo) return; // Remember all the reduction symbols of func_sptr so that we can blacklist // them during the type update. 
std::set reduc_syms; diff --git a/tools/flang2/flang2exe/outliner.cpp b/tools/flang2/flang2exe/outliner.cpp index be22268474..e3862b2517 100644 --- a/tools/flang2/flang2exe/outliner.cpp +++ b/tools/flang2/flang2exe/outliner.cpp @@ -477,12 +477,16 @@ ll_get_shared_arg(SPTR func_sptr) } void -ll_make_ftn_outlined_params(int func_sptr, int paramct, DTYPE *argtype) +ll_make_ftn_outlined_params(int func_sptr, int paramct, DTYPE *argtype, OMPACCEL_TINFO *current_tinfo, bool has_bounds_args) { int count = 0; int sym, dtype; char name[MXIDLEN + 2]; int dpdscp = aux.dpdsc_avl; + int cnt = 0; + int number_of_prologue_args = 2; + if (has_bounds_args) + number_of_prologue_args += 2; //lower and upper bounds PARAMCTP(func_sptr, paramct); DPDSCP(func_sptr, dpdscp); @@ -491,7 +495,11 @@ ll_make_ftn_outlined_params(int func_sptr, int paramct, DTYPE *argtype) aux.dpdsc_size + paramct + 100); while (paramct--) { - sprintf(name, "%sArg%d", SYMNAME(func_sptr), count++); + if (current_tinfo && cnt >= number_of_prologue_args) + sprintf(name, "%s", + SYMNAME(ompaccel_tinfo_get(gbl.currsub)->symbols[cnt-number_of_prologue_args].device_sym)); + else + sprintf(name, "%sArg%d", SYMNAME(func_sptr), count++); sym = getsymbol(name); SCP(sym, SC_DUMMY); if (*argtype == DT_CPTR) { /* either i8* or actual type( pass by value). 
*/ @@ -500,9 +508,31 @@ ll_make_ftn_outlined_params(int func_sptr, int paramct, DTYPE *argtype) DTYPEP(sym, *argtype); PASSBYVALP(sym, 1); } + argtype++; STYPEP(sym, ST_VAR); aux.dpdsc_base[dpdscp++] = sym; + //AOC begin + if (current_tinfo) + { + NEED((current_tinfo->n_symbols + 1), current_tinfo->symbols, OMPACCEL_SYM, + current_tinfo->sz_symbols, current_tinfo->sz_symbols * 2); + if (cnt >= 2) { + if (!(PASSBYVALG(sym) && !PASSBYREFG(sym) && DTYPEG(sym) == DT_INT8)) { + PASSBYVALP(sym, false); + PASSBYREFP(sym, true); + } + current_tinfo->symbols[current_tinfo->n_symbols].host_sym = + ompaccel_tinfo_get(gbl.currsub)->symbols[cnt-2].device_sym; + current_tinfo->symbols[current_tinfo->n_symbols].device_sym = + ompaccel_tinfo_get(gbl.currsub)->symbols[cnt-2].device_sym; + } + current_tinfo->symbols[current_tinfo->n_symbols].map_type = 0; + current_tinfo->symbols[current_tinfo->n_symbols].in_map = 0; + current_tinfo->n_symbols++; + cnt++; + } + //AOCC end } } @@ -1155,6 +1185,7 @@ ll_rewrite_ilms(int lineno, int ilmx, int len) /* replace host sptr with device sptrs, PLD keeps sptr in 2nd index */ op1Pld = ILM_OPND(ilmpx, 1); + //replace host sym to device sym ILM_OPND(ilmpx, 2) = ompaccel_tinfo_current_get_devsptr(ILM_SymOPND(ilmpx, 2)); // AOCC begin @@ -2416,7 +2447,6 @@ llMakeFtnOutlinedSignatureTarget(SPTR func_sptr, OMPACCEL_TINFO *current_tinfo, for (i = 0; i < current_tinfo->n_symbols; ++i) { SPTR sptr = current_tinfo->symbols[i].host_sym; - // AOCC begin if (XBIT(232, 0x1)) { if (orig_sptr_map.find(sptr) != orig_sptr_map.end()) { @@ -2647,6 +2677,78 @@ ompaccel_copy_arraydescriptors(SPTR arg_sptr) return device_symbol; } +static bool is_complex_type(DTYPE dt) +{ + if (dt == DT_DCMPLX){ + return true; + } + else if (dt == DT_CMPLX){ + return true; + } + return false; +} + +SPTR +ll_make_helper_function_for_kmpc_parallel_51(SPTR scope_sptr, + OMPACCEL_TINFO *orig_tinfo, + SPTR lower_bound, + SPTR upper_bound) +{ + OMPACCEL_TINFO *current_tinfo; + SPTR 
func_sptr; + + int max_nargs = orig_tinfo->n_symbols + + orig_tinfo->n_quiet_symbols + + orig_tinfo->n_reduction_symbols; + int func_args_cnt = get_n_symbols(orig_tinfo); + func_args_cnt += 2; // global_tid, bound_tid + target_info args + if (lower_bound && upper_bound) + func_args_cnt += 2; // + lower_bound + upper_bound + std::vector func_args(func_args_cnt); + auto *symbols = orig_tinfo->symbols; + bool has_bounds_args = lower_bound && upper_bound; + int i = 2; + func_args[0] = get_type(2, TY_PTR, DT_INT8);// global_tid + func_args[1] = get_type(2, TY_PTR, DT_INT8);// bound_tid + if (has_bounds_args) { + func_args[2] = DT_INT8; //lower_bound + func_args[3] = DT_INT8; //upper_bound + i += 2; + } + for (; i < func_args_cnt; i++) { + if(DT_ISSCALAR( DTYPEG(symbols->device_sym)) + && !is_complex_type(DTYPEG(symbols->device_sym))) { + func_args[i] = DT_CPTR; + } + else if (STYPEG(symbols->host_sym) == ST_STRUCT) { + func_args[i] = DT_CPTR; + } + else { + func_args[i] = DTYPEG(symbols->device_sym); + } + symbols++; + } + + func_sptr = create_target_outlined_func_sptr(scope_sptr, false); + CCSYMP(func_sptr, + 1); /* currently we make all CCSYM func varargs in Fortran. 
*/ + CFUNCP(func_sptr, 1); + TASKFNP(func_sptr, FALSE); + ISTASKDUPP(func_sptr, FALSE); + OUTLINEDP(func_sptr, gbl.currsub); + FUNCLINEP(func_sptr, gbl.lineno); + STYPEP(func_sptr, ST_ENTRY); + DTYPEP(func_sptr, DT_VOID_NONE); + DEFDP(func_sptr, 1); + SCP(func_sptr, SC_STATIC); + ADDRTKNP(func_sptr, 1); + OMPACCFUNCDEVP(func_sptr, 1); + current_tinfo = ompaccel_tinfo_create(func_sptr, max_nargs); + ll_make_ftn_outlined_params(func_sptr, func_args_cnt, func_args.data(), current_tinfo, has_bounds_args); + ll_process_routine_parameters(func_sptr); + return func_sptr; +} + SPTR ll_make_outlined_ompaccel_func(SPTR stblk_sptr, SPTR scope_sptr, bool iskernel) { diff --git a/tools/flang2/flang2exe/outliner.h b/tools/flang2/flang2exe/outliner.h index 642e739e26..99129d86bc 100644 --- a/tools/flang2/flang2exe/outliner.h +++ b/tools/flang2/flang2exe/outliner.h @@ -26,6 +26,7 @@ #include "symtab.h" #include "ili.h" #include +#include "kmpcutil.h" extern FILE *par_file1; extern FILE *par_file2; @@ -246,7 +247,11 @@ void ilm_outlined_pad_ilm(int curilm); /** \brief ... */ -void ll_make_ftn_outlined_params(int func_sptr, int paramct, DTYPE *argtype); +void ll_make_ftn_outlined_params(int func_sptr, + int paramct, + DTYPE *argtype, + OMPACCEL_TINFO *current_tinfo = nullptr, + bool has_bound_args = false); /** \brief ... 
@@ -398,4 +403,9 @@ bool outlined_is_eliminated(ILM_OP opc); bool outlined_need_recompile(); void ll_set_ompaccel_currfunc(bool isILMrecompile); +SPTR +ll_make_helper_function_for_kmpc_parallel_51(SPTR scope_sptr, + OMPACCEL_TINFO *orig_tinfo, + SPTR lower_bound = SPTR(0), + SPTR upper_bound = SPTR(0)); #endif /* OUTLINER_H_ */ diff --git a/tools/flang2/flang2exe/tgtutil.h b/tools/flang2/flang2exe/tgtutil.h index cb034ed79f..d8d36ec76f 100644 --- a/tools/flang2/flang2exe/tgtutil.h +++ b/tools/flang2/flang2exe/tgtutil.h @@ -145,4 +145,9 @@ DTYPE ll_make_tgt_offload_entry(char *); void init_tgtutil(); +/** + \brief Create array sptr + */ +SPTR +make_array_sptr(char *name, DTYPE atype, int arraysize); #endif /* __TGT_RUNTIME_H__ */ diff --git a/tools/shared/utils/global.h b/tools/shared/utils/global.h index 14ef411d85..02988d5e0f 100644 --- a/tools/shared/utils/global.h +++ b/tools/shared/utils/global.h @@ -162,6 +162,7 @@ typedef struct { bool ompaccel_intarget; /* set when expander is in the openmp target construct */ bool ompaccel_isdevice; /* set when generating code for openmp target device */ SPTR teamPrivateArgs; /* keeps sptr that holds team private array */ + bool is_init_spmd_kernel; /* TRUE while generating the initialization procedure of an SPMD kernel (the function that calls __kmpc_parallel_51) */ #endif } GBL;