diff --git a/.github/workflows/good_defines.txt b/.github/workflows/good_defines.txt index 9cbd6c8ede..a084e38fcc 100644 --- a/.github/workflows/good_defines.txt +++ b/.github/workflows/good_defines.txt @@ -24,6 +24,7 @@ SCREENING SCREEN_METHOD SDC SIMPLIFIED_SDC +SINGLE_PRECISION_JACOBIAN STRANG TRUE_SDC _OPENMP diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000000..256143394a --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,19 @@ +# Repository Guidelines + +## Project Structure & Module Organization +The repository is organized around modular physics kernels in `EOS/`, `networks/`, `integration/`, and companion directories such as `conductivity/`, `neutrinos/`, `opacity/`, and `rates/`. Shared math utilities and build scripts remain in `util/`, while reusable constants sit in `constants/`. Interfaces consumed by AMReX-based application codes are in `interfaces/`. Unit regression assets reside under `unit_test/`, and Sphinx documentation sources are in `Docs/` with release history tracked in `CHANGES.md`. + +## Build, Test, and Development Commands +Export `AMREX_HOME` and `MICROPHYSICS_HOME` before building so the GNU make stubs locate AMReX sources. Build a standalone unit test in place, e.g. `cd unit_test/burn_cell && make -j4`, which yields `main3d.gnu.ex`. Select physics at compile time with flags like `make NETWORK_DIR=aprox19 EOS_DIR=helmholtz`, and reset a directory via `make clean`. + +## Coding Style & Naming Conventions +C++ sources follow `.editorconfig`: four-space indentation, LF line endings, and tabs only inside Makefiles. Headers keep the uppercase `.H` suffix and implementations use `.cpp`. Favor the existing snake_case routine pattern (`nse_check`, `burn_cell`) and guard autogenerated directories such as `util/autodiff`. Run `.clang-tidy` with `make USE_CLANG_TIDY=TRUE` on significant changes. + +## Testing Guidelines +Each directory beneath `unit_test/` ships a `GNUmakefile`, `inputs_*` controls, and README notes; run the built binary with the matching inputs file (for example `./main3d.gnu.ex inputs_aprox13`). For new coverage, clone an existing `test_*` layout, describe the scenario, and rerun the most relevant case before posting. Capture scratch artifacts via `.gitignore` rather than committing them. + +## Commit & Pull Request Guidelines +Develop on topic branches and open pull requests against `development`; merges to `main` occur monthly. Keep commit subjects concise and imperative, mirroring history such as `add NSE network compatibility docs (#1852)`. Update `CHANGES.md` for user-facing fixes, link issues, and note required AMReX revisions. PRs should summarize physics choices, list tests run, and attach plots or logs when behavior shifts. + +## Documentation & Automation +Sphinx sources in `Docs/` back the published guide; update them with code changes and ensure `make -C Docs html` finishes cleanly. GitHub Actions publishes the result and exercises unit tests, so keep workflows green by limiting optional dependencies and recording new Python requirements in `requirements.txt`. diff --git a/Docs/source/ode_integrators.rst b/Docs/source/ode_integrators.rst index 6ee5bc5804..4644a347a5 100644 --- a/Docs/source/ode_integrators.rst +++ b/Docs/source/ode_integrators.rst @@ -113,6 +113,23 @@ For the other networks (usually pynucastro networks), the implementation is provided in ``Microphysics/util/linpack.H`` and is templated on the number of equations. Pivoting can be disabled by setting ``integrator.linalg_do_pivoting=0``. +.. index:: USE_SINGLE_PRECISION_JACOBIAN + +.. tip:: + + The storage for the Jacobian can take up the most memory when + integrating the reaction system. It is possible to store the + Jacobian as single-precision, by building with: + + :: + + USE_SINGLE_PRECISION_JACOBIAN=TRUE + + This can speed up the integration prevent the code from running out + of memory when run on GPUs. + + + Integration errors ================== diff --git a/integration/BackwardEuler/be_type.H b/integration/BackwardEuler/be_type.H index 0efae612a7..04d7d8f190 100644 --- a/integration/BackwardEuler/be_type.H +++ b/integration/BackwardEuler/be_type.H @@ -48,7 +48,7 @@ struct be_t { amrex::Real rtol_enuc; amrex::Array1D y; - ArrayUtil::MathArray2D<1, int_neqs, 1, int_neqs> jac; + ArrayUtil::MathArray2D jac; short jacobian_type; }; diff --git a/integration/Make.package b/integration/Make.package index 81f5c94887..9b8ab665f6 100644 --- a/integration/Make.package +++ b/integration/Make.package @@ -26,6 +26,9 @@ else CEXE_headers += integrator_setup_strang.H endif +ifeq ($(USE_SINGLE_PRECISION_JACOBIAN), TRUE) + DEFINES += -DSINGLE_PRECISION_JACOBIAN +endif ifeq ($(USE_ALL_NSE), TRUE) ifeq ($(USE_ALL_SDC), TRUE) diff --git a/integration/VODE/vode_dvode.H b/integration/VODE/vode_dvode.H index bbbf414398..aeb69725b1 100644 --- a/integration/VODE/vode_dvode.H +++ b/integration/VODE/vode_dvode.H @@ -170,14 +170,18 @@ int dvode (BurnT& state, DvodeT& vstate) if (vstate.n_step == 0) { #ifndef AMREX_USE_GPU + if (!state.suppress_failure_output) { std::cout << amrex::Font::Bold << amrex::FGColor::Red << "DVODE: too much accuracy requested at start of integration" << amrex::ResetDisplay << std::endl; + } #endif return IERR_TOO_MUCH_ACCURACY_REQUESTED; } // Too much accuracy requested for machine precision. #ifndef AMREX_USE_GPU - std::cout << "DVODE: too much accuracy requested" << std::endl; + if (!state.suppress_failure_output) { + std::cout << "DVODE: too much accuracy requested" << std::endl; + } #endif for (int i = 1; i <= int_neqs; ++i) { vstate.y(i) = vstate.yh(i,1); @@ -197,7 +201,9 @@ int dvode (BurnT& state, DvodeT& vstate) if (kflag == -1) { // Error test failed repeatedly or with ABS(H) = HMIN. #ifndef AMREX_USE_GPU - std::cout << amrex::Font::Bold << amrex::FGColor::Red << "DVODE: error test failed repeatedly or with abs(H) = HMIN" << amrex::ResetDisplay << std::endl; + if (!state.suppress_failure_output) { + std::cout << amrex::Font::Bold << amrex::FGColor::Red << "DVODE: error test failed repeatedly or with abs(H) = HMIN" << amrex::ResetDisplay << std::endl; + } #endif // Set Y array, T, and optional output. for (int i = 1; i <= int_neqs; ++i) { @@ -211,7 +217,9 @@ int dvode (BurnT& state, DvodeT& vstate) else if (kflag == -2) { // Convergence failed repeatedly or with ABS(H) = HMIN. #ifndef AMREX_USE_GPU - std::cout << amrex::Font::Bold << amrex::FGColor::Red << "DVODE: corrector convergence failed repeatedly or with abs(H) = HMIN" << amrex::ResetDisplay << std::endl; + if (!state.suppress_failure_output) { + std::cout << amrex::Font::Bold << amrex::FGColor::Red << "DVODE: corrector convergence failed repeatedly or with abs(H) = HMIN" << amrex::ResetDisplay << std::endl; + } #endif // Set Y array, T, and optional output. for (int i = 1; i <= int_neqs; ++i) { diff --git a/integration/VODE/vode_type.H b/integration/VODE/vode_type.H index 3c246bfb50..a1df1f9bf2 100644 --- a/integration/VODE/vode_type.H +++ b/integration/VODE/vode_type.H @@ -143,11 +143,11 @@ struct dvode_t amrex::Array1D y; // Jacobian - ArrayUtil::MathArray2D<1, int_neqs, 1, int_neqs> jac; + ArrayUtil::MathArray2D jac; #ifdef ALLOW_JACOBIAN_CACHING // Saved Jacobian - ArrayUtil::MathArray2D<1, int_neqs, 1, int_neqs> jac_save; + ArrayUtil::MathArray2D jac_save; #endif // the Nordsieck history array diff --git a/integration/integrator_data.H b/integration/integrator_data.H index 2d4f7795cd..fb698318aa 100644 --- a/integration/integrator_data.H +++ b/integration/integrator_data.H @@ -52,6 +52,6 @@ constexpr int integrator_neqs () using IArray1D = amrex::Array1D; using RArray1D = amrex::Array1D; -using RArray2D = ArrayUtil::MathArray2D<1, INT_NEQS, 1, INT_NEQS>; +using RArray2D = ArrayUtil::MathArray2D; #endif diff --git a/integration/integrator_setup_strang.H b/integration/integrator_setup_strang.H index b18b6d5136..d63f246c1c 100644 --- a/integration/integrator_setup_strang.H +++ b/integration/integrator_setup_strang.H @@ -181,7 +181,7 @@ void integrator_cleanup (IntegratorT& int_state, BurnT& state, // If we failed, print out the current state of the integration. - if (! state.success) { + if (! state.success && !state.suppress_failure_output) { #ifndef AMREX_USE_GPU std::cout << amrex::Font::Bold << amrex::FGColor::Red << "[ERROR] integration failed in net" << amrex::ResetDisplay << std::endl; std::cout << "istate = " << istate << std::endl; diff --git a/integration/utils/circle_theorem.H b/integration/utils/circle_theorem.H index 957a2f3ada..85afc59d7e 100644 --- a/integration/utils/circle_theorem.H +++ b/integration/utils/circle_theorem.H @@ -17,7 +17,7 @@ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void circle_theorem_sprad(const amrex::Real time, BurnT& state, T& int_state, amrex::Real& sprad) { - ArrayUtil::MathArray2D<1, INT_NEQS, 1, INT_NEQS> jac_array; + ArrayUtil::MathArray2D jac_array; if (integrator_rp::jacobian == 1) { jac(time, state, int_state, jac_array); diff --git a/interfaces/ArrayUtilities.H b/interfaces/ArrayUtilities.H index 37f6b51963..9b000eb77e 100644 --- a/interfaces/ArrayUtilities.H +++ b/interfaces/ArrayUtilities.H @@ -34,7 +34,7 @@ namespace ArrayUtil amrex::Real arr[(XHI-XLO+1)]; }; - template + template struct MathArray2D { AMREX_GPU_HOST_DEVICE AMREX_INLINE @@ -71,7 +71,7 @@ namespace ArrayUtil } [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - amrex::Real get (const int i, const int j) const noexcept { + T get (const int i, const int j) const noexcept { AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); return arr[i+j*(XHI-XLO+1)-(YLO*(XHI-XLO+1)+XLO)]; } @@ -84,18 +84,18 @@ namespace ArrayUtil } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - const amrex::Real& operator() (int i, int j) const noexcept { + const T& operator() (int i, int j) const noexcept { AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); return arr[i+j*(XHI-XLO+1)-(YLO*(XHI-XLO+1)+XLO)]; } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - amrex::Real& operator() (int i, int j) noexcept { + T& operator() (int i, int j) noexcept { AMREX_ASSERT(i >= XLO && i <= XHI && j >= YLO && j <= YHI); return arr[i+j*(XHI-XLO+1)-(YLO*(XHI-XLO+1)+XLO)]; } - amrex::Real arr[(XHI-XLO+1)*(YHI-YLO+1)]; + T arr[(XHI-XLO+1)*(YHI-YLO+1)]; }; namespace Math diff --git a/interfaces/burn_type.H b/interfaces/burn_type.H index bf7ce27b6d..817998455e 100644 --- a/interfaces/burn_type.H +++ b/interfaces/burn_type.H @@ -56,7 +56,15 @@ const int SVAR = SEDEN+1; const int SVAR_EVOLVE = SFX; // this is the data type of the dense Jacobian that the network wants. -using JacNetArray2D = ArrayUtil::MathArray2D<1, neqs, 1, neqs>; + +// the Jacobian can have a different type that our system +#ifdef SINGLE_PRECISION_JACOBIAN +using jac_t = float; +#else +using jac_t = amrex::Real; +#endif + +using JacNetArray2D = ArrayUtil::MathArray2D; struct burn_t { @@ -154,6 +162,9 @@ struct burn_t // integrator error code short error_code{}; + // suppress verbose failure diagnostics (useful for GPU retries) + bool suppress_failure_output{}; + }; diff --git a/networks/rhs.H b/networks/rhs.H index c096cc6d99..9d417cd2f3 100644 --- a/networks/rhs.H +++ b/networks/rhs.H @@ -1357,7 +1357,7 @@ void rhs (burn_t& burn_state, amrex::Array1D& ydot) // Analytical Jacobian AMREX_GPU_HOST_DEVICE AMREX_INLINE -void jac (burn_t& burn_state, ArrayUtil::MathArray2D<1, neqs, 1, neqs>& jac) +void jac (burn_t& burn_state, ArrayUtil::MathArray2D& jac) { rhs_state_t rhs_state; @@ -1521,7 +1521,7 @@ void actual_rhs (burn_t& state, amrex::Array1D& ydot) } AMREX_GPU_HOST_DEVICE AMREX_INLINE -void actual_jac (burn_t& state, ArrayUtil::MathArray2D<1, neqs, 1, neqs>& jac) +void actual_jac (burn_t& state, ArrayUtil::MathArray2D& jac) { RHS::jac(state, jac); } diff --git a/unit_test/burn_cell_primordial_chem/GNUmakefile b/unit_test/burn_cell_primordial_chem/GNUmakefile index 669e9839c7..c00cb402e2 100644 --- a/unit_test/burn_cell_primordial_chem/GNUmakefile +++ b/unit_test/burn_cell_primordial_chem/GNUmakefile @@ -2,7 +2,7 @@ PRECISION = DOUBLE PROFILE = FALSE # Set DEBUG to TRUE if debugging -DEBUG = TRUE +DEBUG = FALSE DIM = 1 @@ -15,7 +15,7 @@ USE_CUDA = FALSE USE_REACT = TRUE # Set USE_MICROPHYSICS_DEBUG to TRUE if debugging -USE_MICROPHYSICS_DEBUG = TRUE +USE_MICROPHYSICS_DEBUG = FALSE EBASE = main diff --git a/unit_test/burn_cell_primordial_chem/burn_cell.H b/unit_test/burn_cell_primordial_chem/burn_cell.H index f5ced4cb5e..797a68002e 100644 --- a/unit_test/burn_cell_primordial_chem/burn_cell.H +++ b/unit_test/burn_cell_primordial_chem/burn_cell.H @@ -1,9 +1,21 @@ #ifndef BURN_CELL_H #define BURN_CELL_H +#include +#include +#include #include +#include +#include #include - +#include +#include +#include +#include + +#include +#include +#include #include #include #include @@ -12,10 +24,11 @@ amrex::Real grav_constant = 6.674e-8; -AMREX_INLINE -auto burn_cell_c() -> int { +namespace { - burn_t state; +AMREX_INLINE +void init_burn_state(burn_t &state, amrex::Real density_scale, + amrex::Real temp_offset, bool print_initial) { amrex::Real numdens[NumSpec] = {-1.0}; @@ -67,23 +80,28 @@ auto burn_cell_c() -> int { } } - // Echo initial conditions at burn and fill burn state input - - std::cout << "Maximum Time (s): " << unit_test_rp::tmax << std::endl; - std::cout << "State Temperature (K): " << unit_test_rp::temperature << std::endl; for (int n = 0; n < NumSpec; ++n) { - std::cout << "Number Density input (" << short_spec_names_cxx[n] - << "): " << numdens[n] << std::endl; + numdens[n] *= density_scale; } - state.T = unit_test_rp::temperature; + if (print_initial) { + std::cout << "Maximum Time (s): " << unit_test_rp::tmax << std::endl; + std::cout << "State Temperature (K): " + << unit_test_rp::temperature + temp_offset << std::endl; + for (int n = 0; n < NumSpec; ++n) { + std::cout << "Number Density input (" << short_spec_names_cxx[n] + << "): " << numdens[n] << std::endl; + } + } + + state = burn_t{}; + state.T = unit_test_rp::temperature + temp_offset; // find the density in g/cm^3 amrex::Real rhotot = 0.0_rt; for (int n = 0; n < NumSpec; ++n) { state.xn[n] = numdens[n]; - rhotot += state.xn[n] * spmasses[n]; // spmasses contains the masses of all - // species, defined in EOS + rhotot += state.xn[n] * spmasses[n]; } state.rho = rhotot; @@ -105,158 +123,489 @@ auto burn_cell_c() -> int { } #ifdef DEBUG - for (int n = 0; n < NumSpec; ++n) { - std::cout << "Mass fractions (" << short_spec_names_cxx[n] - << "): " << mfracs[n] << std::endl; - std::cout << "Number Density conserved (" << short_spec_names_cxx[n] - << "): " << state.xn[n] << std::endl; + if (print_initial) { + for (int n = 0; n < NumSpec; ++n) { + std::cout << "Mass fractions (" << short_spec_names_cxx[n] + << "): " << mfracs[n] << std::endl; + std::cout << "Number Density conserved (" << short_spec_names_cxx[n] + << "): " << state.xn[n] << std::endl; + } } #endif - // call the EOS to set initial internal energy e eos(eos_input_rt, state); +} - // name of output file - std::ofstream state_over_time("state_over_time.txt"); - - // save the initial state -- we'll use this to determine - // how much things changed over the entire burn - burn_t state_in = state; +AMREX_INLINE +void enforce_post_burn_constraints(burn_t &state) { + + amrex::Real inmfracs[NumSpec] = {-1.0}; + amrex::Real insum = 0.0_rt; + for (int nn = 0; nn < NumSpec; ++nn) { + state.xn[nn] = amrex::max(state.xn[nn], small_x); + inmfracs[nn] = spmasses[nn] * state.xn[nn] / state.rho; + insum += inmfracs[nn]; + } - // output the data in columns, one line per timestep - state_over_time << std::setw(10) << "# Time"; - state_over_time << std::setw(15) << "Density"; - state_over_time << std::setw(15) << "Temperature"; - for (int x = 0; x < NumSpec; ++x) { - const std::string &element = short_spec_names_cxx[x]; - state_over_time << std::setw(15) << element; + for (int nn = 0; nn < NumSpec; ++nn) { + inmfracs[nn] /= insum; + // update the number densities with conserved mass fractions + state.xn[nn] = inmfracs[nn] * state.rho / spmasses[nn]; } - state_over_time << std::endl; - amrex::Real t = 0.0; + // update the number density of electrons due to charge conservation + balance_charge(state); - state_over_time << std::setw(10) << t; - state_over_time << std::setw(15) << state.rho; - state_over_time << std::setw(15) << state.T; - for (int x = 0; x < NumSpec; ++x) { - state_over_time << std::setw(15) << state.xn[x]; + // reconserve mass fractions post charge conservation + insum = 0.0_rt; + for (int nn = 0; nn < NumSpec; ++nn) { + state.xn[nn] = amrex::max(state.xn[nn], small_x); + inmfracs[nn] = spmasses[nn] * state.xn[nn] / state.rho; + insum += inmfracs[nn]; } - state_over_time << std::endl; - // loop over steps, burn, and output the current state - // the loop below is similar to that used in krome and pynucastro - amrex::Real dd = rhotot; - amrex::Real dd1 = 0.0_rt; + for (int nn = 0; nn < NumSpec; ++nn) { + inmfracs[nn] /= insum; + // update the number densities with conserved mass fractions + state.xn[nn] = inmfracs[nn] * state.rho / spmasses[nn]; + } - for (int n = 0; n < unit_test_rp::nsteps; n++) { + // get the updated T + eos(eos_input_re, state); +} - dd1 = dd; +} // namespace - amrex::Real rhotmp = 0.0_rt; - for (int nn = 0; nn < NumSpec; ++nn) { - rhotmp += state.xn[nn] * spmasses[nn]; +AMREX_INLINE +auto burn_cell_multi_c(int n_zones) -> int { + + constexpr amrex::Real rel_tol = 1.0e-10_rt; + constexpr amrex::Real abs_tol = 1.0e-12_rt; + + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(n_zones > 0, + "n_zones must be positive"); + + struct MultiZoneSummary { + int failures = 0; + int retries = 0; + amrex::Real t_min = std::numeric_limits::max(); + amrex::Real t_max = -std::numeric_limits::max(); + bool any_success = false; + }; + + auto run_multi_zone = [&](bool use_gpu, std::vector &final_states, + std::vector &final_times, + MultiZoneSummary &summary) -> bool { + amrex::Gpu::HostVector states_h(n_zones); + amrex::Gpu::HostVector dd_h(n_zones); + amrex::Gpu::HostVector dt_h(n_zones); + amrex::Gpu::HostVector time_h(n_zones); + std::vector states_before(n_zones); + std::vector active_h(n_zones, 1); + + summary = MultiZoneSummary{}; + int retry_total = 0; + + for (int iz = 0; iz < n_zones; ++iz) { + amrex::Real temp_offset = + 0.5_rt * unit_test_rp::temperature * + (static_cast(iz) / + static_cast(n_zones - 1) - + 0.5_rt); + init_burn_state(states_h[iz], 1.0_rt, temp_offset, false); + dd_h[iz] = states_h[iz].rho; + dt_h[iz] = 0.0_rt; + time_h[iz] = 0.0_rt; + active_h[iz] = 1; } - // find the freefall time - amrex::Real tff = std::sqrt(M_PI * 3.0 / (32.0 * rhotmp * grav_constant)); - amrex::Real dt = unit_test_rp::tff_reduc * tff; - // scale the density - dd += dt * (dd / tff); - -#ifdef DEBUG - std::cout << "step params " << dd << ", " << tff << ", " << rhotmp - << std::endl; -#endif - - // stop the test if dt is very small - if (dt < 10) { - break; + amrex::Gpu::DeviceVector states_d; + amrex::Gpu::DeviceVector dt_d; + if (use_gpu) { + states_d.resize(n_zones); + dt_d.resize(n_zones); } - // stop the test if we have reached very high densities - if (dd > 2e-6) { - break; + const int max_split_depth = 6; + constexpr amrex::Real min_split_dt = 1.0_rt; + + auto burn_backend = [&](burn_t &state, amrex::Real dt) { + state.time = 0.0_rt; + state.suppress_failure_output = true; + + if (use_gpu) { + amrex::AsyncArray state_device(&state, 1); + burn_t *state_ptr = state_device.data(); + + amrex::ParallelFor( + 1, [state_ptr, dt] AMREX_GPU_DEVICE(int idx) noexcept { + burner(state_ptr[idx], dt); + }); + + state_device.copyToHost(&state, 1); + amrex::Gpu::streamSynchronize(); + } else { + burner(state, dt); + } + }; + + std::function integrate_with_retries; + integrate_with_retries = [&](burn_t &state, amrex::Real dt, + int depth) -> bool { + burn_t backup = state; + burn_backend(state, dt); + if (state.success && state.e > 0.0_rt) { + state.suppress_failure_output = false; + return true; + } + if (depth >= max_split_depth || 0.5_rt * dt <= min_split_dt) { + state = backup; + state.suppress_failure_output = false; + return false; + } + amrex::Real half_dt = 0.5_rt * dt; + state = backup; + if (!integrate_with_retries(state, half_dt, depth + 1)) { + state.suppress_failure_output = false; + return false; + } + bool status = integrate_with_retries(state, half_dt, depth + 1); + state.suppress_failure_output = false; + return status; + }; + + for (int step = 0; step < unit_test_rp::nsteps; ++step) { + bool any_active = false; + + for (int iz = 0; iz < n_zones; ++iz) { + if (active_h[iz] == 0) { + dt_h[iz] = 0.0_rt; + continue; + } + + amrex::Real dd1 = dd_h[iz]; + amrex::Real rhotmp = 0.0_rt; + for (int nn = 0; nn < NumSpec; ++nn) { + rhotmp += states_h[iz].xn[nn] * spmasses[nn]; + } + amrex::Real tff = + std::sqrt(M_PI * 3.0_rt / (32.0_rt * rhotmp * grav_constant)); + amrex::Real dt = unit_test_rp::tff_reduc * tff; + amrex::Real dd_new = dd1 + dt * (dd1 / tff); + + if (dt < 10.0_rt || dd_new > 2.0e-6_rt) { + active_h[iz] = 0; + dt_h[iz] = 0.0_rt; + continue; + } + + amrex::Real ratio = dd_new / dd1; + for (int nn = 0; nn < NumSpec; ++nn) { + states_h[iz].xn[nn] *= ratio; + } + states_h[iz].rho *= ratio; + + states_h[iz].suppress_failure_output = true; + states_before[iz] = states_h[iz]; + + dd_h[iz] = dd_new; + dt_h[iz] = dt; + any_active = true; + } + + if (!any_active) { + break; + } + + if (use_gpu) { + amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, states_h.begin(), + states_h.end(), states_d.begin()); + amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, dt_h.begin(), + dt_h.end(), dt_d.begin()); + amrex::Gpu::streamSynchronize(); + + burn_t *states_ptr = states_d.data(); + const amrex::Real *dt_ptr = dt_d.data(); + + amrex::ParallelFor( + n_zones, [states_ptr, dt_ptr] AMREX_GPU_DEVICE(int idx) noexcept { + amrex::Real dt = dt_ptr[idx]; + if (dt <= 0.0_rt) { + return; + } + burner(states_ptr[idx], dt); + }); + amrex::Gpu::streamSynchronize(); + + amrex::Gpu::copyAsync(amrex::Gpu::deviceToHost, states_d.begin(), + states_d.end(), states_h.begin()); + amrex::Gpu::streamSynchronize(); + } else { + burn_t *states_ptr = states_h.data(); + amrex::Real *dt_ptr = dt_h.data(); + amrex::ParallelForOMP( + n_zones, [states_ptr, dt_ptr, &burn_backend](int iz) noexcept { + amrex::Real dt = dt_ptr[iz]; + if (dt <= 0.0_rt) { + return; + } + burn_backend(states_ptr[iz], dt); + }); + } + + for (int iz = 0; iz < n_zones; ++iz) { + if (dt_h[iz] <= 0.0_rt) { + continue; + } + + auto &state = states_h[iz]; + + if (!state.success || state.e <= 0.0_rt) { + burn_t retry_state = states_before[iz]; + retry_state.suppress_failure_output = false; + ++retry_total; + if (integrate_with_retries(retry_state, dt_h[iz], 0)) { + state = retry_state; + } else { + state = states_before[iz]; + state.success = false; + active_h[iz] = 0; + dt_h[iz] = 0.0_rt; + continue; + } + } + + state.suppress_failure_output = false; + enforce_post_burn_constraints(state); + time_h[iz] += dt_h[iz]; + } } - std::cout << "step " << n << " done" << std::endl; - - // scale the number densities - for (int nn = 0; nn < NumSpec; ++nn) { - state.xn[nn] *= dd / dd1; + summary.failures = 0; + summary.t_min = std::numeric_limits::max(); + summary.t_max = -std::numeric_limits::max(); + summary.any_success = false; + summary.retries = retry_total; + + for (int iz = 0; iz < n_zones; ++iz) { + if (!states_h[iz].success || states_h[iz].e <= 0.0_rt) { + ++summary.failures; + continue; + } + summary.any_success = true; + summary.t_min = std::min(summary.t_min, states_h[iz].T); + summary.t_max = std::max(summary.t_max, states_h[iz].T); } - // input the scaled density in burn state - state.rho *= dd / dd1; - - // do the actual integration - burner(state, dt); - - // ensure positivity and normalize - amrex::Real inmfracs[NumSpec] = {-1.0}; - amrex::Real insum = 0.0_rt; - for (int nn = 0; nn < NumSpec; ++nn) { - state.xn[nn] = amrex::max(state.xn[nn], small_x); - inmfracs[nn] = spmasses[nn] * state.xn[nn] / state.rho; - insum += inmfracs[nn]; + final_states.assign(states_h.begin(), states_h.end()); + final_times.assign(time_h.begin(), time_h.end()); + + return (summary.failures == 0); + }; + + std::vector cpu_states; + std::vector cpu_times; + MultiZoneSummary cpu_summary; + auto cpu_start = std::chrono::steady_clock::now(); + bool cpu_success = run_multi_zone(false, cpu_states, cpu_times, cpu_summary); + auto cpu_end = std::chrono::steady_clock::now(); + std::chrono::duration cpu_wall = cpu_end - cpu_start; + + std::vector gpu_states; + std::vector gpu_times; + MultiZoneSummary gpu_summary; + auto gpu_start = std::chrono::steady_clock::now(); + bool gpu_success = run_multi_zone(true, gpu_states, gpu_times, gpu_summary); + auto gpu_end = std::chrono::steady_clock::now(); + std::chrono::duration gpu_wall = gpu_end - gpu_start; + + auto print_summary = [&](const std::string &label, + const MultiZoneSummary &summary) { + std::cout << label << " multi-zone burn complete; zones failed = " + << summary.failures << std::endl; + if (summary.any_success) { + std::cout << label << " multi-zone temperature range: [" + << summary.t_min << ", " << summary.t_max << "]" + << std::endl; + std::cout << label << " multi-zone retries applied: " + << summary.retries << std::endl; + } else { + std::cout << label << " multi-zone burn had no successful zones" + << std::endl; } - - for (int nn = 0; nn < NumSpec; ++nn) { - inmfracs[nn] /= insum; - // update the number densities with conserved mass fractions - state.xn[nn] = inmfracs[nn] * state.rho / spmasses[nn]; + }; + + print_summary("CPU", cpu_summary); + std::cout << "CPU multi-zone burn walltime (s): " << cpu_wall.count() + << std::endl; + print_summary("GPU", gpu_summary); + std::cout << "GPU multi-zone burn walltime (s): " << gpu_wall.count() + << std::endl; + + auto within_tol = [&](amrex::Real a, amrex::Real b) -> bool { + amrex::Real diff = std::abs(a - b); + amrex::Real scale = std::max(std::abs(a), std::abs(b)); + return diff <= (abs_tol + rel_tol * scale); + }; + + struct MismatchStats { + int count = 0; + amrex::Real max_abs_diff = 0.0_rt; + int max_zone = -1; + amrex::Real cpu_value_at_max = 0.0_rt; + amrex::Real gpu_value_at_max = 0.0_rt; + }; + + std::map mismatch_summary; + + struct NormAccumulator { + amrex::Real diff_l1 = 0.0_rt; + amrex::Real diff_l2_sq = 0.0_rt; + amrex::Real diff_linf = 0.0_rt; + amrex::Real ref_l1 = 0.0_rt; + amrex::Real ref_l2_sq = 0.0_rt; + amrex::Real ref_linf = 0.0_rt; + void record(amrex::Real diff, amrex::Real ref) { + amrex::Real abs_diff = std::abs(diff); + amrex::Real abs_ref = std::abs(ref); + diff_l1 += abs_diff; + diff_l2_sq += diff * diff; + diff_linf = std::max(diff_linf, abs_diff); + ref_l1 += abs_ref; + ref_l2_sq += ref * ref; + ref_linf = std::max(ref_linf, abs_ref); } - - // update the number density of electrons due to charge conservation - balance_charge(state); - - // reconserve mass fractions post charge conservation - insum = 0; - for (int nn = 0; nn < NumSpec; ++nn) { - state.xn[nn] = amrex::max(state.xn[nn], small_x); - inmfracs[nn] = spmasses[nn] * state.xn[nn] / state.rho; - insum += inmfracs[nn]; + }; + + std::map field_norms; + + auto record_mismatch = [&](int zone, const std::string &field, + amrex::Real cpu_value, amrex::Real gpu_value) { + auto &stats = mismatch_summary[field]; + stats.count++; + amrex::Real diff = std::abs(cpu_value - gpu_value); + if (stats.max_zone < 0 || diff > stats.max_abs_diff) { + stats.max_abs_diff = diff; + stats.max_zone = zone; + stats.cpu_value_at_max = cpu_value; + stats.gpu_value_at_max = gpu_value; } + }; - for (int nn = 0; nn < NumSpec; ++nn) { - inmfracs[nn] /= insum; - // update the number densities with conserved mass fractions - state.xn[nn] = inmfracs[nn] * state.rho / spmasses[nn]; + auto record_difference = [&](const std::string &field, amrex::Real cpu_value, + amrex::Real gpu_value, bool track_norm) { + if (!track_norm) { + return; + } + field_norms[field].record(cpu_value - gpu_value, cpu_value); + }; + + for (int iz = 0; iz < n_zones; ++iz) { + bool cpu_zone_success = + cpu_states[iz].success && cpu_states[iz].e > 0.0_rt; + bool gpu_zone_success = + gpu_states[iz].success && gpu_states[iz].e > 0.0_rt; + + if (cpu_zone_success != gpu_zone_success) { + record_mismatch(iz, "success_flag", + cpu_zone_success ? 1.0_rt : 0.0_rt, + gpu_zone_success ? 1.0_rt : 0.0_rt); + record_difference("success_flag", + cpu_zone_success ? 1.0_rt : 0.0_rt, + gpu_zone_success ? 1.0_rt : 0.0_rt, false); + continue; + } + if (!cpu_zone_success) { + continue; } - // get the updated T - eos(eos_input_re, state); + if (!within_tol(cpu_times[iz], gpu_times[iz])) { + record_mismatch(iz, "time", cpu_times[iz], gpu_times[iz]); + } + record_difference("time", cpu_times[iz], gpu_times[iz], false); - t += dt; + if (!within_tol(cpu_states[iz].rho, gpu_states[iz].rho)) { + record_mismatch(iz, "rho", cpu_states[iz].rho, gpu_states[iz].rho); + } + record_difference("rho", cpu_states[iz].rho, gpu_states[iz].rho, false); + if (!within_tol(cpu_states[iz].T, gpu_states[iz].T)) { + record_mismatch(iz, "T", cpu_states[iz].T, gpu_states[iz].T); + } + record_difference("temperature", cpu_states[iz].T, gpu_states[iz].T, true); + if (!within_tol(cpu_states[iz].e, gpu_states[iz].e)) { + record_mismatch(iz, "e", cpu_states[iz].e, gpu_states[iz].e); + } + record_difference("e", cpu_states[iz].e, gpu_states[iz].e, false); + for (int n = 0; n < NumSpec; ++n) { + if (!within_tol(cpu_states[iz].xn[n], gpu_states[iz].xn[n])) { + std::string label = + "xn(" + std::string(short_spec_names_cxx[n]) + ")"; + record_mismatch(iz, label, cpu_states[iz].xn[n], + gpu_states[iz].xn[n]); + } + std::string norm_label = + "number_density(" + std::string(short_spec_names_cxx[n]) + ")"; + record_difference(norm_label, cpu_states[iz].xn[n], + gpu_states[iz].xn[n], true); + } + } - state_over_time << std::setw(10) << t; - state_over_time << std::setw(15) << state.rho; - state_over_time << std::setw(12) << state.T; - for (int x = 0; x < NumSpec; ++x) { - state_over_time << std::setw(15) << state.xn[x]; + if (!field_norms.empty()) { + std::cout << "CPU/GPU multi-zone difference norms:" << std::endl; + auto safe_norm_ratio = [](amrex::Real num, amrex::Real denom) { + if (denom > 0.0_rt) { + return num / denom; + } + return (num == 0.0_rt) ? 0.0_rt + : std::numeric_limits::infinity(); + }; + auto print_norm = [&](const std::string &label, + const NormAccumulator &acc) { + amrex::Real diff_l2 = std::sqrt(acc.diff_l2_sq); + amrex::Real ref_l2 = std::sqrt(acc.ref_l2_sq); + std::cout << " " << label + << ": L1 = " << safe_norm_ratio(acc.diff_l1, acc.ref_l1) + << ", L2 = " << safe_norm_ratio(diff_l2, ref_l2) + << ", L_inf = " + << safe_norm_ratio(acc.diff_linf, acc.ref_linf) << std::endl; + }; + auto temp_it = field_norms.find("temperature"); + if (temp_it != field_norms.end()) { + print_norm("temperature", temp_it->second); + } + for (int n = 0; n < NumSpec; ++n) { + std::string label = + "number_density(" + std::string(short_spec_names_cxx[n]) + ")"; + auto it = field_norms.find(label); + if (it != field_norms.end()) { + print_norm(label, it->second); + } } - state_over_time << std::endl; + } else { + std::cout << "No comparable CPU/GPU multi-zone states to compute norms" + << std::endl; } - state_over_time.close(); - - // output diagnostics to the terminal - std::cout << "------------------------------------" << std::endl; - std::cout << "successful? " << state.success << std::endl; - - std::cout << "------------------------------------" << std::endl; - std::cout << "T initial = " << state_in.T << std::endl; - std::cout << "T final = " << state.T << std::endl; - std::cout << "Eint initial = " << state_in.e << std::endl; - std::cout << "Eint final = " << state.e << std::endl; - std::cout << "rho initial = " << state_in.rho << std::endl; - std::cout << "rho final = " << state.rho << std::endl; - - std::cout << "------------------------------------" << std::endl; - std::cout << "New number densities: " << std::endl; - for (int n = 0; n < NumSpec; ++n) { - std::cout << state.xn[n] << std::endl; + + if (mismatch_summary.empty()) { + std::cout << "CPU/GPU multi-zone states agree within tolerances" + << std::endl; + } else { + std::cout << "CPU/GPU mismatches detected:" << std::endl; + for (const auto &entry : mismatch_summary) { + const auto &field = entry.first; + const auto &stats = entry.second; + std::cout << " field " << field << " mismatched in " << stats.count + << " zone(s)"; + if (stats.max_zone >= 0) { + std::cout << "; largest difference at zone " << stats.max_zone + << " (CPU = " << stats.cpu_value_at_max + << ", GPU = " << stats.gpu_value_at_max << ")"; + } + std::cout << std::endl; + } } - return state.success; + return (cpu_success && gpu_success && mismatch_summary.empty()); } #endif diff --git a/unit_test/burn_cell_primordial_chem/inputs_primordial_chem b/unit_test/burn_cell_primordial_chem/inputs_primordial_chem index 27305ba634..d25657d102 100644 --- a/unit_test/burn_cell_primordial_chem/inputs_primordial_chem +++ b/unit_test/burn_cell_primordial_chem/inputs_primordial_chem @@ -26,6 +26,8 @@ unit_test.primary_species_12 = 1e-40 unit_test.primary_species_13 = 1e-40 unit_test.primary_species_14 = 0.0775e0 +unit_test.n_zones = 32**3 + # integrator runtime parameters # are we using primordial chemistry? then we use number densities integrator.use_number_densities = 1 @@ -43,7 +45,7 @@ integrator.X_reject_buffer = 1e100 # Set which jacobian to use # 1 = analytic jacobian # 2 = numerical jacobian -integrator.jacobian = 1 +integrator.jacobian = 2 # do you want to normalize abundances within VODE? (you don't!) integrator.renormalize_abundances = 0 # tolerances diff --git a/unit_test/burn_cell_primordial_chem/main.cpp b/unit_test/burn_cell_primordial_chem/main.cpp index 5e458f8936..622d0eabfa 100644 --- a/unit_test/burn_cell_primordial_chem/main.cpp +++ b/unit_test/burn_cell_primordial_chem/main.cpp @@ -23,11 +23,12 @@ int main(int argc, char *argv[]) { std::string const run_prefix = "burn_cell_primordial_chem_"; std::string input_run_prefix; pp.query("run_prefix", input_run_prefix); + int n_zones = 128; + pp.query("n_zones", n_zones); + amrex::Print() << "n_zones = " << n_zones << "\n"; AMREX_ALWAYS_ASSERT_WITH_MESSAGE(run_prefix == input_run_prefix, "input file is missing or incorrect!"); - std::cout << "starting the single zone burn..." << std::endl; - ParmParse ppa("amr"); init_unit_test(); @@ -39,7 +40,10 @@ int main(int argc, char *argv[]) { // C++ Network, RHS, screening, rates initialization network_init(); - success = burn_cell_c(); + std::cout << "\nstarting the multi-zone burn..." << std::endl; + int multi_success = burn_cell_multi_c(n_zones); + + success = multi_success; } amrex::Finalize(); diff --git a/unit_test/test_linear_algebra/test_linear_algebra.H b/unit_test/test_linear_algebra/test_linear_algebra.H index 0762163da3..caf5677f84 100644 --- a/unit_test/test_linear_algebra/test_linear_algebra.H +++ b/unit_test/test_linear_algebra/test_linear_algebra.H @@ -175,7 +175,7 @@ void linear_algebra() { // now try to output a Jacobian mask based on the actual Jacobian - ArrayUtil::MathArray2D<1, INT_NEQS, 1, INT_NEQS> jac; + ArrayUtil::MathArray2D jac; burn_t burn_state; burn_state.rho = 1.e8; diff --git a/unit_test/test_rhs/rhs_zones.H b/unit_test/test_rhs/rhs_zones.H index 87a3ec559a..54a7bfaba0 100644 --- a/unit_test/test_rhs/rhs_zones.H +++ b/unit_test/test_rhs/rhs_zones.H @@ -41,7 +41,7 @@ bool do_rhs (int i, int j, int k, amrex::Array4 const& state, const amrex::Array1D ydot; - ArrayUtil::MathArray2D<1, neqs, 1, neqs> jac; + ArrayUtil::MathArray2D jac; #ifdef NEW_NETWORK_IMPLEMENTATION RHS::rhs(burn_state, ydot);