diff --git a/framework/include/problems/FEProblemBase.h b/framework/include/problems/FEProblemBase.h index e65a7c00819a..c5700d18802b 100644 --- a/framework/include/problems/FEProblemBase.h +++ b/framework/include/problems/FEProblemBase.h @@ -509,6 +509,21 @@ class FEProblemBase : public SubProblem, public Restartable */ virtual void setException(const std::string & message); + /** + * Handle a currently-throwing exception. This method is to be + * called from the catch (...) block following a try block, when + * trying code that may throw an exception on one rank but not + * necessarily on all ranks in parallel. + * + * This must be followed up by a call to + * checkExceptionAndStopSolve() after the catch blocks, so that + * non-exception-throwing ranks can be informed of the exception. + * Non-fatal exceptions will be recreated there as a MooseException + * to allow higher-level code to handle them; terminating exceptions + * will be thrown to allow higher-level code to unwind in sync. + */ + void handleException(const std::string & calling_method); + /** * Whether or not an exception has occurred. */ @@ -3250,6 +3265,10 @@ class FEProblemBase : public SubProblem, public Restartable /// Whether or not an exception has occurred bool _has_exception; + /// Whether or not the program is terminating with an unrecoverable + /// exception + bool _termination_exception; + /// Whether or not information about how many transfers have completed is printed bool _parallel_barrier_messaging; @@ -3302,12 +3321,6 @@ class FEProblemBase : public SubProblem, public Restartable const bool _use_hash_table_matrix_assembly; private: - /** - * Handle exceptions. Note that the result of this call will be a thrown MooseException. 
The - * caller of this method must determine how to handle the thrown exception - */ - void handleException(const std::string & calling_method); - /** * Helper for getting mortar objects corresponding to primary boundary ID, secondary boundary ID, * and displaced parameters, given some initial set diff --git a/framework/src/dirackernels/DiracKernelBase.C b/framework/src/dirackernels/DiracKernelBase.C index 39d96d8b1cf2..b5e5b6351dbc 100644 --- a/framework/src/dirackernels/DiracKernelBase.C +++ b/framework/src/dirackernels/DiracKernelBase.C @@ -10,6 +10,7 @@ // Moose includes #include "DiracKernelBase.h" #include "Assembly.h" +#include "FEProblemBase.h" #include "SystemBase.h" #include "Problem.h" #include "MooseMesh.h" @@ -147,147 +148,158 @@ DiracKernelBase::addPointWithValidId(Point p, unsigned id) unsigned int we_found_it = i_found_it; comm().max(we_found_it); - // If nobody found it in their local caches, it means we need to - // do the PointLocator look-up and update the caches. This is - // safe, because all processors have the same value of we_found_it. - if (!we_found_it) - { - const Elem * elem = _dirac_kernel_info.findPoint(p, _mesh, blockIDs()); - - // Only add the point to the cache on this processor if the Elem is local - if (elem && (elem->processor_id() == processor_id())) - { - // Add the point to the cache... - _point_cache[id] = std::make_pair(elem, p); - - // ... and to the reverse cache. - std::vector> & points = _reverse_point_cache[elem]; - points.push_back(std::make_pair(p, id)); - } - - // Call the other addPoint() method. This method ignores non-local - // and NULL elements automatically. - addPoint(elem, p, id); - return_elem = elem; - } - - // If the point was found in a cache, but not my cache, I'm not - // responsible for it. - // - // We can't return early here: then we aren't allowed to call any more - // parallel_only() functions in the remainder of this function! 
- if (we_found_it && !i_found_it) - return_elem = NULL; - // This flag may be set by the processor that cached the Elem because it // needs to call findPoint() (due to moving mesh, etc.). If so, we will // call it at the end of the while loop below. bool i_need_find_point = false; - // Now that we only cache local data, some processors may enter - // this if statement and some may not. Therefore we can't call - // any parallel_only() functions inside this if statement. - while (i_found_it) + try { - // We have something cached, now make sure it's actually the same Point. - // TODO: we should probably use this same comparison in the DiracKernelInfo code! - Point cached_point = (it->second).second; - - if (cached_point.relative_fuzzy_equals(p)) + // If nobody found it in their local caches, it means we need to + // do the PointLocator look-up and update the caches. This is + // safe, because all processors have the same value of we_found_it. + if (!we_found_it) { - // Find the cached element associated to this point - cached_elem = (it->second).first; + const Elem * elem = _dirac_kernel_info.findPoint(p, _mesh, blockIDs()); - // If the cached element's processor ID doesn't match ours, we - // are no longer responsible for caching it. This can happen - // due to adaptivity... - if (cached_elem->processor_id() != processor_id()) + // Only add the point to the cache on this processor if the Elem is local + if (elem && (elem->processor_id() == processor_id())) { - // Update the caches, telling them to drop the cached Elem. - // Analogously to the rest of the DiracKernel system, we - // also return NULL because the Elem is non-local. - updateCaches(cached_elem, NULL, p, id); - return_elem = NULL; - break; // out of while loop + // Add the point to the cache... + _point_cache[id] = std::make_pair(elem, p); + + // ... and to the reverse cache. 
+ std::vector> & points = _reverse_point_cache[elem]; + points.push_back(std::make_pair(p, id)); } - bool active = cached_elem->active(); - bool contains_point = cached_elem->contains_point(p); + // Call the other addPoint() method. This method ignores non-local + // and NULL elements automatically. + addPoint(elem, p, id); + return_elem = elem; + } - // If the cached Elem is active and the point is still - // contained in it, call the other addPoint() method and - // return its result. - if (active && contains_point) - { - addPoint(cached_elem, p, id); - return_elem = cached_elem; - break; // out of while loop - } + // If the point was found in a cache, but not my cache, I'm not + // responsible for it. + // + // We can't return early here: then we aren't allowed to call any more + // parallel_only() functions in the remainder of this function! + if (we_found_it && !i_found_it) + return_elem = NULL; + + // Now that we only cache local data, some processors may enter + // this if statement and some may not. Therefore we can't call + // any parallel_only() functions inside this if statement. + while (i_found_it) + { + // We have something cached, now make sure it's actually the same Point. + // TODO: we should probably use this same comparison in the DiracKernelInfo code! + Point cached_point = (it->second).second; - // Is the Elem not active (been refined) but still contains the point? - // Then search in its active children and update the caches. 
- else if (!active && contains_point) + if (cached_point.relative_fuzzy_equals(p)) { - // Get the list of active children - std::vector active_children; - cached_elem->active_family_tree(active_children); - - // Linear search through active children for the one that contains p - for (unsigned c = 0; c < active_children.size(); ++c) - if (active_children[c]->contains_point(p)) - { - updateCaches(cached_elem, active_children[c], p, id); - addPoint(active_children[c], p, id); - return_elem = active_children[c]; - break; // out of for loop - } - - // If we got here without setting return_elem, it means the Point was - // found in the parent element, but not in any of the active - // children... this is not possible under normal - // circumstances, so something must have gone seriously - // wrong! - if (!return_elem) - mooseError("Error, Point not found in any of the active children!"); - - break; // out of while loop - } + // Find the cached element associated to this point + cached_elem = (it->second).first; - else if ( - // Is the Elem active but the point is not contained in it any - // longer? (For example, did the Mesh move out from under - // it?) Then we fall back to the expensive Point Locator - // lookup. TODO: we could try and do something more optimized - // like checking if any of the active neighbors contains the - // point. Update the caches. - (active && !contains_point) || - - // The Elem has been refined *and* the Mesh has moved out - // from under it, we fall back to doing the expensive Point - // Locator lookup. TODO: We could try and look in the - // active children of this Elem's neighbors for the Point. - // Update the caches. - (!active && !contains_point)) - { - i_need_find_point = true; - break; // out of while loop - } + // If the cached element's processor ID doesn't match ours, we + // are no longer responsible for caching it. This can happen + // due to adaptivity... 
+ if (cached_elem->processor_id() != processor_id()) + { + // Update the caches, telling them to drop the cached Elem. + // Analogously to the rest of the DiracKernel system, we + // also return NULL because the Elem is non-local. + updateCaches(cached_elem, NULL, p, id); + return_elem = NULL; + break; // out of while loop + } + bool active = cached_elem->active(); + bool contains_point = cached_elem->contains_point(p); + + // If the cached Elem is active and the point is still + // contained in it, call the other addPoint() method and + // return its result. + if (active && contains_point) + { + addPoint(cached_elem, p, id); + return_elem = cached_elem; + break; // out of while loop + } + + // Is the Elem not active (been refined) but still contains the point? + // Then search in its active children and update the caches. + else if (!active && contains_point) + { + // Get the list of active children + std::vector active_children; + cached_elem->active_family_tree(active_children); + + // Linear search through active children for the one that contains p + for (unsigned c = 0; c < active_children.size(); ++c) + if (active_children[c]->contains_point(p)) + { + updateCaches(cached_elem, active_children[c], p, id); + addPoint(active_children[c], p, id); + return_elem = active_children[c]; + break; // out of for loop + } + + // If we got here without setting return_elem, it means the Point was + // found in the parent element, but not in any of the active + // children... this is not possible under normal + // circumstances, so something must have gone seriously + // wrong! + if (!return_elem) + mooseError("Error, Point not found in any of the active children!"); + + break; // out of while loop + } + + else if ( + // Is the Elem active but the point is not contained in it any + // longer? (For example, did the Mesh move out from under + // it?) Then we fall back to the expensive Point Locator + // lookup. 
TODO: we could try and do something more optimized + // like checking if any of the active neighbors contains the + // point. Update the caches. + (active && !contains_point) || + + // The Elem has been refined *and* the Mesh has moved out + // from under it, we fall back to doing the expensive Point + // Locator lookup. TODO: We could try and look in the + // active children of this Elem's neighbors for the Point. + // Update the caches. + (!active && !contains_point)) + { + i_need_find_point = true; + break; // out of while loop + } + + else + mooseError("We'll never get here!"); + } // if (cached_point.relative_fuzzy_equals(p)) else - mooseError("We'll never get here!"); - } // if (cached_point.relative_fuzzy_equals(p)) - else - mooseError("Cached Dirac point ", - cached_point, - " already exists with ID: ", - id, - " and does not match point ", - p, - "If Dirac sources are moving, please set 'allow_moving_sources' to true"); - - // We only want one iteration of this while loop at maximum. - i_found_it = false; - } // while (i_found_it) + mooseError("Cached Dirac point ", + cached_point, + " already exists with ID: ", + id, + " and does not match point ", + p, + "If Dirac sources are moving, please set 'allow_moving_sources' to true"); + + // We only want one iteration of this while loop at maximum. + i_found_it = false; + } // while (i_found_it) + } + catch (...) + { + _fe_problem.handleException("DiracKernelBase addPoint"); + } + + // If one processor had an error, all processors should unwind with + // that error + _fe_problem.checkExceptionAndStopSolve(); // We are back to all processors here because we do not return // early in the code above... 
diff --git a/framework/src/problems/DisplacedProblem.C b/framework/src/problems/DisplacedProblem.C index 0ac59ab70ee1..adc44f8e4916 100644 --- a/framework/src/problems/DisplacedProblem.C +++ b/framework/src/problems/DisplacedProblem.C @@ -318,19 +318,19 @@ DisplacedProblem::updateMesh(bool mesh_changing) else _geometric_search_data.update(); } - catch (MooseException & e) + catch (...) { - _mproblem.setException(e.what()); + _mproblem.handleException("updateMesh"); } - if (udmt.hasDisplacement()) - _mproblem.meshDisplaced(); - // The below call will throw an exception on all processes if any of our processes had an // exception above. This exception will be caught higher up the call stack and the error message // will be printed there _mproblem.checkExceptionAndStopSolve(/*print_message=*/false); + if (udmt.hasDisplacement()) + _mproblem.meshDisplaced(); + // Since the Mesh changed, update the PointLocator object used by DiracKernels. _dirac_kernel_info.updatePointLocator(_mesh); } diff --git a/framework/src/problems/FEProblemBase.C b/framework/src/problems/FEProblemBase.C index dbca15ebe019..9630b43e52d1 100644 --- a/framework/src/problems/FEProblemBase.C +++ b/framework/src/problems/FEProblemBase.C @@ -496,6 +496,7 @@ FEProblemBase::FEProblemBase(const InputParameters & parameters) _max_scalar_order(INVALID_ORDER), _has_time_integrator(false), _has_exception(false), + _termination_exception(false), _parallel_barrier_messaging(getParam("parallel_barrier_messaging")), _verbose_setup(getParam("verbose_setup")), _verbose_multiapps(getParam("verbose_multiapps")), @@ -6753,10 +6754,20 @@ FEProblemBase::checkExceptionAndStopSolve(bool print_message) TIME_SECTION("checkExceptionAndStopSolve", 5); + // Our ranks need to all be here in sync + parallel_object_only(); + // See if any processor had an exception. If it did, get back the // processor that the exception occurred on. 
unsigned int processor_id; + _communicator.maxloc(_termination_exception, processor_id); + + if (_termination_exception) + { + libmesh_terminate(); // Just continue terminating, but in sync + } + _communicator.maxloc(_has_exception, processor_id); if (_has_exception) @@ -7346,6 +7357,10 @@ FEProblemBase::computeResidualAndJacobian(const NumericVector & soln, // calling the system's stopSolve() method, it is now up to PETSc to return a // "diverged" reason during the next solve. } + catch (libMesh::TerminationException &) + { + throw; // We're dying; carry on. + } catch (...) { mooseError("Unexpected exception type"); @@ -7485,6 +7500,21 @@ FEProblemBase::handleException(const std::string & calling_method) // produce a non-zero exit code mooseError(create_exception_message("libMesh::PetscSolverException", e)); } + catch (const libMesh::TerminationException & e) + { + // If we're terminating, from a mooseError or from anything else + // that ought to be nearly unhandleable, then we need to keep + // terminating, not just set a different sort of exception which + // higher-level code might erroneously try to recover from by + // relaxing a timestep or something and trying again. + // + // But we might need to terminate via exception on all + // processors, because if we're doing stack unwinding on one rank + // (whether we're going to be caught or if we're in a compiler + // that does stack unwinding regardless) we should be doing it on + // all ranks to ensure that we're in sync in parallel. + _termination_exception = true; + } catch (const std::exception & e) { // This might be libMesh detecting a degenerate Jacobian or matrix @@ -7580,6 +7610,10 @@ FEProblemBase::computeResidualTags(const std::set & tags) // calling the system's stopSolve() method, it is now up to PETSc to return a // "diverged" reason during the next solve. } + catch (libMesh::TerminationException &) + { + throw; // We're dying; carry on. + } catch (...) 
{ mooseError("Unexpected exception type"); @@ -7739,6 +7773,10 @@ FEProblemBase::computeJacobianTags(const std::set & tags) // calling the system's stopSolve() method, it is now up to PETSc to return a // "diverged" reason during the next solve. } + catch (libMesh::TerminationException &) + { + throw; // We're dying; carry on. + } catch (...) { mooseError("Unexpected exception type"); @@ -7817,6 +7855,10 @@ FEProblemBase::computeBounds(NonlinearImplicitSystem & libmesh_dbg_var(sys), { mooseError("Irrecoverable exception: " + std::string(e.what())); } + catch (libMesh::TerminationException &) + { + throw; // We're dying; carry on. + } catch (...) { mooseError("Unexpected exception type"); diff --git a/framework/src/vectorpostprocessors/PointSamplerBase.C b/framework/src/vectorpostprocessors/PointSamplerBase.C index 1b2b0579294d..48877c0eeffc 100644 --- a/framework/src/vectorpostprocessors/PointSamplerBase.C +++ b/framework/src/vectorpostprocessors/PointSamplerBase.C @@ -87,17 +87,26 @@ PointSamplerBase::finalize() std::vector max_pid(_found_points.size()); _comm.maxloc(_found_points, max_pid); - for (MooseIndex(_found_points) i = 0; i < _found_points.size(); ++i) + try { - // _global_found_points should contain all 1's at this point (ie every point was found by a - // proc) - if (pid == 0 && !_global_found_points[i]) - mooseError("In ", name(), ", sample point not found: ", _points[i]); - - // only process that found the point has the value, and only process with max id should add - if (pid == max_pid[i] && _found_points[i]) - SamplerBase::addSample(_points[i], _ids[i], _point_values[i]); + for (MooseIndex(_found_points) i = 0; i < _found_points.size(); ++i) + { + // _global_found_points should contain all 1's at this point (ie every point was found by a + // proc) + if (pid == 0 && !_global_found_points[i]) + mooseError("In ", name(), ", sample point not found: ", _points[i]); + + // only process that found the point has the value, and only process with max id 
should add + if (pid == max_pid[i] && _found_points[i]) + SamplerBase::addSample(_points[i], _ids[i], _point_values[i]); + } } + catch (...) + { + _fe_problem.handleException("PointSamplerBase finalize"); + } + + _fe_problem.checkExceptionAndStopSolve(/*print_message=*/false); SamplerBase::finalize(); } diff --git a/modules/ray_tracing/src/userobjects/RayTracingStudy.C b/modules/ray_tracing/src/userobjects/RayTracingStudy.C index 855f35579125..8e84ba567063 100644 --- a/modules/ray_tracing/src/userobjects/RayTracingStudy.C +++ b/modules/ray_tracing/src/userobjects/RayTracingStudy.C @@ -1457,9 +1457,11 @@ RayTracingStudy::verifyUniqueRayIDs(const std::vector>::con // Mapping on rank 0 from ID -> processor ID std::map global_map; + std::string error_string = ""; + // Verify another processor's IDs against the global map on rank 0 - const auto check_ids = - [this, &global_map, &error_suffix](processor_id_type pid, const std::vector & ids) + const auto check_ids = [&global_map, &error_suffix, &error_string]( + processor_id_type pid, const std::vector & ids) { for (const RayID id : ids) { @@ -1467,18 +1469,18 @@ RayTracingStudy::verifyUniqueRayIDs(const std::vector>::con // Means that this ID already exists in the map if (!emplace_pair.second) - mooseError("Ray with ID ", - id, - " exists on ranks ", - emplace_pair.first->second, - " and ", - pid, - "\n", - error_suffix); + error_string += "Ray with ID " + std::to_string(id) + " exists on ranks " + + std::to_string(emplace_pair.first->second) + " and " + + std::to_string(pid) + "\n" + error_suffix; } }; Parallel::push_parallel_vector_data(_communicator, send_ids, check_ids); + + bool found_error = (error_string != ""); + _communicator.max(found_error); + if (found_error) + mooseError(error_string); } } diff --git a/modules/ray_tracing/src/userobjects/RepeatableRayStudyBase.C b/modules/ray_tracing/src/userobjects/RepeatableRayStudyBase.C index bad9bb76233d..d6959c779d7a 100644 --- 
a/modules/ray_tracing/src/userobjects/RepeatableRayStudyBase.C +++ b/modules/ray_tracing/src/userobjects/RepeatableRayStudyBase.C @@ -205,73 +205,92 @@ RepeatableRayStudyBase::verifyReplicatedRays() "private param '_define_rays_replicated' == true."; // First, verify that our _rays have unique IDs beacuse we will do mapping based on Ray ID - verifyUniqueRayIDs(_rays.begin(), - _rays.end(), - /* global = */ false, - "in _rays after calling defineRays()." + error_suffix); + try + { + verifyUniqueRayIDs(_rays.begin(), + _rays.end(), + /* global = */ false, + "in _rays after calling defineRays()." + error_suffix); + } + catch (...) + { + _fe_problem.handleException("verifyReplicatedRays"); + } + _fe_problem.checkExceptionAndStopSolve(/*print_message=*/false); // Tag for sending rays from rank 0 -> all other ranks const auto tag = comm().get_unique_tag(); - // Send a copy of the rays on rank 0 to all other processors for verification - if (_pid == 0) + try { - std::vector requests(n_processors() - 1); - auto request_it = requests.begin(); + // Send a copy of the rays on rank 0 to all other processors for verification + if (_pid == 0) + { + std::vector requests(n_processors() - 1); + auto request_it = requests.begin(); - for (processor_id_type pid = 0; pid < n_processors(); ++pid) - if (pid != 0) - comm().send_packed_range( - pid, parallelStudy(), _rays.begin(), _rays.end(), *request_it++, tag); + for (processor_id_type pid = 0; pid < n_processors(); ++pid) + if (pid != 0) + comm().send_packed_range( + pid, parallelStudy(), _rays.begin(), _rays.end(), *request_it++, tag); - Parallel::wait(requests); - } - // All other processors will receive and verify that their rays match the rays on rank 0 - else - { - // Map of RayID -> Ray for comparison from the Rays on rank 0 to the local rays - std::unordered_map ray_map; - ray_map.reserve(_rays.size()); - for (const auto & ray : _rays) - ray_map.emplace(ray->id(), ray.get()); - - // Receive the duplicated rays from rank 0 - 
std::vector> rank_0_rays; - rank_0_rays.reserve(_rays.size()); - comm().receive_packed_range( - 0, parallelStudy(), std::back_inserter(rank_0_rays), (std::shared_ptr *)nullptr, tag); - - // The sizes better match - if (rank_0_rays.size() != _rays.size()) - mooseError("The size of _rays on rank ", - _pid, - " does not match the size of rays on rank 0.", - error_suffix); - - // Make sure we have a matching local ray for each ray from rank 0 - for (const auto & ray : rank_0_rays) + Parallel::wait(requests); + } + // All other processors will receive and verify that their rays match the rays on rank 0 + else { - const auto find = ray_map.find(ray->id()); - if (find == ray_map.end()) - mooseError("A Ray was found on rank ", + // Map of RayID -> Ray for comparison from the Rays on rank 0 to the local rays + std::unordered_map ray_map; + ray_map.reserve(_rays.size()); + for (const auto & ray : _rays) + ray_map.emplace(ray->id(), ray.get()); + + // Receive the duplicated rays from rank 0 + std::vector> rank_0_rays; + rank_0_rays.reserve(_rays.size()); + comm().receive_packed_range(0, + parallelStudy(), + std::back_inserter(rank_0_rays), + (std::shared_ptr *)nullptr, + tag); + + // The sizes better match + if (rank_0_rays.size() != _rays.size()) + mooseError("The size of _rays on rank ", _pid, - " with an ID that does not exist on rank 0.", - error_suffix, - "\n\n", - ray->getInfo()); + " does not match the size of rays on rank 0.", + error_suffix); - const Ray * root_ray = find->second; - if (*root_ray != *ray) + // Make sure we have a matching local ray for each ray from rank 0 + for (const auto & ray : rank_0_rays) { - mooseError("A Ray was found on rank ", - _pid, - " that does not exist on rank 0.", - error_suffix, - "\n\nLocal ray:\n\n", - ray->getInfo(), - "\n\nRank 0 ray:\n\n", - root_ray->getInfo()); + const auto find = ray_map.find(ray->id()); + if (find == ray_map.end()) + mooseError("A Ray was found on rank ", + _pid, + " with an ID that does not exist on rank 
0.", + error_suffix, + "\n\n", + ray->getInfo()); + + const Ray * root_ray = find->second; + if (*root_ray != *ray) + { + mooseError("A Ray was found on rank ", + _pid, + " that does not exist on rank 0.", + error_suffix, + "\n\nLocal ray:\n\n", + ray->getInfo(), + "\n\nRank 0 ray:\n\n", + root_ray->getInfo()); + } } } } + catch (...) + { + _fe_problem.handleException("verifyReplicatedRays"); + } + _fe_problem.checkExceptionAndStopSolve(/*print_message=*/false); }