Skip to content

Commit c305e7b

Browse files
authored
Fix std::terminate() caused by uncaught exception in ORCA (#1481)
### What does this commit do? This PR fixes a crash, `std::terminate()`, caused by an exception occurring in the catch block of `GPOPTOptimizedPlan()` during `gpopt_context.CloneErrorMsg()`. When `CloneErrorMsg` fails (typically due to OOM), it throws a `gpdb` error, causing `std::terminate()`. Postgres has graceful error handling for errors that occur while processing other errors - even during OOM, the `ErrorContext` has reserved space to preserve error messages. So rethrow the error to postgres handler. ### Test Plan Added a new test to the regression `gporca_fault` test. I added a query that errors out during optimization in orca and as the result falls into `catch` block where another exception in `CloneErrorMsg` is thrown via inject fault. Previously it resulted in the `std::terminate()` call & process crashing. Now catch the exception and let the postgres handle it with reserved buffer to preserve error message. ### Impact Instead of ```sql server closed the connection unexpectedly This probably means the server terminated abnormally before or while processing the request. ``` postgres reports an error (in case of OOM we will see it, not std::terminate with exiting processes): ```sql ERROR: ... ```
1 parent e20a488 commit c305e7b

File tree

6 files changed

+85
-5
lines changed

6 files changed

+85
-5
lines changed

src/backend/gpopt/CGPOptimizer.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,16 +63,17 @@ CGPOptimizer::GPOPTOptimizedPlan(
6363
GPOS_CATCH_EX(ex)
6464
{
6565
// clone the error message before context free.
66+
BOOL clone_failed = false;
6667
CHAR *serialized_error_msg =
67-
gpopt_context.CloneErrorMsg(MessageContext);
68+
gpopt_context.CloneErrorMsg(MessageContext, &clone_failed);
6869
// clean up context
6970
gpopt_context.Free(gpopt_context.epinQuery, gpopt_context.epinPlStmt);
7071

7172
// Special handler for a few common user-facing errors. In particular,
7273
// we want to use the correct error code for these, in case an application
7374
// tries to do something smart with them.
7475

75-
if (GPOS_MATCH_EX(ex, gpdxl::ExmaGPDB, gpdxl::ExmiGPDBError))
76+
if (clone_failed || GPOS_MATCH_EX(ex, gpdxl::ExmaGPDB, gpdxl::ExmiGPDBError))
7677
{
7778
PG_RE_THROW();
7879
}

src/backend/gpopt/utils/COptTasks.cpp

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,13 +162,34 @@ SOptContext::Free(SOptContext::EPin input, SOptContext::EPin output) const
162162
//
163163
//---------------------------------------------------------------------------
164164
CHAR *
165-
SOptContext::CloneErrorMsg(MemoryContext context) const
165+
SOptContext::CloneErrorMsg(MemoryContext context, BOOL *clone_failed) const
166166
{
167+
*clone_failed = false;
168+
167169
if (nullptr == context || nullptr == m_error_msg)
168170
{
169171
return nullptr;
170172
}
171-
return gpdb::MemCtxtStrdup(context, m_error_msg);
173+
174+
CHAR *error_msg;
175+
GPOS_TRY
176+
{
177+
#ifdef FAULT_INJECTOR
178+
if (gpdb::InjectFaultInOptTasks("opt_clone_error_msg") == FaultInjectorTypeSkip)
179+
{
180+
GpdbEreport(ERRCODE_INTERNAL_ERROR, ERROR, "Injected error", nullptr);
181+
}
182+
#endif
183+
error_msg = gpdb::MemCtxtStrdup(context, m_error_msg);
184+
}
185+
GPOS_CATCH_EX(ex)
186+
{
187+
error_msg = nullptr;
188+
*clone_failed = true;
189+
}
190+
GPOS_CATCH_END;
191+
192+
return error_msg;
172193
}
173194

174195

src/include/gpopt/utils/COptTasks.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ struct SOptContext
107107
void Free(EPin input, EPin epinOutput) const;
108108

109109
// Clone the error message in given context.
110-
CHAR *CloneErrorMsg(struct MemoryContextData *context) const;
110+
CHAR *CloneErrorMsg(struct MemoryContextData *context, BOOL *clone_failed) const;
111111

112112
// casting function
113113
static SOptContext *Cast(void *ptr);

src/test/regress/expected/gporca_faults.out

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,3 +70,28 @@ SELECT gp_inject_fault('opt_relcache_translator_catalog_access', 'reset', 1);
7070
Success:
7171
(1 row)
7272

73+
-- Test to check that GPOPTOptimizedPlan() does not cause std::terminate() by throwing an uncaught exception.
74+
CREATE TABLE test_orca_uncaught_exc(a int, b int) DISTRIBUTED RANDOMLY;
75+
-- Since ORCA cannot optimize this query, an exception is generated. We then inject a second exception when the
76+
-- first is caught and verify that it is not propagated further (no std::terminate() is called and backend is alive).
77+
SELECT gp_inject_fault('opt_clone_error_msg', 'skip', 1);
78+
gp_inject_fault
79+
-----------------
80+
Success:
81+
(1 row)
82+
83+
SELECT sum(distinct a), count(distinct b) FROM test_orca_uncaught_exc;
84+
sum | count
85+
-----+-------
86+
| 0
87+
(1 row)
88+
89+
SELECT gp_inject_fault('opt_clone_error_msg', 'reset', 1);
90+
gp_inject_fault
91+
-----------------
92+
Success:
93+
(1 row)
94+
95+
-- start_ignore
96+
DROP TABLE test_orca_uncaught_exc;
97+
-- end_ignore

src/test/regress/expected/gporca_faults_optimizer.out

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,3 +62,24 @@ SELECT gp_inject_fault('opt_relcache_translator_catalog_access', 'reset', 1);
6262
Success:
6363
(1 row)
6464

65+
-- Test to check that GPOPTOptimizedPlan() does not cause std::terminate() by throwing an uncaught exception.
66+
CREATE TABLE test_orca_uncaught_exc(a int, b int) DISTRIBUTED RANDOMLY;
67+
-- Since ORCA cannot optimize this query, an exception is generated. We then inject a second exception when the
68+
-- first is caught and verify that it is not propagated further (no std::terminate() is called and backend is alive).
69+
SELECT gp_inject_fault('opt_clone_error_msg', 'skip', 1);
70+
gp_inject_fault
71+
-----------------
72+
Success:
73+
(1 row)
74+
75+
SELECT sum(distinct a), count(distinct b) FROM test_orca_uncaught_exc;
76+
ERROR: Injected error (COptTasks.cpp:180)
77+
SELECT gp_inject_fault('opt_clone_error_msg', 'reset', 1);
78+
gp_inject_fault
79+
-----------------
80+
Success:
81+
(1 row)
82+
83+
-- start_ignore
84+
DROP TABLE test_orca_uncaught_exc;
85+
-- end_ignore

src/test/regress/sql/gporca_faults.sql

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,15 @@ SELECT * FROM func1_nosql_vol(5), foo;
3030

3131
-- The fault should *not* be hit above when optimizer = off, to reset it now.
3232
SELECT gp_inject_fault('opt_relcache_translator_catalog_access', 'reset', 1);
33+
34+
-- Test to check that GPOPTOptimizedPlan() does not cause std::terminate() by throwing an uncaught exception.
35+
CREATE TABLE test_orca_uncaught_exc(a int, b int) DISTRIBUTED RANDOMLY;
36+
-- Since ORCA cannot optimize this query, an exception is generated. We then inject a second exception when the
37+
-- first is caught and verify that it is not propagated further (no std::terminate() is called and backend is alive).
38+
SELECT gp_inject_fault('opt_clone_error_msg', 'skip', 1);
39+
SELECT sum(distinct a), count(distinct b) FROM test_orca_uncaught_exc;
40+
SELECT gp_inject_fault('opt_clone_error_msg', 'reset', 1);
41+
42+
-- start_ignore
43+
DROP TABLE test_orca_uncaught_exc;
44+
-- end_ignore

0 commit comments

Comments
 (0)