Skip to content

Commit 24ccd87

Browse files
zhrt123beeender
authored andcommitted
Fix bug: PLPY function causes master process reset. (#16856)
## Problem An error occurs in python lib when a plpython function is executed. After our analysis, in the user's cluster, a plpython UDF was running with the unstable network, and got a timeout error: `failed to acquire resources on one or more segments`. Then a plpython UDF was run in the same session, and the UDF failed with GC error. Here is the core dump: ``` 2023-11-24 10:15:18.945507 CST,,,p2705198,th2081832064,,,,0,,,seg-1,,,,,"LOG","00000","3rd party error log: #0 0x7f7c68b6d55b in frame_dealloc /home/cc/repo/cpython/Objects/frameobject.c:509:5 #1 0x7f7c68b5109d in gen_send_ex /home/cc/repo/cpython/Objects/genobject.c:108:9 #2 0x7f7c68af9ddd in PyIter_Next /home/cc/repo/cpython/Objects/abstract.c:3118:14 #3 0x7f7c78caa5c0 in PLy_exec_function /home/cc/repo/gpdb6/src/pl/plpython/plpy_exec.c:134:11 #4 0x7f7c78cb5ffb in plpython_call_handler /home/cc/repo/gpdb6/src/pl/plpython/plpy_main.c:387:13 #5 0x562f5e008bb5 in ExecMakeTableFunctionResult /home/cc/repo/gpdb6/src/backend/executor/execQual.c:2395:13 #6 0x562f5e0dddec in FunctionNext_guts /home/cc/repo/gpdb6/src/backend/executor/nodeFunctionscan.c:142:5 #7 0x562f5e0da094 in FunctionNext /home/cc/repo/gpdb6/src/backend/executor/nodeFunctionscan.c:350:11 apache#8 0x562f5e03d4b0 in ExecScanFetch /home/cc/repo/gpdb6/src/backend/executor/execScan.c:84:9 apache#9 0x562f5e03cd8f in ExecScan /home/cc/repo/gpdb6/src/backend/executor/execScan.c:154:10 #10 0x562f5e0da072 in ExecFunctionScan /home/cc/repo/gpdb6/src/backend/executor/nodeFunctionscan.c:380:9 apache#11 0x562f5e001a1c in ExecProcNode /home/cc/repo/gpdb6/src/backend/executor/execProcnode.c:1071:13 apache#12 0x562f5dfe6377 in ExecutePlan /home/cc/repo/gpdb6/src/backend/executor/execMain.c:3202:10 apache#13 0x562f5dfe5bf4 in standard_ExecutorRun /home/cc/repo/gpdb6/src/backend/executor/execMain.c:1171:5 apache#14 0x562f5dfe4877 in ExecutorRun /home/cc/repo/gpdb6/src/backend/executor/execMain.c:992:4 apache#15 0x562f5e857e69 in PortalRunSelect /home/cc/repo/gpdb6/src/backend/tcop/pquery.c:1164:4 apache#16 0x562f5e856d3f in PortalRun /home/cc/repo/gpdb6/src/backend/tcop/pquery.c:1005:18 apache#17 0x562f5e84607a in exec_simple_query /home/cc/repo/gpdb6/src/backend/tcop/postgres.c:1848:10 ``` ## Reproduce We can use a simple procedure to reproduce the above problem: - set timeout GUC: `gpconfig -c gp_segment_connect_timeout -v 5` and `gpstop -ari` - prepare function: ``` CREATE EXTENSION plpythonu; CREATE OR REPLACE FUNCTION test_func() RETURNS SETOF int AS $$ plpy.execute("select pg_backend_pid()") for i in range(0, 5): yield (i) $$ LANGUAGE plpythonu; ``` - exit from the current psql session. - stop the postmaster of segment: `gdb -p "the pid of segment postmaster"` - enter a psql session. - call `SELECT test_func();` and get error ``` gpadmin=# select test_func(); ERROR: function "test_func" error fetching next item from iterator (plpy_elog.c:121) DETAIL: Exception: failed to acquire resources on one or more segments CONTEXT: Traceback (most recent call last): PL/Python function "test_func" ``` - quit gdb and make postmaster runnable. - call `SELECT test_func();` again and get panic ``` gpadmin=# SELECT test_func(); server closed the connection unexpectedly This probably means the server terminated abnormally before or while processing the request. The connection to the server was lost. Attempting reset: Failed. !> ``` ## Analysis - There is an SPI call in test_func(): `plpy.execute()`. - Then coordinator will start a subtransaction by PLy_spi_subtransaction_begin(); - Meanwhile, if the segment cannot receive the instruction from the coordinator, the subtransaction beginning procedure return fails. - BUT! The Python processor does not know whether an error happened and does not clean its environment. - Then the next plpython UDF in the same session will fail due to the wrong Python environment. ## Solution - Use try-catch to catch the exception caused by PLy_spi_subtransaction_begin() - set the python error indicator by PLy_spi_exception_set() Co-authored-by: Chen Mulong <[email protected]>
1 parent 8e8efda commit 24ccd87

File tree

6 files changed

+125
-13
lines changed

6 files changed

+125
-13
lines changed

src/backend/access/transam/xact.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5560,6 +5560,7 @@ void
55605560
BeginInternalSubTransaction(const char *name)
55615561
{
55625562
TransactionState s = CurrentTransactionState;
5563+
SIMPLE_FAULT_INJECTOR("begin_internal_sub_transaction");
55635564

55645565
if (Gp_role == GP_ROLE_DISPATCH)
55655566
{

src/pl/plpython/expected/plpython_subtransaction.out

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -399,3 +399,48 @@ CONTEXT: Traceback (most recent call last):
399399
PL/Python function "cursor_close_aborted_subxact", line 7, in <module>
400400
cur.close()
401401
PL/Python function "cursor_close_aborted_subxact"
402+
-- error report test in subtransaction begin
403+
-- prepare function
404+
CREATE OR REPLACE FUNCTION test_func() RETURNS SETOF int AS
405+
$$
406+
plpy.execute("select pg_backend_pid()")
407+
408+
for i in range(0, 5):
409+
yield (i)
410+
411+
$$ LANGUAGE plpython3u;
412+
-- inject fault and wait for trigger
413+
select gp_inject_fault_infinite('begin_internal_sub_transaction', 'error', 1);
414+
gp_inject_fault_infinite
415+
--------------------------
416+
Success:
417+
(1 row)
418+
419+
SELECT test_func();
420+
ERROR: function "test_func" error fetching next item from iterator
421+
DETAIL: spiexceptions.FaultInject: fault triggered, fault name:'begin_internal_sub_transaction' fault type:'error'
422+
CONTEXT: Traceback (most recent call last):
423+
PL/Python function "test_func"
424+
select gp_wait_until_triggered_fault('begin_internal_sub_transaction', 1, 1);
425+
gp_wait_until_triggered_fault
426+
-------------------------------
427+
Success:
428+
(1 row)
429+
430+
select gp_inject_fault('begin_internal_sub_transaction', 'reset', 1);
431+
gp_inject_fault
432+
-----------------
433+
Success:
434+
(1 row)
435+
436+
SELECT test_func();
437+
test_func
438+
-----------
439+
0
440+
1
441+
2
442+
3
443+
4
444+
(5 rows)
445+
446+
DROP FUNCTION test_func();

src/pl/plpython/plpy_cursorobject.c

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,8 @@ PLy_cursor_query(const char *query)
9898
oldcontext = CurrentMemoryContext;
9999
oldowner = CurrentResourceOwner;
100100

101-
PLy_spi_subtransaction_begin(oldcontext, oldowner);
101+
if(!PLy_spi_subtransaction_begin(oldcontext, oldowner))
102+
return NULL;
102103

103104
PG_TRY();
104105
{
@@ -196,7 +197,8 @@ PLy_cursor_plan(PyObject *ob, PyObject *args)
196197
oldcontext = CurrentMemoryContext;
197198
oldowner = CurrentResourceOwner;
198199

199-
PLy_spi_subtransaction_begin(oldcontext, oldowner);
200+
if(!PLy_spi_subtransaction_begin(oldcontext, oldowner))
201+
return NULL;
200202

201203
PG_TRY();
202204
{
@@ -333,7 +335,8 @@ PLy_cursor_iternext(PyObject *self)
333335
oldcontext = CurrentMemoryContext;
334336
oldowner = CurrentResourceOwner;
335337

336-
PLy_spi_subtransaction_begin(oldcontext, oldowner);
338+
if(!PLy_spi_subtransaction_begin(oldcontext, oldowner))
339+
return NULL;
337340

338341
PG_TRY();
339342
{
@@ -403,7 +406,8 @@ PLy_cursor_fetch(PyObject *self, PyObject *args)
403406
oldcontext = CurrentMemoryContext;
404407
oldowner = CurrentResourceOwner;
405408

406-
PLy_spi_subtransaction_begin(oldcontext, oldowner);
409+
if(!PLy_spi_subtransaction_begin(oldcontext, oldowner))
410+
return NULL;
407411

408412
PG_TRY();
409413
{

src/pl/plpython/plpy_spi.c

Lines changed: 49 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,8 @@ PLy_spi_prepare(PyObject *self, PyObject *args)
8484
oldcontext = CurrentMemoryContext;
8585
oldowner = CurrentResourceOwner;
8686

87-
PLy_spi_subtransaction_begin(oldcontext, oldowner);
87+
if(!PLy_spi_subtransaction_begin(oldcontext, oldowner))
88+
return NULL;
8889

8990
PG_TRY();
9091
{
@@ -238,7 +239,8 @@ PLy_spi_execute_plan(PyObject *ob, PyObject *list, int64 limit)
238239
oldcontext = CurrentMemoryContext;
239240
oldowner = CurrentResourceOwner;
240241

241-
PLy_spi_subtransaction_begin(oldcontext, oldowner);
242+
if(!PLy_spi_subtransaction_begin(oldcontext, oldowner))
243+
return NULL;
242244

243245
PG_TRY();
244246
{
@@ -334,7 +336,8 @@ PLy_spi_execute_query(char *query, int64 limit)
334336
oldcontext = CurrentMemoryContext;
335337
oldowner = CurrentResourceOwner;
336338

337-
PLy_spi_subtransaction_begin(oldcontext, oldowner);
339+
if(!PLy_spi_subtransaction_begin(oldcontext, oldowner))
340+
return NULL;
338341

339342
PG_TRY();
340343
{
@@ -595,7 +598,9 @@ PLy_rollback(PyObject *self, PyObject *args)
595598
* MemoryContext oldcontext = CurrentMemoryContext;
596599
* ResourceOwner oldowner = CurrentResourceOwner;
597600
*
598-
* PLy_spi_subtransaction_begin(oldcontext, oldowner);
601+
* if(!PLy_spi_subtransaction_begin(oldcontext, oldowner))
602+
* return NULL;
603+
*
599604
* PG_TRY();
600605
* {
601606
* <call SPI functions>
@@ -612,12 +617,48 @@ PLy_rollback(PyObject *self, PyObject *args)
612617
* These utilities take care of restoring connection to the SPI manager and
613618
* setting a Python exception in case of an abort.
614619
*/
615-
void
620+
bool
616621
PLy_spi_subtransaction_begin(MemoryContext oldcontext, ResourceOwner oldowner)
617622
{
618-
BeginInternalSubTransaction(NULL);
619-
/* Want to run inside function's memory context */
620-
MemoryContextSwitchTo(oldcontext);
623+
PG_TRY();
624+
{
625+
/* Start subtransaction (could fail) */
626+
BeginInternalSubTransaction(NULL);
627+
/* Want to run inside function's memory context */
628+
MemoryContextSwitchTo(oldcontext);
629+
}
630+
PG_CATCH();
631+
{
632+
ErrorData *edata;
633+
PLyExceptionEntry *entry;
634+
PyObject *exc;
635+
636+
/* Ensure we restore original context and owner */
637+
MemoryContextSwitchTo(oldcontext);
638+
CurrentResourceOwner = oldowner;
639+
640+
/* Save error info */
641+
edata = CopyErrorData();
642+
FlushErrorState();
643+
644+
/* Look up the correct exception */
645+
entry = hash_search(PLy_spi_exceptions, &(edata->sqlerrcode),
646+
HASH_FIND, NULL);
647+
648+
/*
649+
* This could be a custom error code, if that's the case fallback to
650+
* SPIError
651+
*/
652+
exc = entry ? entry->exc : PLy_exc_spi_error;
653+
/* Make Python raise the exception */
654+
PLy_spi_exception_set(exc, edata);
655+
FreeErrorData(edata);
656+
657+
return false;
658+
}
659+
PG_END_TRY();
660+
661+
return true;
621662
}
622663

623664
void

src/pl/plpython/plpy_spi.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ typedef struct PLyExceptionEntry
2222
} PLyExceptionEntry;
2323

2424
/* handling of SPI operations inside subtransactions */
25-
extern void PLy_spi_subtransaction_begin(MemoryContext oldcontext, ResourceOwner oldowner);
25+
extern bool PLy_spi_subtransaction_begin(MemoryContext oldcontext, ResourceOwner oldowner);
2626
extern void PLy_spi_subtransaction_commit(MemoryContext oldcontext, ResourceOwner oldowner);
2727
extern void PLy_spi_subtransaction_abort(MemoryContext oldcontext, ResourceOwner oldowner);
2828

src/pl/plpython/sql/plpython_subtransaction.sql

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,3 +260,24 @@ SELECT cursor_in_subxact();
260260
SELECT cursor_aborted_subxact();
261261
SELECT cursor_plan_aborted_subxact();
262262
SELECT cursor_close_aborted_subxact();
263+
264+
-- error report test in subtransaction begin
265+
-- prepare function
266+
CREATE OR REPLACE FUNCTION test_func() RETURNS SETOF int AS
267+
$$
268+
plpy.execute("select pg_backend_pid()")
269+
270+
for i in range(0, 5):
271+
yield (i)
272+
273+
$$ LANGUAGE plpythonu;
274+
275+
-- inject fault and wait for trigger
276+
select gp_inject_fault_infinite('begin_internal_sub_transaction', 'error', 1);
277+
SELECT test_func();
278+
select gp_wait_until_triggered_fault('begin_internal_sub_transaction', 1, 1);
279+
select gp_inject_fault('begin_internal_sub_transaction', 'reset', 1);
280+
281+
SELECT test_func();
282+
283+
DROP FUNCTION test_func();

0 commit comments

Comments
 (0)