Skip to content

Commit 8719d2f

Browse files
jiaqizhomy-ship-it
authored and committed
FIX: Invalid relcache leak WARNING logged in autovacuum
The autovacuum launcher process periodically launches workers to vacuum tables. During this process, the UDF `pg_catalog.gp_acquire_sample_rows` is called. Also, the vacuum task is often canceled by the launcher.

The plan of `pg_catalog.gp_acquire_sample_rows` is:

```
                                   QUERY PLAN
---------------------------------------------------------------------------------
 Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..45.02 rows=3000 width=32)
   Output: (gp_acquire_sample_rows('17018'::oid, 1250, false))
   -> ProjectSet (cost=0.00..5.02 rows=1000 width=32)
         Output: gp_acquire_sample_rows('17018'::oid, 1250, false)
         -> Result (cost=0.00..0.01 rows=1 width=0)
 Optimizer: Postgres query optimizer
(6 rows)
```

In practice, we often encounter relcache leak warnings that appear to be caused by `pg_catalog.gp_acquire_sample_rows`. In fact, this warning is not caused by the UDF itself. The following are the complete steps to reproduce the problem (the reproduction is not stable):

1. The user runs INSERT/UPDATE/DELETE statements while autovacuum is enabled.
2. The autovacuum worker process calls `pg_catalog.gp_acquire_sample_rows`.
   2.1 The vacuum launcher on the master cancels the vacuum query.
   2.2 The vacuum worker on the master processes the interrupt in the interconnect, so the Gather Motion is aborted.
   2.3 The segment sends tuples in the motion (`doSendTuple`), but it finds that the connection is NOT alive. It has also not yet received SIGINT at this point. So the segment sets `StopRequested` to true and finishes the current motion, and the function `pg_catalog.gp_acquire_sample_rows` in the ProjectSet cannot call `table_close` at this time.
   2.4 The segment calls `PortalDrop` to destroy the resowner inside the current portal. The current portal status is not `PORTAL_FAILED`, because the segment still has not received SIGINT. The resowner finds the leaked relcache reference and logs the WARNING.
3. After step 2, the segments receive SIGINT, but there is nothing left to do.
1 parent 3ead998 commit 8719d2f

File tree

6 files changed

+55
-4
lines changed

6 files changed

+55
-4
lines changed

contrib/interconnect/udp/ic_udpifc.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3965,7 +3965,10 @@ receiveChunksUDPIFCLoop(ChunkTransportState *pTransportStates, ChunkTransportSta
39653965
/* check the potential errors in rx thread. */
39663966
checkRxThreadError();
39673967

3968-
/* do not check interrupts when holding the lock */
3968+
FaultInjector_InjectFaultIfSet("interconnect_stop_recv_chunk",
3969+
DDLNotSpecified,
3970+
"" /* databaseName */ ,
3971+
"" /* tableName */ );
39693972
ML_CHECK_FOR_INTERRUPTS(pTransportStates->teardownActive);
39703973

39713974
/*

src/backend/executor/nodeMotion.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
#include "utils/wait_event.h"
3232
#include "miscadmin.h"
3333
#include "utils/memutils.h"
34-
34+
#include "tcop/pquery.h" /* ActivePortal */
3535

3636
/* #define MEASURE_MOTION_TIME */
3737

@@ -269,6 +269,7 @@ execMotionSender(MotionState *node)
269269

270270
if (node->stopRequested)
271271
{
272+
ActivePortal->stop_requested_in_motion = true;
272273
elog(gp_workfile_caching_loglevel, "Motion calling Squelch on child node");
273274
/* propagate stop notification to our children */
274275
ExecSquelchNode(outerNode, true);

src/backend/utils/mmgr/portalmem.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@ CreatePortal(const char *name, bool allowDup, bool dupSilent)
224224
portal->atEnd = true; /* disallow fetches until query is set */
225225
portal->visible = true;
226226
portal->creation_time = GetCurrentStatementStartTimestamp();
227+
portal->stop_requested_in_motion = false;
227228

228229
if (IsResQueueEnabled())
229230
{
@@ -587,7 +588,7 @@ PortalDrop(Portal portal, bool isTopCommit)
587588
if (portal->resowner &&
588589
(!isTopCommit || portal->status == PORTAL_FAILED))
589590
{
590-
bool isCommit = (portal->status != PORTAL_FAILED);
591+
bool isCommit = (portal->status != PORTAL_FAILED) && !portal->stop_requested_in_motion;
591592

592593
ResourceOwnerRelease(portal->resowner,
593594
RESOURCE_RELEASE_BEFORE_LOCKS,

src/include/utils/portal.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,9 @@ typedef struct PortalData
225225

226226
/* MPP: is this portal a CURSOR, or protocol level portal? */
227227
bool is_extended_query; /* simple or extended query protocol? */
228+
229+
/* current motion stop requested? */
230+
bool stop_requested_in_motion;
228231
} PortalData;
229232

230233
/*

src/test/regress/expected/vacuum_gp.out

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,3 +437,31 @@ SELECT reltuples, relname FROM pg_class WHERE oid='vac_reltuple_distortion'::reg
437437
1e+06 | vac_reltuple_distortion
438438
(1 row)
439439

440+
-- test wrong log relcache leak in pg_catalog.gp_acquire_sample_rows
441+
-- start_ignore
442+
drop table if exists relcache_leak_in_motion;
443+
NOTICE: table "relcache_leak_in_motion" does not exist, skipping
444+
-- end_ignore
445+
create table relcache_leak_in_motion(v1 int);
446+
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'v1' as the Apache Cloudberry data distribution key for this table.
447+
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
448+
insert into relcache_leak_in_motion values(generate_series(0, 10000));
449+
SELECT gp_inject_fault('interconnect_stop_recv_chunk', 'interrupt', dbid)
450+
FROM gp_segment_configuration WHERE content = -1 and role='p';
451+
gp_inject_fault
452+
-----------------
453+
Success:
454+
(1 row)
455+
456+
analyze relcache_leak_in_motion;
457+
ERROR: canceling statement due to user request
458+
SELECT gp_inject_fault('interconnect_stop_recv_chunk', 'reset', dbid)
459+
FROM gp_segment_configuration WHERE content = -1 and role='p';
460+
gp_inject_fault
461+
-----------------
462+
Success:
463+
(1 row)
464+
465+
-- start_ignore
466+
drop table if exists relcache_leak_in_motion;
467+
-- end_ignore

src/test/regress/sql/vacuum_gp.sql

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -290,4 +290,19 @@ VACUUM vac_reltuple_distortion;
290290
VACUUM vac_reltuple_distortion; -- 2nd call to VACUUM after ANALYZE
291291
SELECT reltuples, relname FROM pg_class WHERE oid='vac_reltuple_distortion'::regclass;
292292
VACUUM vac_reltuple_distortion;
293-
SELECT reltuples, relname FROM pg_class WHERE oid='vac_reltuple_distortion'::regclass;
293+
SELECT reltuples, relname FROM pg_class WHERE oid='vac_reltuple_distortion'::regclass;
294+
295+
-- test wrong log relcache leak in pg_catalog.gp_acquire_sample_rows
296+
-- start_ignore
297+
drop table if exists relcache_leak_in_motion;
298+
-- end_ignore
299+
create table relcache_leak_in_motion(v1 int);
300+
insert into relcache_leak_in_motion values(generate_series(0, 10000));
301+
SELECT gp_inject_fault('interconnect_stop_recv_chunk', 'interrupt', dbid)
302+
FROM gp_segment_configuration WHERE content = -1 and role='p';
303+
analyze relcache_leak_in_motion;
304+
SELECT gp_inject_fault('interconnect_stop_recv_chunk', 'reset', dbid)
305+
FROM gp_segment_configuration WHERE content = -1 and role='p';
306+
-- start_ignore
307+
drop table if exists relcache_leak_in_motion;
308+
-- end_ignore

0 commit comments

Comments
 (0)