Skip to content

Commit f966877

Browse files
craig[bot]rafissangles-n-daemons
committed
150991: pgwire: limit concurrent updates to last_login_time r=rafiss a=rafiss ### pgwire: limit concurrent updates to last_login_time This patch uses singleflight to ensure that each node only has one request in flight for updating the last_login_time. If there already is a request in flight, then that user is added to a set of pending users to update, and their time gets populated in the next singleflight. This will reduce write traffic to the system.users table. This includes a minor refactor to separate the login_time updating logic into a separate component. As part of this, we also now skip updating the last_login_time if running in a read-only tenant. ### sql/pgwire: use a single query to update last_login_time Rather than using a separate query for each user, now we will run one query to update multiple users at once. ### sql/pgwire: avoid frequent updates to last_login_time This patch adds a small LRU cache to avoid updating the estimated_last_login_time overly frequently for a single user. The cache will cause us to skip the update if the user's login time was last updated within the past 10 minutes. This requires us to update the tests to use a different user, since the existing test drops `testuser` and recreates, which means that the last_login_time will not be updated for that user as it's still in the LRU cache. ---- fixes #151051 fixes #150560 Release note: None 151041: security: fix memory accounting issue in client certificate cache r=angles-n-daemons a=angles-n-daemons The client certificate cache is a simple utility cache which keeps track of the nearest certificate expiration times by user. It does this by reading in certificates as they are used, and then updating the gauge associated with the user so that our operators can alert on expiring certificates. It has been identified that there is a memory accounting issue with this utility class, in that if we see the same certificate multiple times, we report that our cache is growing for each one. In reality however, though we may change the reference in our cache, the old value will be cleaned up, and memory should remain relatively low. This PR both fixes the accounting bug, and adds a limit so that the client certificate cache can't negatively affect SQL operation. Epic: none Fixes: #151040 Release note: Fixes a memory accounting issue with the client certificate cache, where we were accidentally reporting multiple allocations for the same memory. Co-authored-by: Rafi Shamim <[email protected]> Co-authored-by: Brian Dillmann <[email protected]>
3 parents 0af528e + 175db1c + 3cf4692 commit f966877

File tree

11 files changed

+248
-46
lines changed

11 files changed

+248
-46
lines changed

pkg/security/certificate_manager.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ import (
1313
"github.com/cockroachdb/cockroach/pkg/security/certnames"
1414
"github.com/cockroachdb/cockroach/pkg/security/clientcert"
1515
"github.com/cockroachdb/cockroach/pkg/security/username"
16+
"github.com/cockroachdb/cockroach/pkg/settings"
17+
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
1618
"github.com/cockroachdb/cockroach/pkg/util/log"
1719
"github.com/cockroachdb/cockroach/pkg/util/log/eventpb"
1820
"github.com/cockroachdb/cockroach/pkg/util/log/severity"
@@ -196,15 +198,24 @@ func (cm *CertificateManager) RegisterSignalHandler(
196198
})
197199
}
198200

201+
var CertCacheMemLimit = settings.RegisterIntSetting(
202+
settings.SystemVisible,
203+
"security.clienc_cert.cache_memory_limit",
204+
"memory limit for the client certificate expiration cache",
205+
1<<29, // 512MiB
206+
)
207+
199208
// RegisterExpirationCache registers a cache for client certificate expiration.
200209
// It is called during server startup.
201210
func (cm *CertificateManager) RegisterExpirationCache(
202211
ctx context.Context,
203212
stopper *stop.Stopper,
204213
timeSrc timeutil.TimeSource,
205214
parentMon *mon.BytesMonitor,
215+
st *cluster.Settings,
206216
) error {
207-
m := mon.NewMonitorInheritWithLimit(mon.MakeName("client-expiration-caches"), 0 /* limit */, parentMon, true /* longLiving */)
217+
limit := CertCacheMemLimit.Get(&st.SV)
218+
m := mon.NewMonitorInheritWithLimit(mon.MakeName("client-expiration-caches"), limit, parentMon, true /* longLiving */)
208219
acc := m.MakeConcurrentBoundAccount()
209220
m.StartNoReserved(ctx, parentMon)
210221

pkg/security/clientcert/cert_expiry_cache.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -105,10 +105,13 @@ func (c *Cache) Upsert(ctx context.Context, user string, serial string, newExpir
105105
c.cache[user] = map[string]certInfo{}
106106
}
107107

108-
err := c.account.Grow(ctx, CertInfoSize)
109-
if err != nil {
110-
log.Warningf(ctx, "no memory available to cache cert expiry: %v", err)
111-
c.evictLocked(ctx, user, serial)
108+
// if the serial hasn't been seen, report it in the memory accounting.
109+
if _, ok := c.cache[user][serial]; !ok {
110+
err := c.account.Grow(ctx, CertInfoSize)
111+
if err != nil {
112+
log.Warningf(ctx, "no memory available to cache cert expiry: %v", err)
113+
return
114+
}
112115
}
113116

114117
// insert / update the certificate expiration time.

pkg/security/clientcert/cert_expiry_cache_test.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,7 @@ func TestAllocationTracking(t *testing.T) {
313313
cache.Upsert(ctx, "user1", "serial2", 120)
314314
cache.Upsert(ctx, "user2", "serial3", 65)
315315
clock.Advance(time.Minute)
316+
require.Equal(t, (3*certInfoSize)+(4*gaugeSize), account.Used())
316317
cache.Purge(ctx)
317318
cache.Upsert(ctx, "user2", "serial4", 75)
318319
require.Equal(t, certInfoSize+(2*gaugeSize), account.Used())
@@ -324,9 +325,21 @@ func TestAllocationTracking(t *testing.T) {
324325
cache.Upsert(ctx, "user1", "serial2", 120)
325326
cache.Upsert(ctx, "user2", "serial3", 65)
326327
cache.Upsert(ctx, "user2", "serial4", 75)
328+
require.Equal(t, (4*certInfoSize)+(4*gaugeSize), account.Used())
327329
cache.Clear()
328330
require.Equal(t, int64(0), account.Used())
329331
})
332+
333+
t.Run("overwriting an existing certificate does not change the reported memory allocation", func(t *testing.T) {
334+
cache, account := newCacheAndAccount(ctx, clock, stopper)
335+
require.Equal(t, int64(0), account.Used())
336+
cache.Upsert(ctx, "user1", "serial1", 100)
337+
require.Equal(t, certInfoSize+(2*gaugeSize), account.Used())
338+
cache.Upsert(ctx, "user1", "serial1", 100)
339+
require.Equal(t, certInfoSize+(2*gaugeSize), account.Used())
340+
cache.Upsert(ctx, "user1", "serial1", 100)
341+
require.Equal(t, certInfoSize+(2*gaugeSize), account.Used())
342+
})
330343
}
331344

332345
func TestExpiration(t *testing.T) {

pkg/server/server_sql.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -947,7 +947,7 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) {
947947
return nil, errors.Wrap(err, "initializing certificate manager")
948948
}
949949
err = certMgr.RegisterExpirationCache(
950-
ctx, cfg.stopper, &timeutil.DefaultTimeSource{}, rootSQLMemoryMonitor,
950+
ctx, cfg.stopper, &timeutil.DefaultTimeSource{}, rootSQLMemoryMonitor, cfg.Settings,
951951
)
952952
if err != nil {
953953
return nil, errors.Wrap(err, "adding clientcert cache")

pkg/sql/logictest/testdata/logic_test/user

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -240,9 +240,9 @@ onlyif config local-mixed-25.2
240240
statement ok
241241
DROP user testuser
242242

243-
onlyif config local-mixed-25.2
244243
user testuser nodeidx=0 newsession
245244

245+
onlyif config local-mixed-25.2
246246
statement error pq: password authentication failed for user testuser
247247
SHOW session_user
248248

@@ -259,7 +259,6 @@ skipif config local-mixed-25.2
259259
statement ok
260260
DROP user testuser
261261

262-
skipif config local-mixed-25.2
263262
user testuser nodeidx=0 newsession
264263

265264
# The logictest suite currently doesn't work for external authentication methods
@@ -271,38 +270,38 @@ skipif config local-mixed-25.2
271270
statement error pq: user identity unknown for identity provider
272271
SHOW session_user
273272

274-
subtest end
275-
276-
subtest validate_estimated_last_login_time
277-
278273
user root
279274

280275
statement ok
281276
CREATE user IF NOT EXISTS testuser
282277

283-
user testuser
278+
subtest end
279+
280+
subtest validate_estimated_last_login_time
281+
282+
user testuser2
284283

285284
# Since the logictest framework does not connect with the user until a command
286285
# is executed, we need to run a command before checking estimated_last_login_time.
287286
query T
288287
SHOW session_user
289288
----
290-
testuser
289+
testuser2
291290

292291
user root
293292

294293
# The estimated_last_login_time is not guaranteed to be populated synchronously,
295294
# so we poll until testuser's entry was updated with the last login time.
296295
skipif config local-mixed-25.2
297296
query I retry
298-
SELECT count(*) FROM system.users WHERE estimated_last_login_time IS NOT NULL AND username = 'testuser'
297+
SELECT count(*) FROM system.users WHERE estimated_last_login_time IS NOT NULL AND username = 'testuser2'
299298
----
300299
1
301300

302301
user root
303302

304303
onlyif config local-mixed-25.2
305304
statement error pq: column "estimated_last_login_time" does not exist
306-
select estimated_last_login_time from [SHOW USERS] where username = 'testuser';
305+
select estimated_last_login_time from [SHOW USERS] where username = 'testuser2';
307306

308307
subtest end

pkg/sql/pgwire/BUILD.bazel

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ go_library(
1212
"conn.go",
1313
"hba_conf.go",
1414
"ident_map_conf.go",
15+
"last_login_updater.go",
1516
"pre_serve.go",
1617
"pre_serve_options.go",
1718
"role_mapper.go",
@@ -63,6 +64,7 @@ go_library(
6364
"//pkg/sql/sqltelemetry",
6465
"//pkg/sql/types",
6566
"//pkg/util",
67+
"//pkg/util/cache",
6668
"//pkg/util/ctxlog",
6769
"//pkg/util/duration",
6870
"//pkg/util/envutil",
@@ -81,6 +83,7 @@ go_library(
8183
"//pkg/util/ring",
8284
"//pkg/util/stop",
8385
"//pkg/util/syncutil",
86+
"//pkg/util/syncutil/singleflight",
8487
"//pkg/util/system",
8588
"//pkg/util/timeofday",
8689
"//pkg/util/timeutil",

pkg/sql/pgwire/auth.go

Lines changed: 5 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -90,14 +90,17 @@ type authOptions struct {
9090
// authentication and update c.sessionArgs with the authenticated user's name,
9191
// if different from the one given initially.
9292
func (c *conn) handleAuthentication(
93-
ctx context.Context, ac AuthConn, authOpt authOptions, execCfg *sql.ExecutorConfig,
93+
ctx context.Context, ac AuthConn, authOpt authOptions, server *Server,
9494
) (connClose func(), _ error) {
9595
if authOpt.testingSkipAuth {
9696
return nil, nil
9797
}
9898
if authOpt.testingAuthHook != nil {
9999
return nil, authOpt.testingAuthHook(ctx)
100100
}
101+
// Get execCfg from the server.
102+
execCfg := server.execCfg
103+
101104
// To book-keep the authentication start time.
102105
authStartTime := timeutil.Now()
103106

@@ -278,28 +281,11 @@ func (c *conn) handleAuthentication(
278281
duration := timeutil.Since(authStartTime).Nanoseconds()
279282
c.publishConnLatencyMetric(duration, hbaEntry.Method.String())
280283

281-
c.populateLastLoginTime(ctx, execCfg, dbUser)
284+
server.lastLoginUpdater.updateLastLoginTime(ctx, dbUser)
282285

283286
return connClose, nil
284287
}
285288

286-
// populateLastLoginTime updates the last login time for the sql user
287-
// asynchronously. This not guaranteed to succeed and we log any errors obtained
288-
// from the update transaction to the DEV channel.
289-
func (c *conn) populateLastLoginTime(
290-
ctx context.Context, execCfg *sql.ExecutorConfig, dbUser username.SQLUsername,
291-
) {
292-
// Update last login time in async. This is done asynchronously to avoid
293-
// blocking the connection.
294-
if err := execCfg.Stopper.RunAsyncTask(ctx, "write_last_login_time", func(ctx context.Context) {
295-
if err := sql.UpdateLastLoginTime(ctx, execCfg, dbUser.SQLIdentifier()); err != nil {
296-
log.Warningf(ctx, "failed to update last login time for user %s: %v", dbUser, err)
297-
}
298-
}); err != nil {
299-
log.Warningf(ctx, "failed to create async task to update last login time for user %s: %v", dbUser, err)
300-
}
301-
}
302-
303289
// publishConnLatencyMetric publishes the latency of the connection
304290
// based on the authentication method.
305291
func (c *conn) publishConnLatencyMetric(duration int64, authMethod string) {

pkg/sql/pgwire/conn.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ func (c *conn) processCommands(
151151
ctx context.Context,
152152
authOpt authOptions,
153153
ac AuthConn,
154-
sqlServer *sql.Server,
154+
server *Server,
155155
reserved *mon.BoundAccount,
156156
onDefaultIntSizeChange func(newSize int32),
157157
sessionID clusterunique.ID,
@@ -175,7 +175,7 @@ func (c *conn) processCommands(
175175
// network read on the connection's goroutine.
176176
c.cancelConn()
177177

178-
pgwireKnobs := sqlServer.GetExecutorConfig().PGWireTestingKnobs
178+
pgwireKnobs := server.SQLServer.GetExecutorConfig().PGWireTestingKnobs
179179
if pgwireKnobs != nil && pgwireKnobs.CatchPanics {
180180
if r := recover(); r != nil {
181181
// Catch the panic and return it to the client as an error.
@@ -203,14 +203,14 @@ func (c *conn) processCommands(
203203

204204
// Authenticate the connection.
205205
if connCloseAuthHandler, retErr = c.handleAuthentication(
206-
ctx, ac, authOpt, sqlServer.GetExecutorConfig(),
206+
ctx, ac, authOpt, server,
207207
); retErr != nil {
208208
// Auth failed or some other error.
209209
return
210210
}
211211

212212
var decrementConnectionCount func()
213-
if decrementConnectionCount, retErr = sqlServer.IncrementConnectionCount(c.sessionArgs); retErr != nil {
213+
if decrementConnectionCount, retErr = server.SQLServer.IncrementConnectionCount(c.sessionArgs); retErr != nil {
214214
// This will return pgcode.TooManyConnections which is used by the sql proxy
215215
// to skip failed auth throttle (as in this case the auth was fine but the
216216
// error occurred before sending back auth ok msg)
@@ -224,7 +224,7 @@ func (c *conn) processCommands(
224224
}
225225

226226
// Inform the client of the default session settings.
227-
connHandler, retErr = c.sendInitialConnData(ctx, sqlServer, onDefaultIntSizeChange, sessionID)
227+
connHandler, retErr = c.sendInitialConnData(ctx, server.SQLServer, onDefaultIntSizeChange, sessionID)
228228
if retErr != nil {
229229
return
230230
}
@@ -250,7 +250,7 @@ func (c *conn) processCommands(
250250

251251
// Now actually process commands.
252252
reservedOwned = false // We're about to pass ownership away.
253-
retErr = sqlServer.ServeConn(
253+
retErr = server.SQLServer.ServeConn(
254254
ctx,
255255
connHandler,
256256
reserved,

0 commit comments

Comments
 (0)