Skip to content

Commit 295c016

Browse files
committed
PG-1604: Improve last key LSN calculation logic
Previously we simply set the LSN for the new key to the first write location.

This is, however, not correct, as there are many corner cases around this:
* recovery / replication might write old LSNs
* we can't handle multiple keys with the same TLI/LSN, which can happen with quick restarts without writes

To support this, in this commit we modify the following:
* We only activate new keys outside crash recovery, or immediately if encryption is turned off
* We also take the already existing last key into account (if it exists), and only activate a new key if we have progressed past its start location

The remaining changes are just support infrastructure for this:
* Since we might rewrite old records, we use the already existing keys for those writes, not the active last key
* We prefetch existing keys during initialization, so that the fetch doesn't accidentally happen in the critical section during a write

There is a remaining bug with stopping WAL encryption, also mentioned in a TODO message in the code. This will be addressed in a later PR, as this fix already took too long.
1 parent a2d1638 commit 295c016

File tree

6 files changed

+1366
-19
lines changed

6 files changed

+1366
-19
lines changed

contrib/pg_tde/meson.build

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,8 @@ tap_tests = [
127127
't/wal_archiving.pl',
128128
't/wal_encrypt.pl',
129129
't/wal_key_tli.pl',
130+
't/059_tde_2pc_replication.pl',
131+
't/stream_rep.pl',
130132
]
131133

132134
tests += {

contrib/pg_tde/src/access/pg_tde_xlog_smgr.c

Lines changed: 81 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,8 @@ void
228228
TDEXLogSmgrInitWrite(bool encrypt_xlog)
229229
{
230230
WalEncryptionKey *key = pg_tde_read_last_wal_key();
231+
WalLocation start = {.tli = 1,.lsn = 0};
232+
WALKeyCacheRec *keys;
231233

232234
/*
233235
* Always generate a new key on starting PostgreSQL to protect against
@@ -248,6 +250,14 @@ TDEXLogSmgrInitWrite(bool encrypt_xlog)
248250
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
249251
}
250252

253+
keys = pg_tde_get_wal_cache_keys();
254+
255+
if(keys == NULL)
256+
{
257+
/* TODO cache is empty, try to preread keys from disk */
258+
keys = pg_tde_fetch_wal_keys(start);
259+
}
260+
251261
if (key)
252262
pfree(key);
253263
}
@@ -265,6 +275,27 @@ TDEXLogSmgrInitWriteReuseKey()
265275
}
266276
}
267277

278+
/*
279+
* Encrypt XLog page(s) from the buf via TDEXLogCryptBuffer (i.e. with the already existing WAL keys) and write to the segment file.
280+
*/
281+
static ssize_t
282+
TDEXLogWriteEncryptedPagesOldKeys(int fd, const void *buf, size_t count, off_t offset,
283+
TimeLineID tli, XLogSegNo segno, int segSize)
284+
{
285+
char *enc_buff = EncryptionBuf;
286+
287+
#ifndef FRONTEND
288+
Assert(count <= TDEXLogEncryptBuffSize());
289+
#endif
290+
291+
/* TDEXLogCryptBuffer may allocate, but only very early in execution, while the key cache is still empty.
292+
That shouldn't happen during a write, where we are in a critical section. */
293+
TDEXLogCryptBuffer(buf, enc_buff, count, offset, tli, segno, segSize);
294+
295+
return pg_pwrite(fd, enc_buff, count, offset);
296+
}
297+
298+
268299
/*
269300
* Encrypt XLog page(s) from the buf and write to the segment file.
270301
*/
@@ -286,6 +317,7 @@ TDEXLogWriteEncryptedPages(int fd, const void *buf, size_t count, off_t offset,
286317
#endif
287318

288319
CalcXLogPageIVPrefix(tli, segno, key->base_iv, iv_prefix);
320+
289321
pg_tde_stream_crypt(iv_prefix,
290322
offset,
291323
(char *) buf,
@@ -301,26 +333,49 @@ static ssize_t
301333
tdeheap_xlog_seg_write(int fd, const void *buf, size_t count, off_t offset,
302334
TimeLineID tli, XLogSegNo segno, int segSize)
303335
{
336+
bool lastKeyUsable;
337+
bool afterLastKey;
338+
#ifdef FRONTEND
339+
bool crashRecovery = false;
340+
#else
341+
bool crashRecovery = GetRecoveryState() == RECOVERY_STATE_CRASH;
342+
#endif
343+
344+
WalLocation loc = {.tli = tli};
345+
346+
XLogSegNoOffsetToRecPtr(segno, offset, segSize, loc.lsn);
347+
304348
/*
305349
* Set the last (most recent) key's start LSN if not set.
306350
*
307351
* This function is called with WALWriteLock held, so no extra synchronization is needed.
308352
*/
309-
if (EncryptionKey.type != WAL_KEY_TYPE_INVALID && TDEXLogGetEncKeyLsn() == 0)
310-
{
311-
WalLocation loc = {.tli = tli};
312353

313-
XLogSegNoOffsetToRecPtr(segno, offset, segSize, loc.lsn);
354+
lastKeyUsable = (TDEXLogGetEncKeyLsn() != 0);
355+
afterLastKey = (TDEXLogGetEncKeyLsn() <= loc.lsn);
314356

315-
pg_tde_wal_last_key_set_location(loc);
316-
EncryptionKey.wal_start = loc;
317-
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
357+
if (EncryptionKey.type != WAL_KEY_TYPE_INVALID && !lastKeyUsable)
358+
{
359+
WALKeyCacheRec *last_key = pg_tde_get_last_wal_key();
360+
if(!crashRecovery || EncryptionKey.type == WAL_KEY_TYPE_UNENCRYPTED) {
361+
/* TODO: the unencrypted case is still not perfect; we need to report an error in some corner cases */
362+
if (last_key == NULL || last_key->start.lsn < loc.lsn) {
363+
pg_tde_wal_last_key_set_location(loc);
364+
EncryptionKey.wal_start = loc;
365+
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
366+
lastKeyUsable = true;
367+
}
368+
}
318369
}
319370

320-
if (EncryptionKey.type == WAL_KEY_TYPE_ENCRYPTED)
321-
return TDEXLogWriteEncryptedPages(fd, buf, count, offset, tli, segno);
322-
else
371+
if((!afterLastKey || !lastKeyUsable) && EncryptionKey.type == WAL_KEY_TYPE_ENCRYPTED) {
372+
return TDEXLogWriteEncryptedPagesOldKeys(fd, buf, count, offset, tli, segno, segSize);
373+
} else if (EncryptionKey.type == WAL_KEY_TYPE_ENCRYPTED) {
374+
return TDEXLogWriteEncryptedPages(fd, buf, count, offset, tli, segno);
375+
}
376+
else {
323377
return pg_pwrite(fd, buf, count, offset);
378+
}
324379
}
325380

326381
/*
@@ -342,7 +397,7 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
342397
if (readsz <= 0)
343398
return readsz;
344399

345-
TDEXLogCryptBuffer(buf, count, offset, tli, segno, segSize);
400+
TDEXLogCryptBuffer(buf, buf, count, offset, tli, segno, segSize);
346401

347402
return readsz;
348403
}
@@ -351,20 +406,22 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
351406
* [De]Crypt buffer if needed based on provided segment offset, number and TLI
352407
*/
353408
void
354-
TDEXLogCryptBuffer(void *buf, size_t count, off_t offset,
409+
TDEXLogCryptBuffer(const void *buf, void *out_buf, size_t count, off_t offset,
355410
TimeLineID tli, XLogSegNo segno, int segSize)
356411
{
357412
WALKeyCacheRec *keys = pg_tde_get_wal_cache_keys();
358413
XLogRecPtr write_key_lsn;
359414
WalLocation data_end = {.tli = tli};
360415
WalLocation data_start = {.tli = tli};
361416

362-
if (!keys)
417+
if (keys == NULL)
363418
{
364419
WalLocation start = {.tli = 1,.lsn = 0};
365420

366421
/* cache is empty, try to read keys from disk */
367-
keys = pg_tde_fetch_wal_keys(start);
422+
pg_tde_fetch_wal_keys(start);
423+
424+
keys = pg_tde_get_wal_cache_keys();
368425
}
369426

370427
/*
@@ -423,6 +480,7 @@ TDEXLogCryptBuffer(void *buf, size_t count, off_t offset,
423480
off_t dec_end = XLogSegmentOffset(minlsn, segSize);
424481
size_t dec_sz;
425482
char *dec_buf = (char *) buf + (dec_off - offset);
483+
char *o_buf = (char *) out_buf + (dec_off - offset);
426484

427485
Assert(dec_off >= offset);
428486

@@ -434,20 +492,26 @@ TDEXLogCryptBuffer(void *buf, size_t count, off_t offset,
434492
dec_end = offset + count;
435493
}
436494

437-
dec_sz = dec_end - dec_off;
495+
if(dec_end > dec_off) {
496+
dec_sz = dec_end - dec_off;
497+
} else {
498+
// assert?
499+
dec_sz = 0;
500+
}
438501

439502
#ifdef TDE_XLOG_DEBUG
440503
elog(DEBUG1, "decrypt WAL, dec_off: %lu [buff_off %lu], sz: %lu | key %u_%X/%X",
441504
dec_off, dec_off - offset, dec_sz, curr_key->key.wal_start.tli, LSN_FORMAT_ARGS(curr_key->key.wal_start.lsn));
442505
#endif
506+
443507
pg_tde_stream_crypt(iv_prefix,
444508
dec_off,
445509
dec_buf,
446510
dec_sz,
447-
dec_buf,
511+
o_buf,
448512
curr_key->key.key,
449513
&curr_key->crypt_ctx);
450-
}
514+
}
451515
}
452516
}
453517
}

contrib/pg_tde/src/include/access/pg_tde_xlog_smgr.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ extern void TDEXLogSmgrInit(void);
1313
extern void TDEXLogSmgrInitWrite(bool encrypt_xlog);
1414
extern void TDEXLogSmgrInitWriteReuseKey(void);
1515

16-
extern void TDEXLogCryptBuffer(void *buf, size_t count, off_t offset,
16+
extern void TDEXLogCryptBuffer(const void *buf, void *out_buf, size_t count, off_t offset,
1717
TimeLineID tli, XLogSegNo segno, int segSize);
1818

1919
#endif /* PG_TDE_XLOGSMGR_H */

0 commit comments

Comments
 (0)