Skip to content

Commit fcc0b86

Browse files
committed
PG-1604: Improve last key LSN calculation logic
Previosly we simply set the LSN for the new key to the first write location. This is however not correct, as there are many corner cases around this: * recovery / replication might write old LSNs * we can't handle multiple keys with the same TLI/LSN, which can happen with quick restarts without writes To support this in this commit we modify the following: * We only activate new keys outside crash recovery, or immediately if encryption is turned off * We also take the already existing last key into account (if exists), and only activate a new key if we progressed past its start location The remaining changes are just support infrastructure for this: * Since we might rewrite old records, we use the already existing keys for those writes, not the active last keys * We prefetch existing keys during initialization, so it doesn't accidentally happen in the critical section during a write There is a remaining bug with stopping wal encryption, also mentioned in a TODO message in the code. This will be addressed in a later PR as this fix already took too long.
1 parent 937ac6f commit fcc0b86

File tree

6 files changed

+1395
-15
lines changed

6 files changed

+1395
-15
lines changed

contrib/pg_tde/meson.build

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,8 @@ tap_tests = [
128128
't/wal_archiving.pl',
129129
't/wal_encrypt.pl',
130130
't/wal_key_tli.pl',
131+
't/2pc_replication.pl',
132+
't/stream_rep.pl',
131133
]
132134

133135
tests += {

contrib/pg_tde/src/access/pg_tde_xlog_smgr.c

Lines changed: 91 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,7 @@ void
226226
TDEXLogSmgrInitWrite(bool encrypt_xlog)
227227
{
228228
WalEncryptionKey *key = pg_tde_read_last_wal_key();
229+
WALKeyCacheRec *keys;
229230

230231
/*
231232
* Always generate a new key on starting PostgreSQL to protect against
@@ -246,6 +247,16 @@ TDEXLogSmgrInitWrite(bool encrypt_xlog)
246247
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
247248
}
248249

250+
keys = pg_tde_get_wal_cache_keys();
251+
252+
if (keys == NULL)
253+
{
254+
WalLocation start = {.tli = 1,.lsn = 0};
255+
256+
/* cache is empty, prefetch keys from disk */
257+
pg_tde_fetch_wal_keys(start);
258+
}
259+
249260
if (key)
250261
pfree(key);
251262
}
@@ -263,6 +274,32 @@ TDEXLogSmgrInitWriteReuseKey()
263274
}
264275
}
265276

277+
/*
278+
* Encrypt XLog page(s) from the buf and write to the segment file.
279+
*/
280+
static ssize_t
281+
TDEXLogWriteEncryptedPagesOldKeys(int fd, const void *buf, size_t count, off_t offset,
282+
TimeLineID tli, XLogSegNo segno, int segSize)
283+
{
284+
char *enc_buff = EncryptionBuf;
285+
286+
#ifndef FRONTEND
287+
Assert(count <= TDEXLogEncryptBuffSize());
288+
#endif
289+
290+
/* Copy the data as-is, as we might have unencrypted parts */
291+
memcpy(enc_buff, buf, count);
292+
293+
/*
294+
* This method potentially allocates, but only in very early execution
295+
* Shouldn't happen in a write, where we are in a critical section
296+
*/
297+
TDEXLogCryptBuffer(buf, enc_buff, count, offset, tli, segno, segSize);
298+
299+
return pg_pwrite(fd, enc_buff, count, offset);
300+
}
301+
302+
266303
/*
267304
* Encrypt XLog page(s) from the buf and write to the segment file.
268305
*/
@@ -284,6 +321,7 @@ TDEXLogWriteEncryptedPages(int fd, const void *buf, size_t count, off_t offset,
284321
#endif
285322

286323
CalcXLogPageIVPrefix(tli, segno, key->base_iv, iv_prefix);
324+
287325
pg_tde_stream_crypt(iv_prefix,
288326
offset,
289327
(char *) buf,
@@ -299,26 +337,64 @@ static ssize_t
299337
tdeheap_xlog_seg_write(int fd, const void *buf, size_t count, off_t offset,
300338
TimeLineID tli, XLogSegNo segno, int segSize)
301339
{
340+
bool lastKeyUsable;
341+
bool afterWriteKey;
342+
#ifdef FRONTEND
343+
bool crashRecovery = false;
344+
#else
345+
bool crashRecovery = GetRecoveryState() == RECOVERY_STATE_CRASH;
346+
#endif
347+
348+
WalLocation loc = {.tli = tli};
349+
WalLocation writeKeyLoc;
350+
351+
XLogSegNoOffsetToRecPtr(segno, offset, segSize, loc.lsn);
352+
302353
/*
303354
* Set the last (most recent) key's start LSN if not set.
304355
*
305356
* This func called with WALWriteLock held, so no need in any extra sync.
306357
*/
307-
if (EncryptionKey.type != WAL_KEY_TYPE_INVALID && TDEXLogGetEncKeyLsn() == 0)
308-
{
309-
WalLocation loc = {.tli = tli};
310358

311-
XLogSegNoOffsetToRecPtr(segno, offset, segSize, loc.lsn);
359+
writeKeyLoc.lsn = TDEXLogGetEncKeyLsn();
360+
pg_read_barrier();
361+
writeKeyLoc.tli = TDEXLogGetEncKeyTli();
312362

313-
pg_tde_wal_last_key_set_location(loc);
314-
EncryptionKey.wal_start = loc;
315-
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
363+
lastKeyUsable = (writeKeyLoc.lsn != 0);
364+
afterWriteKey = false; //wal_location_cmp(writeKeyLoc, loc) <= 0;
365+
366+
if (EncryptionKey.type != WAL_KEY_TYPE_INVALID && !lastKeyUsable)
367+
{
368+
WALKeyCacheRec *last_key = pg_tde_get_last_wal_key();
369+
370+
if (!crashRecovery || EncryptionKey.type == WAL_KEY_TYPE_UNENCRYPTED)
371+
{
372+
/*
373+
* TODO: the unencrypted case is still not perfect, we need to
374+
* report an error in some cornercases
375+
*/
376+
if (last_key == NULL || last_key->start.lsn < loc.lsn)
377+
{
378+
pg_tde_wal_last_key_set_location(loc);
379+
EncryptionKey.wal_start = loc;
380+
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
381+
lastKeyUsable = true;
382+
}
383+
}
316384
}
317385

318-
if (EncryptionKey.type == WAL_KEY_TYPE_ENCRYPTED)
386+
if ((!afterWriteKey || !lastKeyUsable) && EncryptionKey.type == WAL_KEY_TYPE_ENCRYPTED)
387+
{
388+
return TDEXLogWriteEncryptedPagesOldKeys(fd, buf, count, offset, tli, segno, segSize);
389+
}
390+
else if (EncryptionKey.type == WAL_KEY_TYPE_ENCRYPTED)
391+
{
319392
return TDEXLogWriteEncryptedPages(fd, buf, count, offset, tli, segno);
393+
}
320394
else
395+
{
321396
return pg_pwrite(fd, buf, count, offset);
397+
}
322398
}
323399

324400
/*
@@ -340,7 +416,7 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
340416
if (readsz <= 0)
341417
return readsz;
342418

343-
TDEXLogCryptBuffer(buf, count, offset, tli, segno, segSize);
419+
TDEXLogCryptBuffer(buf, buf, count, offset, tli, segno, segSize);
344420

345421
return readsz;
346422
}
@@ -349,15 +425,15 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
349425
* [De]Crypt buffer if needed based on provided segment offset, number and TLI
350426
*/
351427
void
352-
TDEXLogCryptBuffer(void *buf, size_t count, off_t offset,
428+
TDEXLogCryptBuffer(const void *buf, void *out_buf, size_t count, off_t offset,
353429
TimeLineID tli, XLogSegNo segno, int segSize)
354430
{
355431
WALKeyCacheRec *keys = pg_tde_get_wal_cache_keys();
356432
XLogRecPtr write_key_lsn;
357433
WalLocation data_end = {.tli = tli};
358434
WalLocation data_start = {.tli = tli};
359435

360-
if (!keys)
436+
if (keys == NULL)
361437
{
362438
WalLocation start = {.tli = 1,.lsn = 0};
363439

@@ -454,6 +530,7 @@ TDEXLogCryptBuffer(void *buf, size_t count, off_t offset,
454530
XLogSegmentOffset(end_lsn, segSize);
455531
size_t dec_sz;
456532
char *dec_buf = (char *) buf + (dec_off - offset);
533+
char *o_buf = (char *) out_buf + (dec_off - offset);
457534

458535
Assert(dec_off >= offset);
459536

@@ -468,17 +545,19 @@ TDEXLogCryptBuffer(void *buf, size_t count, off_t offset,
468545
dec_end = offset + count;
469546
}
470547

548+
Assert(dec_end > dec_off);
471549
dec_sz = dec_end - dec_off;
472550

473551
#ifdef TDE_XLOG_DEBUG
474552
elog(DEBUG1, "decrypt WAL, dec_off: %lu [buff_off %lu], sz: %lu | key %u_%X/%X",
475553
dec_off, dec_off - offset, dec_sz, curr_key->key.wal_start.tli, LSN_FORMAT_ARGS(curr_key->key.wal_start.lsn));
476554
#endif
555+
477556
pg_tde_stream_crypt(iv_prefix,
478557
dec_off,
479558
dec_buf,
480559
dec_sz,
481-
dec_buf,
560+
o_buf,
482561
curr_key->key.key,
483562
&curr_key->crypt_ctx);
484563
}

contrib/pg_tde/src/include/access/pg_tde_xlog_smgr.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ extern void TDEXLogSmgrInit(void);
1313
extern void TDEXLogSmgrInitWrite(bool encrypt_xlog);
1414
extern void TDEXLogSmgrInitWriteReuseKey(void);
1515

16-
extern void TDEXLogCryptBuffer(void *buf, size_t count, off_t offset,
16+
extern void TDEXLogCryptBuffer(const void *buf, void *out_buf, size_t count, off_t offset,
1717
TimeLineID tli, XLogSegNo segno, int segSize);
1818

1919
#endif /* PG_TDE_XLOGSMGR_H */

0 commit comments

Comments
 (0)