Skip to content

Commit eac4827

Browse files
authored
Add auto-repair options for broken AOF tail on startup (redis#14058)
When Redis is shut down uncleanly (e.g., due to power loss), invalid bytes may remain at the end of the AOF file. Currently, Redis detects such corruption only after parsing most of the AOF, leading to delayed error detection and increased downtime. Manual recovery via `redis-check-aof --fix` is also time-consuming.

This fix introduces two new options to improve resilience and reduce downtime:

- `aof-load-broken`: Enables automatic detection and repair of broken AOF tails.
- `aof-load-broken-max-size`: Sets a maximum threshold (in bytes) for the corrupted tail size that Redis will attempt to fix automatically without requiring user intervention.
1 parent 674b829 commit eac4827

File tree

5 files changed

+200
-7
lines changed

5 files changed

+200
-7
lines changed

redis.conf

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1529,6 +1529,25 @@ auto-aof-rewrite-min-size 64mb
15291529
# will be found.
15301530
aof-load-truncated yes
15311531

1532+
# When the AOF file is corrupted in the middle (format errors), Redis can
1533+
# attempt to automatically recover by truncating the corrupted portion if
1534+
# it's smaller than the configured maximum size. This is more aggressive
1535+
# than aof-load-truncated which only handles truncation at the end of files.
1536+
#
1537+
# The aof-load-broken-max-size setting controls the maximum size in bytes
1538+
# of corrupted data that can be automatically truncated.
1539+
#
1540+
# If aof-load-broken is set to yes and the corrupted portion is smaller than
1541+
# aof-load-broken-max-size, Redis will truncate the corrupted data and start
1542+
# normally, logging a warning about the recovery. Otherwise, the server will
1543+
# exit with an error and require manual intervention using "redis-check-aof".
1544+
#
1545+
# This option is disabled by default since automatically truncating corrupted
1546+
# data can lead to data loss. Only enable this if you understand the risks
1547+
# and prefer availability over data integrity in corruption scenarios.
1548+
aof-load-broken no
1549+
aof-load-broken-max-size 4096
1550+
15321551
# Redis can create append-only base files in either RDB or AOF formats. Using
15331552
# the RDB format is always faster and more efficient, and disabling it is only
15341553
# supported for backward compatibility purposes.

src/aof.c

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1658,7 +1658,7 @@ int loadSingleAppendOnlyFile(char *filename) {
16581658
/* Clean up. Command code may have changed argv/argc so we use the
16591659
* argv/argc of the client instead of the local variables. */
16601660
freeClientArgv(fakeClient);
1661-
if (server.aof_load_truncated) valid_up_to = ftello(fp);
1661+
if (server.aof_load_truncated || server.aof_load_broken) valid_up_to = ftello(fp);
16621662
if (server.key_load_delay)
16631663
debugDelay(server.key_load_delay);
16641664
}
@@ -1719,8 +1719,41 @@ int loadSingleAppendOnlyFile(char *filename) {
17191719
goto cleanup;
17201720

17211721
fmterr: /* Format error. */
1722-
serverLog(LL_WARNING, "Bad file format reading the append only file %s: "
1723-
"make a backup of your AOF file, then use ./redis-check-aof --fix <filename.manifest>", filename);
1722+
/* fmterr may be caused by an accidental machine shutdown, so if the broken tail
1723+
* is less than a specified size, try to recover it automatically */
1724+
if (server.aof_load_broken) {
1725+
if (valid_up_to == -1) {
1726+
serverLog(LL_WARNING,"Last valid command offset is invalid");
1727+
} else if (sb.st_size - valid_up_to < server.aof_load_broken_max_size) {
1728+
if (truncate(aof_filepath,valid_up_to) == -1) {
1729+
serverLog(LL_WARNING,"Error truncating the AOF file: %s",
1730+
strerror(errno));
1731+
} else {
1732+
/* Make sure the AOF file descriptor points to the end of the
1733+
* file after the truncate call. */
1734+
if (server.aof_fd != -1 && lseek(server.aof_fd,0,SEEK_END) == -1) {
1735+
serverLog(LL_WARNING,"Can't seek the end of the AOF file: %s",
1736+
strerror(errno));
1737+
} else {
1738+
serverLog(LL_WARNING,
1739+
"AOF loaded anyway because aof-load-broken is enabled and "
1740+
"broken size '%lld' is less than aof-load-broken-max-size '%lld'",
1741+
(long long)(sb.st_size - valid_up_to), (long long)(server.aof_load_broken_max_size));
1742+
ret = AOF_BROKEN_RECOVERED;
1743+
goto loaded_ok;
1744+
}
1745+
}
1746+
} else { /* The size of the corrupted portion exceeds the configured limit. */
1747+
serverLog(LL_WARNING,
1748+
"AOF was not loaded because the size of the corrupted portion "
1749+
"exceeds the configured limit. aof-load-broken is enabled and broken size '%lld' "
1750+
"is bigger than aof-load-broken-max-size '%lld'",
1751+
(long long)(sb.st_size - valid_up_to), (long long)(server.aof_load_broken_max_size));
1752+
}
1753+
} else {
1754+
serverLog(LL_WARNING, "Bad file format reading the append only file %s: "
1755+
"make a backup of your AOF file, then use ./redis-check-aof --fix <filename.manifest>", filename);
1756+
}
17241757
ret = AOF_FAILED;
17251758
/* fall through to cleanup. */
17261759

@@ -1794,13 +1827,13 @@ int loadAppendOnlyFiles(aofManifest *am) {
17941827
last_file = ++aof_num == total_num;
17951828
start = ustime();
17961829
ret = loadSingleAppendOnlyFile(aof_name);
1797-
if (ret == AOF_OK || (ret == AOF_TRUNCATED && last_file)) {
1830+
if (ret == AOF_OK || ((ret == AOF_TRUNCATED || ret == AOF_BROKEN_RECOVERED) && last_file)) {
17981831
serverLog(LL_NOTICE, "DB loaded from base file %s: %.3f seconds",
17991832
aof_name, (float)(ustime()-start)/1000000);
18001833
}
18011834

18021835
/* If the truncated file is not the last file, we consider this to be a fatal error. */
1803-
if (ret == AOF_TRUNCATED && !last_file) {
1836+
if ((ret == AOF_TRUNCATED || ret == AOF_BROKEN_RECOVERED) && !last_file) {
18041837
ret = AOF_FAILED;
18051838
serverLog(LL_WARNING, "Fatal error: the truncated file is not the last file");
18061839
}
@@ -1824,7 +1857,7 @@ int loadAppendOnlyFiles(aofManifest *am) {
18241857
last_file = ++aof_num == total_num;
18251858
start = ustime();
18261859
ret = loadSingleAppendOnlyFile(aof_name);
1827-
if (ret == AOF_OK || (ret == AOF_TRUNCATED && last_file)) {
1860+
if (ret == AOF_OK || ((ret == AOF_TRUNCATED || ret == AOF_BROKEN_RECOVERED) && last_file)) {
18281861
serverLog(LL_NOTICE, "DB loaded from incr file %s: %.3f seconds",
18291862
aof_name, (float)(ustime()-start)/1000000);
18301863
}
@@ -1834,7 +1867,7 @@ int loadAppendOnlyFiles(aofManifest *am) {
18341867
if (ret == AOF_EMPTY) ret = AOF_OK;
18351868

18361869
/* If the truncated file is not the last file, we consider this to be a fatal error. */
1837-
if (ret == AOF_TRUNCATED && !last_file) {
1870+
if ((ret == AOF_TRUNCATED || ret == AOF_BROKEN_RECOVERED) && !last_file) {
18381871
ret = AOF_FAILED;
18391872
serverLog(LL_WARNING, "Fatal error: the truncated file is not the last file");
18401873
}

src/config.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3100,6 +3100,7 @@ standardConfig static_configs[] = {
31003100
createBoolConfig("cluster-require-full-coverage", NULL, MODIFIABLE_CONFIG, server.cluster_require_full_coverage, 1, NULL, NULL),
31013101
createBoolConfig("rdb-save-incremental-fsync", NULL, MODIFIABLE_CONFIG, server.rdb_save_incremental_fsync, 1, NULL, NULL),
31023102
createBoolConfig("aof-load-truncated", NULL, MODIFIABLE_CONFIG, server.aof_load_truncated, 1, NULL, NULL),
3103+
createBoolConfig("aof-load-broken", NULL, MODIFIABLE_CONFIG, server.aof_load_broken, 0, NULL, NULL),
31033104
createBoolConfig("aof-use-rdb-preamble", NULL, MODIFIABLE_CONFIG, server.aof_use_rdb_preamble, 1, NULL, NULL),
31043105
createBoolConfig("aof-timestamp-enabled", NULL, MODIFIABLE_CONFIG, server.aof_timestamp_enabled, 0, NULL, NULL),
31053106
createBoolConfig("cluster-replica-no-failover", "cluster-slave-no-failover", MODIFIABLE_CONFIG, server.cluster_slave_no_failover, 0, NULL, updateClusterFlags), /* Failover by default. */
@@ -3266,6 +3267,7 @@ standardConfig static_configs[] = {
32663267
createTimeTConfig("repl-backlog-ttl", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.repl_backlog_time_limit, 60*60, INTEGER_CONFIG, NULL, NULL), /* Default: 1 hour */
32673268
createOffTConfig("auto-aof-rewrite-min-size", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.aof_rewrite_min_size, 64*1024*1024, MEMORY_CONFIG, NULL, NULL),
32683269
createOffTConfig("loading-process-events-interval-bytes", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 1024, INT_MAX, server.loading_process_events_interval_bytes, 1024*512, INTEGER_CONFIG, NULL, NULL),
3270+
createOffTConfig("aof-load-broken-max-size", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.aof_load_broken_max_size, 4*1024, INTEGER_CONFIG, NULL, NULL),
32693271

32703272
createIntConfig("tls-port", NULL, MODIFIABLE_CONFIG, 0, 65535, server.tls_port, 0, INTEGER_CONFIG, NULL, applyTLSPort), /* TCP port. */
32713273
createIntConfig("tls-session-cache-size", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.tls_ctx_config.session_cache_size, 20*1024, INTEGER_CONFIG, NULL, applyTlsCfg),

src/server.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,7 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT];
346346
#define AOF_OPEN_ERR 3
347347
#define AOF_FAILED 4
348348
#define AOF_TRUNCATED 5
349+
#define AOF_BROKEN_RECOVERED 6
349350

350351
/* RDB return values for rdbLoad. */
351352
#define RDB_OK 0
@@ -2017,6 +2018,8 @@ struct redisServer {
20172018
int aof_last_write_status; /* C_OK or C_ERR */
20182019
int aof_last_write_errno; /* Valid if aof write/fsync status is ERR */
20192020
int aof_load_truncated; /* Don't stop on unexpected AOF EOF. */
2021+
int aof_load_broken; /* Don't stop on bad fmt. */
2022+
off_t aof_load_broken_max_size; /* The max size of broken AOF tail that can be ignored. */
20202023
int aof_use_rdb_preamble; /* Specify base AOF to use RDB encoding on AOF rewrites. */
20212024
redisAtomic int aof_bio_fsync_status; /* Status of AOF fsync in bio job. */
20222025
redisAtomic int aof_bio_fsync_errno; /* Errno of AOF fsync in bio job. */

tests/integration/aof.tcl

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -701,4 +701,140 @@ tags {"aof external:skip"} {
701701
assert_equal {1} [r get t]
702702
}
703703
}
704+
705+
# Check AOF load broken behavior
706+
# Corrupted base AOF, existing AOF files
707+
create_aof $aof_dirpath $aof_base_file {
708+
append_to_aof [formatCommand set param ok]
709+
append_to_aof "corruption"
710+
}
711+
create_aof $aof_dirpath $aof_file {
712+
append_to_aof [formatCommand set foo hello]
713+
}
714+
start_server_aof_ex [list dir $server_path aof-load-broken yes] [list wait_ready false] {
715+
test "Log should mention truncated file is not last" {
716+
wait_for_log_messages 0 {
717+
{*AOF loaded anyway because aof-load-broken is enabled*}
718+
{*Fatal error: the truncated file is not the last file*}
719+
} 0 10 1000
720+
}
721+
}
722+
723+
# Remove all incr AOF files to make the base file being the last file
724+
exec rm -f $aof_dirpath/appendonly.aof.*
725+
start_server_aof [list dir $server_path aof-load-broken yes] {
726+
test "Corrupted base AOF (last file): should recover" {
727+
assert_equal 1 [is_alive [srv pid]]
728+
}
729+
730+
test "param should be 'ok'" {
731+
set client [redis [srv host] [srv port] 0 $::tls]
732+
wait_done_loading $client
733+
assert {[$client get param] eq "ok"}
734+
}
735+
}
736+
737+
# Should also start with broken incr AOF.
738+
create_aof $aof_dirpath $aof_file {
739+
append_to_aof [formatCommand set foo 1]
740+
append_to_aof [formatCommand incr foo]
741+
append_to_aof [formatCommand incr foo]
742+
append_to_aof [formatCommand incr foo]
743+
append_to_aof [formatCommand incr foo]
744+
append_to_aof "corruption"
745+
}
746+
747+
start_server_aof [list dir $server_path aof-load-broken yes] {
748+
test "Short read: Server should start if aof-load-broken is yes" {
749+
assert_equal 1 [is_alive [srv pid]]
750+
}
751+
752+
# The AOF file is expected to be correct because default value for aof-load-broken-max-size is 4096,
753+
# so the AOF will reload without the corruption
754+
test "Broken AOF loaded: we expect foo to be equal to 5" {
755+
set client [redis [srv host] [srv port] 0 $::tls]
756+
wait_done_loading $client
757+
assert {[$client get foo] eq "5"}
758+
}
759+
760+
test "Append a new command after loading an incomplete AOF" {
761+
$client incr foo
762+
}
763+
}
764+
765+
start_server_aof [list dir $server_path aof-load-broken yes] {
766+
test "Short read + command: Server should start" {
767+
assert_equal 1 [is_alive [srv pid]]
768+
}
769+
770+
test "Broken AOF loaded: we expect foo to be equal to 6 now" {
771+
set client [redis [srv host] [srv port] 0 $::tls]
772+
wait_done_loading $client
773+
assert {[$client get foo] eq "6"}
774+
}
775+
}
776+
777+
# Test that the server exits when the AOF contains a format error
778+
create_aof $aof_dirpath $aof_file {
779+
append_to_aof [formatCommand set foo hello]
780+
append_to_aof [string range [formatCommand incr foo] 0 end-3]
781+
append_to_aof "corruption"
782+
}
783+
784+
# We set the maximum allowed corrupted size to 2 bytes, but the actual corrupted portion is larger,
785+
# so the AOF file will not be reloaded.
786+
start_server_aof_ex [list dir $server_path aof-load-broken yes aof-load-broken-max-size 2] [list wait_ready false] {
787+
test "Bad format: Server should have logged an error" {
788+
wait_for_log_messages 0 {"*AOF was not loaded because the size*"} 0 10 1000
789+
}
790+
}
791+
792+
create_aof_manifest $aof_dirpath $aof_manifest_file {
793+
append_to_manifest "file appendonly.aof.1.base.aof seq 1 type b\n"
794+
append_to_manifest "file appendonly.aof.1.incr.aof seq 1 type i\n"
795+
append_to_manifest "file appendonly.aof.2.incr.aof seq 2 type i\n"
796+
}
797+
# Create base AOF file
798+
set base_aof_file "$aof_dirpath/appendonly.aof.1.base.aof"
799+
create_aof $aof_dirpath $base_aof_file {
800+
append_to_aof [formatCommand set fo base]
801+
}
802+
803+
# Create middle incr AOF file with corruption
804+
set mid_aof_file "$aof_dirpath/appendonly.aof.1.incr.aof"
805+
create_aof $aof_dirpath $mid_aof_file {
806+
append_to_aof [formatCommand set fo mid]
807+
append_to_aof "CORRUPTION"
808+
}
809+
810+
# Create last incr AOF file (valid)
811+
set last_aof_file "$aof_dirpath/appendonly.aof.2.incr.aof"
812+
create_aof $aof_dirpath $last_aof_file {
813+
append_to_aof [formatCommand set fo last]
814+
}
815+
816+
# Check that Redis fails to load because corruption is in the middle file
817+
start_server_aof_ex [list dir $server_path aof-load-broken yes] [list wait_ready false] {
818+
test "Intermediate AOF is broken: should log fatal and not start" {
819+
wait_for_log_messages 0 {
820+
{*Fatal error: the truncated file is not the last file*}
821+
} 0 10 1000
822+
}
823+
}
824+
825+
# Swap mid and last files
826+
set tmp_file "$aof_dirpath/temp.aof"
827+
file rename -force $mid_aof_file $tmp_file
828+
file rename -force $last_aof_file $mid_aof_file
829+
file rename -force $tmp_file $last_aof_file
830+
831+
# Should now start successfully since corruption is in last AOF file
832+
start_server_aof [list dir $server_path aof-load-broken yes] {
833+
test "Corrupted last AOF file: Server should still start and recover" {
834+
assert_equal 1 [is_alive [srv pid]]
835+
set client [redis [srv host] [srv port] 0 $::tls]
836+
wait_done_loading $client
837+
assert {[$client get fo] eq "mid"}
838+
}
839+
}
704840
}

0 commit comments

Comments
 (0)