Skip to content

Commit 5b490f2

Browse files
committed
sqllogfill lower-gen testcase
Signed-off-by: mhannum <mhannum@bloomberg.net>
1 parent 5590f2c commit 5b490f2

File tree

4 files changed

+289
-1
lines changed

4 files changed

+289
-1
lines changed
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Pull in the shared test-case rules: from the tests root when TESTSROOTDIR
# is set, otherwise from the parent directory.
ifneq ($(TESTSROOTDIR),)
  include $(TESTSROOTDIR)/testcase.mk
else
  include ../testcase.mk
endif

# Exported as 0 for this test.
# NOTE(review): presumably disables the harness's end-of-test database check;
# confirm against testcase.mk.
export CHECK_DB_AT_FINISH=0

# Fall back to a 10 minute timeout when none (or an empty one) was provided.
ifeq ($(TEST_TIMEOUT),)
  export TEST_TIMEOUT=10m
endif
10+
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Disable features that prevent generation from going backwards
2+
berkattr elect_highest_committed_gen 0
3+
retrieve_gen_from_ckp 0
4+
match_on_ckp 0
5+
emit_gen_commits 0
6+
endianize_locklist 0
7+
sql_logfill 1
8+
sql_logfill_debug 1
9+
recovery_ckp 0
10+
11+
logmsg level debug
Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
#!/usr/bin/env bash

# Refuse to run if this script does not parse. This must be `||`: with a
# single `|` the `exit 1` runs in a pipeline subshell and never stops the
# script.
bash -n "$0" || exit 1

# Shared helpers used below (sourced from the tests tree).
. "${TESTSROOTDIR}/tools/runit_common.sh"
. "${TESTSROOTDIR}/tools/cluster_utils.sh"

# Always trace commands; debug is exported so sourced helpers see it too.
export debug=1
[[ "$debug" == "1" ]] && set -x

# First argument: name of the database under test.
db=$1
# Number of whitespace-separated node names in CLUSTER.
cnt=$(echo $CLUSTER | wc -w)

# The scenario needs at least three nodes; anything less is reported and
# the script exits with status 0.
if [[ -z "$CLUSTER" || $cnt -lt 3 ]]; then
    echo "This test requires a clustered installation of at least 3 nodes"
    exit 0
fi
18+
19+
# Print the log-file identifier reported on the master's 'bdb cluster' line.
# Globals:   CDB2SQL_EXE, CDB2_OPTIONS, db (read)
# Outputs:   for every output line containing "MASTER", the part of its 7th
#            whitespace-separated field that precedes the first ':'.
#            Query errors are discarded, so a failed query prints nothing.
get_log_file() {
    local master_node=$(get_master)
    $CDB2SQL_EXE ${CDB2_OPTIONS} --tabs $db --host $master_node \
        "exec procedure sys.cmd.send('bdb cluster')" 2>/dev/null \
        | awk '/MASTER/ { split($7, parts, ":"); print parts[1] }'
}
23+
24+
# Insert rows into t1 in batches until the master's log file changes.
# Globals:   CDB2SQL_EXE, CDB2_OPTIONS, db (read)
# Outputs:   progress messages on stdout.
# Notes:     - Only batches whose insert succeeded are counted, so the
#              reported total reflects rows actually written.
#            - An empty get_log_file result (query failed, e.g. no master
#              reachable at that instant) is treated as "unknown", not as
#              "log advanced"; the loop keeps going until a real, different
#              value is seen.
do_writes() {
    # Wait until a starting log file can be read at all.
    local start_logfile
    start_logfile=$(get_log_file)

    while [[ -z "$start_logfile" ]]; do
        echo "Waiting for log file to be available..."
        sleep 1
        start_logfile=$(get_log_file)
    done

    echo "Inserting records until log advances (starting at logfile $start_logfile)"

    local batch_size=1000
    local total_inserted=0
    local current_logfile=$start_logfile

    while [[ -z "$current_logfile" || "$current_logfile" == "$start_logfile" ]]; do
        # Insert failures are tolerated (and retried on the next pass), but
        # they must not inflate the row count.
        if $CDB2SQL_EXE ${CDB2_OPTIONS} $db default "insert into t1 select value, value from generate_series(1, $batch_size)" >/dev/null 2>&1; then
            total_inserted=$((total_inserted + batch_size))
        fi
        current_logfile=$(get_log_file)
    done

    echo " Log file advanced from $start_logfile to $current_logfile after $total_inserted inserts"
}
49+
50+
# Send the 'flush' command to one node.
# Arguments: $1 - node to address (passed to cdb2sql as @<node>)
# Globals:   CDB2SQL_EXE, CDB2_OPTIONS, db (read)
# Outputs:   one announcement line; all output of the command is discarded.
force_checkpoint() {
    local target=$1
    printf '%s\n' "Forcing checkpoint on $target"
    $CDB2SQL_EXE ${CDB2_OPTIONS} $db @$target \
        "exec procedure sys.cmd.send('flush')" &>/dev/null
}
55+
56+
# Scan each node's log for the text "ignoring generation check".
# Globals:   CLUSTER, TESTDIR, db (read); node (written by the loop)
# Outputs:   one line per node whose log contains the text, then a verdict.
# On no match in any log: calls stop_all_nodes, then failexit.
check_ignore_message() {
    echo "Checking logs for 'ignoring generation check' message"
    local seen=0
    for node in $CLUSTER; do
        local node_log="${TESTDIR}/logs/${db}.${node}.db"
        # Nodes without a log file are simply skipped.
        [[ -f "$node_log" ]] || continue
        grep -q "ignoring generation check" "$node_log" 2>/dev/null || continue
        echo " Found in $node log"
        seen=1
    done

    if [[ $seen -ne 0 ]]; then
        echo " SUCCESS: Found generation check ignore message"
        return
    fi

    stop_all_nodes
    failexit "Did not find expected 'ignoring generation check' message in any node logs"
}
74+
75+
# Drive the whole scenario:
#   Phase 1 - ten rounds of: downgrade the master, write until the log file
#             changes, flush the new master; then record the generation.
#   Phase 2 - stop the first node, write more, 'pushnext', stop the other
#             nodes, confirm every node refuses queries, restart everything
#             and require the generation to be lower than the recorded one.
#   Phase 3 - after a fixed wait, require the row count of t1 to be unchanged
#             and the "ignoring generation check" text to be in a node log.
# Globals:   CDB2SQL_EXE, CDB2_OPTIONS, CLUSTER, TMPDIR, DBNAME, db (read).
#            Working variables (master, high_gen, count_before, ...) are plain
#            globals, not locals.
runit() {
    echo "=== Starting SQL logfill generation wraparound test ==="

    # Who is master right now?
    master=$(get_master)
    echo "Initial master: $master"

    # Recreate t1; the drop may fail quietly, the create may not.
    echo "Creating test table"
    $CDB2SQL_EXE ${CDB2_OPTIONS} $db --host $master "drop table if exists t1" >/dev/null 2>&1
    if ! $CDB2SQL_EXE ${CDB2_OPTIONS} $db --host $master "create table t1 (id int, val int)" >/dev/null 2>&1; then
        failexit "Failed to create table"
    fi

    # Generation before any forced election (reported only).
    initial_gen=$(get_node_gen $master)
    echo "Initial generation: $initial_gen"

    # ---- Phase 1: drive the generation up via repeated elections ----
    echo "=== Phase 1: Incrementing generation through elections ==="
    for i in {1..10}; do
        # State before this round's election.
        master=$(get_master)
        current_gen=$(get_node_gen $master)
        echo " Iteration $i: gen=$current_gen, master=$master"

        # Downgrading the master triggers an election.
        echo " Forcing election via downgrade"
        downgrade_master

        # State after the election.
        master=$(get_master)
        new_gen=$(get_node_gen $master)
        echo " After election: gen=$new_gen, new_master=$master"

        # Write until the log file changes, then flush on the master.
        do_writes
        force_checkpoint $master
    done

    # Generation reached after all elections.
    high_gen=$(get_node_gen $master)
    echo "Highest generation reached: $high_gen"

    # ---- Phase 2: take the cluster down and bring it back ----
    echo "=== Phase 2: Stopping and restarting cluster ==="

    # The first name listed in CLUSTER goes down first.
    first_node=$(echo $CLUSTER | awk '{print $1}')
    echo "Stopping first node: $first_node"
    kill_by_pidfile ${TMPDIR}/${DBNAME}.${first_node}.pid

    # That node may have been the master; poll until some master is reported.
    master=$(get_master)
    until [[ -n "$master" ]]; do
        echo "Waiting for new master after stopping $first_node..."
        sleep 1
        master=$(get_master)
    done

    # Writes made now are missing from the stopped node's log.
    echo "Inserting records to create gap in $first_node log"
    do_writes

    count_before=$($CDB2SQL_EXE -tabs ${CDB2_OPTIONS} $db default "select count(*) from t1" 2>/dev/null)

    if [[ -z "$count_before" || "$count_before" -eq 0 ]]; then
        failexit "No records found before restart"
    fi

    echo "Record count before restart: $count_before"

    echo "Pushnext to advance the log"
    ${CDB2SQL_EXE} ${CDB2_OPTIONS} $DBNAME --host $master "exec procedure sys.cmd.send('pushnext')"

    # Fixed pause so the nodes can write a checkpoint.
    sleep 5

    echo "Stopping remaining cluster nodes"
    for node in $CLUSTER; do
        [[ "$node" == "$first_node" ]] && continue
        kill_by_pidfile ${TMPDIR}/${DBNAME}.${node}.pid
    done

    echo "Verifying that all nodes are down"

    # A node that still answers "select 1" was not stopped.
    for node in $CLUSTER; do
        if $CDB2SQL_EXE ${CDB2_OPTIONS} $db --host $node "select 1" >/dev/null 2>&1; then
            stop_all_nodes
            failexit "Node $node is still up after kill"
        else
            echo "Node $node is down as expected"
        fi
    done

    echo "Starting entire cluster"
    start_all_nodes
    sleep 10

    echo "Waiting for cluster coherency"
    wait_for_cluster

    # Generation as seen on the post-restart master.
    master=$(get_master)
    restart_gen=$(get_node_gen $master)
    echo "Generation after restart: $restart_gen (master: $master)"

    # The generation must be readable ...
    if [[ -z "$restart_gen" ]]; then
        stop_all_nodes
        failexit "Could not get generation after restart"
    fi

    # ... and strictly below the pre-restart value.
    if [[ $restart_gen -ge $high_gen ]]; then
        stop_all_nodes
        failexit "Generation did not reset after restart (restart_gen=$restart_gen >= high_gen=$high_gen)"
    else
        echo "SUCCESS: Generation reset confirmed (restart_gen=$restart_gen < high_gen=$high_gen)"
    fi

    # ---- Phase 3: sql_logfill must close the gap ----
    echo "=== Phase 3: Verifying sql_logfill with generation mismatch ==="

    # Fixed pause to let sql_logfill catch up.
    echo "Waiting for sql_logfill to process gap records..."
    sleep 30

    # Row count must match the pre-restart count.
    count_after=$($CDB2SQL_EXE -tabs ${CDB2_OPTIONS} $db default "select count(*) from t1" 2>/dev/null)
    echo "Record count after restart: $count_after"

    if [[ -z "$count_after" ]]; then
        stop_all_nodes
        failexit "Could not get record count after restart"
    fi

    if [[ "$count_before" != "$count_after" ]]; then
        stop_all_nodes
        failexit "Record count mismatch (before=$count_before, after=$count_after)"
    fi

    echo "SUCCESS: Data integrity verified (count=$count_after)"

    # The node logs must contain the expected message.
    check_ignore_message

    echo "=== Test completed successfully ==="
    echo "Summary:"
    echo " High generation during run: $high_gen"
    echo " Generation after restart: $restart_gen"
    echo " Records verified: $count_after"
}
230+
231+
# Run the scenario, shut every node down, then print the final marker line.
runit
stop_all_nodes
echo "Success"

tests/tools/cluster_utils.sh

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ function get_master
1111
{
1212
[[ "$debug" == 1 ]] && set -x
1313
typeset func="get_master"
14-
x=$($CDB2SQL_EXE $CDB2_OPTIONS --tabs $DBNAME default 'exec procedure sys.cmd.send("bdb cluster")' | grep MASTER | cut -f1 -d":" | tr -d '[:space:]')
14+
x=$($CDB2SQL_EXE $CDB2_OPTIONS --tabs $DBNAME default 'exec procedure sys.cmd.send("bdb cluster")' 2>/dev/null | grep MASTER | cut -f1 -d":" | tr -d '[:space:]')
1515
echo "$x"
1616
}
1717

@@ -392,3 +392,37 @@ downgrade_master() {
392392
# Double check that the cluster is up
393393
wait_for_cluster
394394
}
395+
396+
# Stop every node of the test database via its pidfile.
# Globals:   CLUSTER, TMPDIR, DBNAME (read)
# Without CLUSTER, kills the single instance through ${TMPDIR}/${DBNAME}.pid;
# otherwise kills each listed node through ${TMPDIR}/${DBNAME}.<node>.pid.
# Returns:   status of the last kill_by_pidfile call.
stop_all_nodes() {
    # Keep the loop variable local: callers run this from inside their own
    # `for node in $CLUSTER` loops and must not have $node overwritten.
    local node

    if [[ -z "$CLUSTER" ]]; then
        kill_by_pidfile "${TMPDIR}/${DBNAME}.pid"
        return
    fi

    # CLUSTER is a whitespace-separated list, so it is split on purpose.
    for node in $CLUSTER; do
        kill_by_pidfile "${TMPDIR}/${DBNAME}.${node}.pid"
    done
}
405+
406+
# Start the test database on every node, in the background.
# Globals:   CLUSTER, DBNAME, DBDIR, TMPDIR, TESTDIR, COMDB2_EXE,
#            DEBUG_PREFIX (read)
# Without CLUSTER a single local instance is started. Otherwise one instance
# is started per node: directly when the node name equals $(hostname), via
# ssh for any other node. Each node's previous log is first moved aside to
# <log>.1 (numbered backups), and output is timestamped by gawk into
# $TESTDIR/logs.
# All working variables are local so nothing leaks into the caller's scope.
start_all_nodes() {
    local SSH="ssh -n -o StrictHostKeyChecking=no -tt"
    local LOGDIR=$TESTDIR/logs
    local PARAMS="$DBNAME --no-global-lrl"
    local CMD node

    if [[ -z "$CLUSTER" ]]; then
        mv --backup=numbered "$LOGDIR/${DBNAME}.db" "$LOGDIR/${DBNAME}.db.1"
        # DEBUG_PREFIX and PARAMS are word-split on purpose.
        ${DEBUG_PREFIX} ${COMDB2_EXE} ${PARAMS} --lrl ${DBDIR}/${DBNAME}.lrl -pidfile ${TMPDIR}/${DBNAME}.pid 2>&1 | gawk '{ print strftime("%H:%M:%S>"), $0; fflush(); }' >"$LOGDIR/${DBNAME}.db" 2>&1 &
        return
    fi

    for node in $CLUSTER; do
        mv --backup=numbered "$LOGDIR/${DBNAME}.${node}.db" "$LOGDIR/${DBNAME}.${node}.db.1"
        # [[ ]] with quoted operands: an empty node or hostname compares as
        # a string instead of raising a test syntax error.
        if [[ "$node" == "$(hostname)" ]]; then
            ${DEBUG_PREFIX} ${COMDB2_EXE} ${PARAMS} --lrl ${DBDIR}/${DBNAME}.lrl -pidfile ${TMPDIR}/${DBNAME}.$node.pid 2>&1 | gawk '{ print strftime("%H:%M:%S>"), $0; fflush(); }' >"$LOGDIR/${DBNAME}.${node}.db" 2>&1 &
        else
            CMD="source ${TESTDIR}/replicant_vars ; ${COMDB2_EXE} ${PARAMS} --lrl ${DBDIR}/${DBNAME}.lrl -pidfile ${TMPDIR}/${DBNAME}.${node}.pid"
            # NOTE(review): `2>&1` comes before the stdout redirection, so the
            # local ssh process's own stderr goes to this script's stdout, not
            # to the log - confirm this is intended.
            $SSH $node ${DEBUG_PREFIX} ${CMD} 2>&1 </dev/null > >(gawk '{ print strftime("%H:%M:%S>"), $0; fflush(); }' >> "$LOGDIR/${DBNAME}.${node}.db") &
            # For remote nodes the local pidfile holds the background ssh
            # job's PID.
            echo $! > "${TMPDIR}/${DBNAME}.${node}.pid"
        fi
    done
}

0 commit comments

Comments
 (0)