Skip to content

Commit 036f22c

Browse files
committed
fix longreq logs
Signed-off-by: Emelia Lei <wlei29@bloomberg.net>
1 parent a01ba24 commit 036f22c

File tree

7 files changed

+179
-29
lines changed

7 files changed

+179
-29
lines changed

db/osql_srs.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,7 @@ static int srs_tran_replay_int(struct sqlclntstate *clnt, int(dispatch_fn)(struc
321321
clnt->start_gen = bdb_get_rep_gen(thedb->bdb_env);
322322
LISTC_FOR_EACH(&osql->history->lst, item, lnk)
323323
{
324+
clnt->done = 0; /* reset done flag */
324325
restore_stmt(clnt, item);
325326
if ((rc = dispatch_fn(clnt)) != 0)
326327
break;
@@ -373,6 +374,7 @@ static int srs_tran_replay_int(struct sqlclntstate *clnt, int(dispatch_fn)(struc
373374
}
374375

375376
osql_set_replay(__FILE__, __LINE__, clnt, OSQL_RETRY_NONE);
377+
clnt->verify_retries = 0;
376378

377379
return rc;
378380
}

db/reqlog.c

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1603,7 +1603,7 @@ static void log_header_ll(struct reqlogger *logger, struct output *out,
16031603
struct reqlog_print_callback_args args;
16041604

16051605
if (out == long_request_out) {
1606-
dumpf(logger, out, "LONG REQUEST %d msec ", U2M(logger->durationus));
1606+
dumpf(logger, out, "LONG REQUEST %d msec", U2M(logger->durationus));
16071607
} else {
16081608
dumpf(logger, out, "%s %d msec ", logger->request_type,
16091609
U2M(logger->durationus));
@@ -1614,8 +1614,7 @@ static void log_header_ll(struct reqlogger *logger, struct output *out,
16141614
if (gbl_fingerprint_queries && logger->have_fingerprint) {
16151615
char expanded_fp[2 * FINGERPRINTSZ + 1];
16161616
util_tohex(expanded_fp, logger->fingerprint, FINGERPRINTSZ);
1617-
dumpf(logger, out, "for fingerprint %.*s", FINGERPRINTSZ * 2,
1618-
expanded_fp);
1617+
dumpf(logger, out, " for fingerprint %.*s", FINGERPRINTSZ * 2, expanded_fp);
16191618
}
16201619

16211620
struct sqlclntstate *clnt = logger->clnt;
@@ -1630,6 +1629,10 @@ static void log_header_ll(struct reqlogger *logger, struct output *out,
16301629
dumpf(logger, out, " clntapi %s", logger->api_type);
16311630
}
16321631

1632+
if (logger->vreplays) {
1633+
dumpf(logger, out, " verify replays=%d", logger->vreplays);
1634+
}
1635+
16331636
if (out == long_request_out && is_running) {
16341637
/* No need to include 'rc' if the request is still running. */
16351638
dumpf(logger, out, " **running**\n");
@@ -1921,6 +1924,16 @@ inline void reqlog_set_rqid(struct reqlogger *logger, void *id, int idlen)
19211924
logger->have_id = 1;
19221925
}
19231926

1927+
void reqlog_set_rqid_from_clnt(struct reqlogger *logger, struct sqlclntstate *clnt)
1928+
{
1929+
unsigned long long rqid = clnt->osql.rqid;
1930+
if (rqid != 0 && rqid != OSQL_RQID_USE_UUID)
1931+
reqlog_set_rqid(logger, &rqid, sizeof(rqid));
1932+
else if (!comdb2uuid_is_zero(clnt->osql.uuid))
1933+
/* have an "id_set" instead? */
1934+
reqlog_set_rqid(logger, clnt->osql.uuid, sizeof(uuid_t));
1935+
}
1936+
19241937
static int current_long_request_count = 0;
19251938
static int current_long_request_duration_ms = 0;
19261939
static int current_longest_long_request_ms = 0;
@@ -2019,6 +2032,8 @@ void reqlog_long_running_clnt(struct sqlclntstate *clnt)
20192032

20202033
logger.rc = 0;
20212034

2035+
reqlog_set_rqid_from_clnt(&logger, clnt);
2036+
20222037
/* Should this long running statement be reported at this instant? */
20232038

20242039
int duration_beyond_thresh_sec = (duration_ms - gbl_sql_time_threshold) / 1000;
@@ -2062,10 +2077,6 @@ void reqlog_long_running_clnt(struct sqlclntstate *clnt)
20622077
reqlog_set_fingerprint(&logger, (const char *)fp, FINGERPRINTSZ);
20632078
}
20642079

2065-
if (logger.vreplays) {
2066-
reqlog_logf(&logger, REQL_INFO, "verify replays=%d", logger.vreplays);
2067-
}
2068-
20692080
flushdump(&logger, NULL);
20702081

20712082
log_header(&logger, long_request_out, 1);

db/reqlog.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ uint64_t reqlog_get_queue_time(const struct reqlogger *logger);
8989
void reqlog_reset_fingerprint(struct reqlogger *logger, size_t n);
9090
void reqlog_set_fingerprint(struct reqlogger *logger, const char *fp, size_t n);
9191
void reqlog_set_rqid(struct reqlogger *logger, void *id, int idlen);
92+
void reqlog_set_rqid_from_clnt(struct reqlogger *logger, struct sqlclntstate *clnt);
9293
void reqlog_set_event(struct reqlogger *logger, evtype_t evtype);
9394
evtype_t reqlog_get_event(struct reqlogger *logger);
9495
void reqlog_add_table(struct reqlogger *logger, const char *table);

db/sqlinterfaces.c

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,8 @@ void handle_sql_intrans_unrecoverable_error(struct sqlclntstate *clnt);
203203
void comdb2_set_sqlite_vdbe_dtprec(Vdbe *p);
204204
static int execute_sql_query_offload(struct sqlthdstate *, struct sqlclntstate *);
205205
static int record_query_cost(struct sql_thread *, struct sqlclntstate *);
206+
static void free_normalized_sql(struct sqlclntstate *clnt);
207+
void free_original_normalized_sql(struct sqlclntstate *clnt);
206208

207209
static int sql_debug_logf_int(struct sqlclntstate *clnt, const char *func,
208210
int line, const char *fmt, va_list args)
@@ -1247,13 +1249,7 @@ static void sql_statement_done(struct sql_thread *thd, struct reqlogger *logger,
12471249

12481250
thd->crtshard = 0;
12491251

1250-
unsigned long long rqid = clnt->osql.rqid;
1251-
if (rqid != 0 && rqid != OSQL_RQID_USE_UUID)
1252-
reqlog_set_rqid(logger, &rqid, sizeof(rqid));
1253-
else if (!comdb2uuid_is_zero(clnt->osql.uuid)) {
1254-
/* have an "id_set" instead? */
1255-
reqlog_set_rqid(logger, clnt->osql.uuid, sizeof(uuid_t));
1256-
}
1252+
reqlog_set_rqid_from_clnt(logger, clnt);
12571253

12581254
LISTC_T(struct sql_hist) lst;
12591255
listc_init(&lst, offsetof(struct sql_hist, lnk));
@@ -1266,15 +1262,15 @@ static void sql_statement_done(struct sql_thread *thd, struct reqlogger *logger,
12661262
h->cost.time = comdb2_time_epochms() - thd->startms;
12671263
h->cost.prepTime = thd->prepms;
12681264
h->when = thd->stime;
1269-
h->txnid = rqid;
1265+
h->txnid = clnt->osql.rqid;
12701266

12711267
time_metric_add(thedb->service_time, h->cost.time);
12721268
clnt->last_cost = (int64_t) h->cost.cost;
12731269

12741270
/* request logging framework takes care of logging long sql requests */
12751271
reqlog_set_cost(logger, h->cost.cost);
1276-
if (rqid) {
1277-
reqlog_logf(logger, REQL_INFO, "rqid=%llx", rqid);
1272+
if (clnt->osql.rqid) {
1273+
reqlog_logf(logger, REQL_INFO, "rqid=%llx", clnt->osql.rqid);
12781274
}
12791275
reqlog_set_netwaitus(logger, clnt->netwaitus);
12801276

@@ -1696,17 +1692,15 @@ static void reqlog_setup_begin_commit_rollback(struct sqlthdstate *thd, struct s
16961692
query_stats_setup(thd, clnt);
16971693

16981694
if (reqlog_get_event(thd->logger) == EV_SQL) {
1699-
unsigned char fp[FINGERPRINTSZ];
1700-
size_t len;
17011695
char stmt[9] = {0}; // begin, commit, or rollback
17021696
int i = 0;
17031697
for (const char *s = clnt->sql; *s != '\0' && *s != ' ' && i < sizeof(stmt) - 1; s++, i++) {
17041698
stmt[i] = (char) tolower(*s);
17051699
}
1706-
if (gbl_fingerprint_queries) {
1707-
calc_fingerprint(stmt, &len, fp);
1708-
reqlog_set_fingerprint(thd->logger, (const char *)fp, FINGERPRINTSZ);
1709-
}
1700+
1701+
/* Reset so we don't print the stale fingerprint */
1702+
free_normalized_sql(clnt);
1703+
free_original_normalized_sql(clnt);
17101704
}
17111705
log_queue_time(thd->logger, clnt);
17121706
}
@@ -1838,7 +1832,7 @@ int handle_sql_begin(struct sqlthdstate *thd, struct sqlclntstate *clnt,
18381832

18391833
if (sideeffects == TRANS_CLNTCOMM_NORMAL) {
18401834
/* for chunks and SPs, don't end the request */
1841-
reqlog_end_request(thd->logger, -1, __func__, __LINE__);
1835+
reqlog_end_request(thd->logger, rc, __func__, __LINE__);
18421836
}
18431837

18441838
return rc;
@@ -2392,7 +2386,7 @@ int handle_sql_commitrollback(struct sqlthdstate *thd,
23922386
clnt->osql.xerr.errval = ERR_BLOCK_FAILED + ERR_VERIFY;
23932387
sql_set_sqlengine_state(clnt, __FILE__, __LINE__,
23942388
SQLENG_NORMAL_PROCESS);
2395-
outrc = CDB2ERR_VERIFY_ERROR;
2389+
outrc = rc = CDB2ERR_VERIFY_ERROR;
23962390
Pthread_mutex_lock(&clnt->wait_mutex);
23972391
clnt->ready_for_heartbeats = 0;
23982392
Pthread_mutex_unlock(&clnt->wait_mutex);
@@ -2537,12 +2531,13 @@ int handle_sql_commitrollback(struct sqlthdstate *thd,
25372531

25382532
done:
25392533
reset_clnt_flags(clnt);
2534+
reqlog_set_rqid_from_clnt(thd->logger, clnt);
25402535
if (clnt->osql.sock_started == 0)
25412536
comdb2uuid_clear(clnt->osql.uuid);
25422537

25432538
if (sideeffects == TRANS_CLNTCOMM_NORMAL) {
25442539
/* end request only for non-chunk and non-SP transactions */
2545-
reqlog_end_request(thd->logger, -1, __func__, __LINE__);
2540+
reqlog_end_request(thd->logger, rc, __func__, __LINE__);
25462541
}
25472542

25482543
/* if this is a retry, let the upper layer free the structure */

tests/longreq_sanity.test/Makefile

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
ifeq ($(TESTSROOTDIR),)
2+
include ../testcase.mk
3+
else
4+
include $(TESTSROOTDIR)/testcase.mk
5+
endif
6+
ifeq ($(TEST_TIMEOUT),)
7+
export TEST_TIMEOUT=3m
8+
endif

tests/longreq_sanity.test/runit

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
#!/usr/bin/env bash
2+
bash -n "$0" | exit 1
3+
4+
set -e
5+
dbnm=$1
6+
7+
mach=$(cdb2sql --tabs ${CDB2_OPTIONS} $dbnm default "SELECT comdb2_host()")
8+
9+
if [ "$mach" = "" ]; then
10+
echo "could not retrieve hostname" >&2
11+
exit 1
12+
fi
13+
14+
r="cdb2sql -s ${CDB2_OPTIONS} $dbnm default --host $mach"
15+
16+
if [ -n "$CLUSTER" ] ; then
17+
logfile="$TESTDIR/logs/$dbnm.$mach.db"
18+
else
19+
logfile="$TESTDIR/logs/$dbnm.db"
20+
fi
21+
22+
$r << "EOF" >/dev/null
23+
@send reql longreqfile <stdout>
24+
@send reql longsqlrequest 1
25+
put tunable longreq_log_freq_sec 1
26+
EOF
27+
28+
$r "create table if not exists tlr (id int, v int)" >/dev/null
29+
30+
$r << "EOF" >/dev/null
31+
set maxtransize 200000
32+
insert into tlr select value, value from generate_series(1, 100000)
33+
EOF
34+
35+
36+
# Use a FIFO to control pid1's transaction timing:
37+
# pid1: begin + update, then wait for pid2 to commit a conflicting update, then commit.
38+
# This forces a verify retry on pid1's commit.
39+
fifo=$(mktemp -u)
40+
outfile=$(mktemp)
41+
mkfifo $fifo
42+
43+
$r < $fifo > $outfile 2>&1 &
44+
pid1=$!
45+
exec 3> $fifo
46+
47+
echo "set maxtransize 200000" >&3
48+
echo "begin" >&3
49+
echo "update tlr set v=v+1" >&3
50+
echo "select 'update_done'" >&3
51+
while ! grep -q 'update_done' $outfile; do sleep 0.1; done
52+
53+
$r << "EOF"
54+
set maxtransize 200000
55+
update tlr set v=v-1
56+
EOF
57+
58+
echo "commit" >&3
59+
exec 3>&-
60+
wait $pid1
61+
rm -f $fifo $outfile
62+
63+
sleep 2
64+
65+
# ---- Test 1: begin/commit should not have fingerprint ----
66+
echo "Test 1: begin/commit should not have fingerprint"
67+
68+
if awk '/LONG REQUEST/{hdr=$0} /sql=(begin|commit)/ && hdr ~ /for fingerprint/{found=1} END{exit !found}' $logfile; then
69+
echo "FAIL: begin or commit has fingerprint" >&2
70+
exit 1
71+
fi
72+
73+
echo "Test 1 passed"
74+
75+
# ---- Test 2: begin/commit should not have negative rc ----
76+
echo "Test 2: no negative rc on begin/commit"
77+
78+
if awk '/LONG REQUEST/{hdr=$0} /sql=(begin|commit)/ && hdr ~ /rc -/{found=1} END{exit !found}' $logfile; then
79+
echo "FAIL: begin or commit has negative rc" >&2
80+
exit 1
81+
fi
82+
83+
echo "Test 2 passed"
84+
85+
# ---- Test 3: **running** logs appear during verify retry with replay count ----
86+
echo "Test 3: running logs have verify replays during retry"
87+
88+
if ! grep -q 'verify replays=.*\*\*running\*\*' $logfile; then
89+
echo "FAIL: no **running** log with verify replays found" >&2
90+
exit 1
91+
fi
92+
93+
echo "Test 3 passed"
94+
95+
# ---- Test 4: at least one **running** log has rqid ----
96+
echo "Test 4: at least one running log has rqid"
97+
98+
# rqid may be empty after watchdog kicks off for a long running transaction
99+
# but before osql session starts and assigns an uuid.
100+
# Check that at least one running log has a non-empty rqid.
101+
if ! grep '\*\*running\*\*' $logfile | grep -q 'rqid [0-9a-f]'; then
102+
echo "FAIL: no **running** log has rqid" >&2
103+
exit 1
104+
fi
105+
106+
echo "Test 4 passed"
107+
108+
# ---- Test 5: final commit completion log has rqid ----
109+
echo "Test 5: commit completion has rqid"
110+
111+
if awk '/LONG REQUEST/{hdr=$0} /sql=commit,/ && hdr ~ /rqid from/{found=1} END{exit !found}' $logfile; then
112+
echo "FAIL: commit completion log has empty rqid" >&2
113+
exit 1
114+
fi
115+
116+
echo "Test 5 passed"
117+
118+
# ---- Test 6: verify replays do not leak across transactions ----
119+
echo "Test 6: verify replays do not leak to next transaction"
120+
121+
# Run another long transaction after the replay test
122+
$r << "EOF"
123+
set maxtransize 200000
124+
update tlr set v=v+2
125+
EOF
126+
127+
# Check that the most recent update log does not have verify replays
128+
if tail -20 $logfile | grep 'sql=UPDATE tlr SET v = v + 2' | grep -q 'verify replays'; then
129+
echo "FAIL: verify replays leaked to subsequent transaction" >&2
130+
exit 1
131+
fi
132+
133+
echo "Test 6 passed"
134+
135+
echo "All tests passed."
136+
exit 0

tests/replay_eventlog.test/fingerprints_freq.exp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
1000 7b3aa6d46d54b8a092b8c3e74a1fc7d0 insert into t1( alltype
1616
100 8b048737cee209884c0eb5b86f9815f7 insert into t1(alltypes_
1717
1 8be5e8c404c7e6c72b3a5d926cc150fd create table t1 { // th
18-
2 8d589afa4dfaeeed85fff5aa78e5ff6a begin
1918
500 92e6e3090f8f99f9ed0b9015f849d8e3 insert into t1(alltypes_
2019
8 aa98684e80e6fa9bd1cb3ee39443df7e select count(*) from t1
2120
500 aefd75efcaf1d7e65205373fc5a09b89 update t1 set alltypes_s
@@ -26,5 +25,3 @@
2625
110 e4510162feae7bd9a746c798ad56027d insert into t1(alltypes_
2726
10 f2c85b80eb883ee3c1eb184a8bca4e4d update t1 set alltypes_u
2827
1 f49bb0b4877aac4df9adb8d4e097e140 select * from t1 order b
29-
1 fff66e9b3d962fa319c8068b5c1997cd rollback
30-
1 fffca4d67ea0a788813031b8bbc3b329 commit

0 commit comments

Comments
 (0)