Skip to content

Commit 0cb2396

Browse files
author
Rishitha Kalicheti
committed
{182781126} fix api_history_hung
Signed-off-by: Rishitha Kalicheti <rkalicheti1@bloomberg.com>
1 parent fbfeb73 commit 0cb2396

File tree

11 files changed

+216
-73
lines changed

11 files changed

+216
-73
lines changed

db/api_history.c

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -121,28 +121,20 @@ int free_api_history(api_history_t *api_history)
121121
return 0;
122122
}
123123

124-
api_driver_t *get_next_api_history_entry(api_history_t *api_history, void **curr, unsigned int *iter)
124+
int hash_for_api_history(api_history_t *api_history, hashforfunc_t *func, void *arg)
125125
{
126126
assert(api_history);
127127
acquire_api_history_lock(api_history, 0);
128-
api_driver_t *entry;
129-
130-
if (!*curr && !*iter) {
131-
entry = (api_driver_t *)hash_first(api_history->entries, curr, iter);
132-
} else {
133-
entry = (api_driver_t *)hash_next(api_history->entries, curr, iter);
134-
}
135-
128+
int rc = hash_for(api_history->entries, func, arg);
136129
release_api_history_lock(api_history);
137-
return entry;
130+
return rc;
138131
}
139132

140-
int get_num_api_history_entries(api_history_t *api_history) {
133+
int get_num_api_history_entries(api_history_t *api_history)
134+
{
141135
assert(api_history);
142136
acquire_api_history_lock(api_history, 0);
143-
144137
int num = hash_get_num_entries(api_history->entries);
145-
146138
release_api_history_lock(api_history);
147139
return num;
148140
}

db/api_history.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#define _API_HISTORY_H_
1919

2020
#include <sys/time.h>
21+
#include <plhash_glue.h>
2122

2223
typedef struct api_driver {
2324
char *name;
@@ -31,7 +32,7 @@ void acquire_api_history_lock(api_history_t*, int);
3132
void release_api_history_lock(api_history_t*);
3233
api_history_t *init_api_history();
3334
int free_api_history(api_history_t*);
34-
api_driver_t *get_next_api_history_entry(api_history_t*, void**, unsigned int*);
35+
int hash_for_api_history(api_history_t *, hashforfunc_t *func, void *arg);
3536
int get_num_api_history_entries(api_history_t*);
3637
int update_api_history(api_history_t*, char*, char*);
3738

db/comdb2.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ void berk_memp_sync_alarm_ms(int);
8080
#include "sql.h"
8181
#include "logmsg.h"
8282
#include "reqlog.h"
83+
#include "reqlog_int.h"
8384

8485
#include "comdb2_trn_intrl.h"
8586
#include "history.h"
@@ -1799,6 +1800,7 @@ void clean_exit(void)
17991800

18001801
backend_cleanup(thedb);
18011802
net_cleanup();
1803+
cleanup_clientstats();
18021804

18031805
if (gbl_ssl_ctx != NULL) {
18041806
SSL_CTX_free(gbl_ssl_ctx);

db/reqlog.c

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2664,7 +2664,6 @@ static int release_clientstats(unsigned checksum, int node)
26642664
rc = -1;
26652665
goto done;
26662666
}
2667-
26682667
Pthread_mutex_lock(&entry->mtx);
26692668
entry->ref--;
26702669
update_clientstats_cache(entry);
@@ -2676,20 +2675,34 @@ static int release_clientstats(unsigned checksum, int node)
26762675
return rc;
26772676
}
26782677

2679-
nodestats_t *get_next_clientstats_entry(void **curr, unsigned int *iter)
2678+
static int free_nodestats_entry(void *elem, void *unused)
26802679
{
2681-
assert(clientstats);
2682-
acquire_clientstats_lock(0);
2683-
nodestats_t *entry;
2684-
2685-
if (!*curr && !*iter) {
2686-
entry = (nodestats_t *)hash_first(clientstats, curr, iter);
2687-
} else {
2688-
entry = (nodestats_t *)hash_next(clientstats, curr, iter);
2680+
nodestats_t *entry = (nodestats_t *)elem;
2681+
if (entry->rawtotals.fingerprints) {
2682+
hash_for(entry->rawtotals.fingerprints, hash_free_element, NULL);
2683+
hash_free(entry->rawtotals.fingerprints);
26892684
}
2690-
2685+
free_api_history(entry->rawtotals.api_history);
2686+
time_metric_free(entry->rawtotals.svc_time);
2687+
Pthread_mutex_destroy(&entry->mtx);
2688+
Pthread_mutex_destroy(&entry->rawtotals.lk);
2689+
free(entry);
2690+
return 0;
2691+
}
2692+
2693+
void cleanup_clientstats(void)
2694+
{
2695+
acquire_clientstats_lock(1);
2696+
hash_for(clientstats, free_nodestats_entry, NULL);
2697+
hash_free(clientstats);
2698+
clientstats = NULL;
26912699
release_clientstats_lock();
2692-
return entry;
2700+
}
2701+
2702+
int hash_for_clientstats(hashforfunc_t *func, void *arg)
2703+
{
2704+
assert(clientstats);
2705+
return hash_for(clientstats, func, arg);
26932706
}
26942707

26952708
struct rawnodestats *get_raw_node_stats(const char *task, const char *stack,

db/reqlog_int.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include "cson.h"
77
#include "sql.h"
88
#include "reqlog.h"
9+
#include <plhash_glue.h>
910

1011
/* This used to be private to reqlog. Moving to a shared header since
1112
eventlog also needs access to reqlog internals. I am not sure
@@ -241,7 +242,8 @@ typedef struct nodestats {
241242

242243
void acquire_clientstats_lock(int);
243244
void release_clientstats_lock();
244-
nodestats_t *get_next_clientstats_entry(void**, unsigned int*);
245+
void cleanup_clientstats(void);
246+
int hash_for_clientstats(hashforfunc_t *func, void *arg);
245247

246248
extern int gbl_time_fdb;
247249

sqlite/ext/comdb2/api_history.c

Lines changed: 59 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include <stdlib.h>
1919
#include <string.h>
2020
#include <stddef.h>
21+
#include <unistd.h>
2122

2223
#include "comdb2.h"
2324
#include "reqlog_int.h"
@@ -37,57 +38,75 @@ typedef struct systable_api_history {
3738
cdb2_client_datetime_t last_seen;
3839
} systable_api_history_t;
3940

40-
static void append_entries(systable_api_history_t **data, int *nrecords, api_history_t *api_history, const char *host, const char *task)
41+
typedef struct api_history_collect_ctx {
42+
systable_api_history_t *cursor;
43+
const char *host;
44+
const char *task;
45+
int nrecords;
46+
} api_history_collect_ctx_t;
47+
48+
static int collect_api_history_rows(void *obj, void *arg)
4149
{
42-
acquire_api_history_lock(api_history, 0);
43-
int num_entries = get_num_api_history_entries(api_history);
44-
assert(num_entries > -1);
45-
if (!num_entries) {
46-
release_api_history_lock(api_history);
47-
return;
48-
}
50+
api_driver_t *entry = (api_driver_t *)obj;
51+
api_history_collect_ctx_t *ctx = (api_history_collect_ctx_t *)arg;
52+
systable_api_history_t *row = ctx->cursor;
53+
row->host = strdup(ctx->host ? ctx->host : "unknown");
54+
row->task = strdup(ctx->task ? ctx->task : "unknown");
55+
row->api_driver_name = strdup(entry->name ? entry->name : "unknown");
56+
row->api_driver_version = strdup(entry->version ? entry->version : "unknown");
57+
dttz_t dt = (dttz_t){.dttz_sec = entry->last_seen ? entry->last_seen : time(NULL)};
58+
dttz_to_client_datetime(&dt, "UTC", &row->last_seen);
59+
ctx->cursor++;
60+
ctx->nrecords++;
61+
return 0;
62+
}
4963

50-
int prev = *nrecords;
51-
*nrecords += num_entries;
52-
systable_api_history_t *buffer = realloc(*data, *nrecords * sizeof(systable_api_history_t));
53-
54-
void *curr = NULL;
55-
unsigned int iter = 0;
56-
for (unsigned int i = prev; i < *nrecords; i++) {
57-
api_driver_t *entry = get_next_api_history_entry(api_history, &curr, &iter);
58-
assert(entry);
59-
60-
buffer[i].host = strdup(host);
61-
buffer[i].task = strdup(task);
62-
buffer[i].api_driver_name = strdup(entry->name);
63-
buffer[i].api_driver_version = strdup(entry->version);
64-
65-
dttz_t dt = (dttz_t){.dttz_sec = entry->last_seen};
66-
dttz_to_client_datetime(&dt, "UTC", &buffer[i].last_seen);
67-
}
64+
static int count_api_entries(void *obj, void *arg)
65+
{
66+
nodestats_t *entry = (nodestats_t *)obj;
67+
int *total = (int *)arg;
68+
if (entry->rawtotals.api_history)
69+
*total += get_num_api_history_entries(entry->rawtotals.api_history);
70+
return 0;
71+
}
6872

69-
*data = buffer;
70-
release_api_history_lock(api_history);
73+
static int collect_node_api_history(void *obj, void *arg)
74+
{
75+
nodestats_t *entry = (nodestats_t *)obj;
76+
api_history_collect_ctx_t *ctx = (api_history_collect_ctx_t *)arg;
77+
ctx->host = entry->host;
78+
ctx->task = entry->task;
79+
if (entry->rawtotals.api_history)
80+
hash_for_api_history(entry->rawtotals.api_history, collect_api_history_rows, ctx);
81+
return 0;
7182
}
7283

7384
int init_api_history_data(void **data, int *nrecords)
7485
{
7586
*nrecords = 0;
76-
systable_api_history_t *systable = calloc(*nrecords, sizeof(systable_api_history_t));
87+
7788
acquire_clientstats_lock(0);
78-
79-
void *curr = NULL;
80-
unsigned int iter = 0;
81-
nodestats_t *entry = get_next_clientstats_entry(&curr, &iter);
82-
83-
while (entry) {
84-
assert(entry->rawtotals.api_history);
85-
Pthread_mutex_lock(&entry->rawtotals.lk);
86-
append_entries(&systable, nrecords, entry->rawtotals.api_history, entry->host, entry->task);
87-
Pthread_mutex_unlock(&entry->rawtotals.lk);
88-
entry = get_next_clientstats_entry(&curr, &iter);
89+
90+
int total_entries = 0;
91+
hash_for_clientstats(count_api_entries, &total_entries);
92+
93+
if (total_entries == 0) { //no clientstats so no api history entries
94+
*data = NULL;
95+
release_clientstats_lock();
96+
return 0;
97+
}
98+
99+
systable_api_history_t *systable = calloc(total_entries, sizeof(systable_api_history_t));
100+
if (!systable) {
101+
logmsg(LOGMSG_ERROR, "%s: out of memory for %d entries\n", __func__, total_entries);
102+
release_clientstats_lock();
103+
return SQLITE_NOMEM;
89104
}
90105

106+
api_history_collect_ctx_t ctx = { .cursor = systable, .host = NULL, .task = NULL, .nrecords = 0 };
107+
hash_for_clientstats(collect_node_api_history, &ctx);
108+
109+
*nrecords = ctx.nrecords;
91110
*data = systable;
92111
release_clientstats_lock();
93112
return 0;

tests/comdb2sys.test/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,5 @@ else
44
include $(TESTSROOTDIR)/testcase.mk
55
endif
66
ifeq ($(TEST_TIMEOUT),)
7-
export TEST_TIMEOUT=6m
7+
export TEST_TIMEOUT=10m
88
endif

tests/comdb2sys.test/lrl.options

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,5 @@ table t2 t2.csc2
55
table t3 t3.csc2
66
table t4 t4.csc2
77
enable_partial_indexes
8+
logmsg level debug
9+
do reql events on
Lines changed: 100 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,113 @@
11
#!/bin/bash
22

3+
bash -n "$0" | exit 1
4+
source ${TESTSROOTDIR}/tools/runit_common.sh
35
a_dbn=$1
46

5-
replicant=$(cdb2sql ${CDB2_OPTIONS} $a_dbn default 'select host from comdb2_cluster where is_master="N" limit 1')
7+
if [[ "x${DEBUGGER}" != "x" ]] ; then
8+
cdb2sql="$DEBUG_PREFIX ${CDB2SQL_EXE}"
9+
else
10+
cdb2sql="${CDB2SQL_EXE}"
11+
fi
12+
13+
TMPDIR=${TMPDIR:-${TESTDIR}/tmp}
14+
15+
export DUMPLOCK_ON_TIMEOUT=1
16+
export CORE_ON_TIMEOUT=1
17+
18+
master=$($cdb2sql ${CDB2_OPTIONS} $a_dbn default 'select host from comdb2_cluster where is_master="Y" limit 1')
19+
master=$(echo $master | grep -oP \'\(.*?\)\')
20+
master=${master:1:-1}
21+
22+
replicant=$($cdb2sql ${CDB2_OPTIONS} $a_dbn default "select host from comdb2_cluster where is_master='N' limit 1")
623
replicant=$(echo $replicant | grep -oP \'\(.*?\)\')
724
replicant=${replicant:1:-1}
825

9-
cdb2sql ${CDB2_OPTIONS} $a_dbn default --host $replicant "SELECT 1"
10-
cdb2sql ${CDB2_OPTIONS} $a_dbn default --host $replicant "SELECT CASE WHEN length(host) > 0 THEN 'hostname' ELSE '' END AS host, task, api_driver_name, api_driver_version FROM comdb2_api_history WHERE task='cdb2sql'" > testapihistory.out
26+
if [[ -z "$replicant" ]]; then
27+
echo "Failed to find replicant host for test, requires cluster!"
28+
exit 1
29+
fi
30+
31+
$cdb2sql ${CDB2_OPTIONS} $a_dbn default --host $replicant "SELECT 1"
32+
$cdb2sql ${CDB2_OPTIONS} $a_dbn default --host $replicant "SELECT CASE WHEN length(host) > 0 THEN 'hostname' ELSE '' END AS host, task, api_driver_name, api_driver_version FROM comdb2_api_history WHERE task='cdb2sql'" > testapihistory.out
1133
diff testapihistory.out testapihistory.expected >/dev/null
1234
rc=$?
1335
if [[ $rc -ne 0 ]]; then
1436
echo "Failed systable comdb2_api_history test"
15-
echo diff $(pwd)/testapihistory.out $(pwd)/testapihistory.expected
37+
echo $(diff $(pwd)/testapihistory.out $(pwd)/testapihistory.expected)
38+
fi
39+
40+
echo "Running concurrent api_history access regression test..."
41+
42+
43+
end=$(( $(date +%s) + 5 ))
44+
45+
(
46+
$cdb2sql ${CDB2_OPTIONS} $a_dbn --host $replicant default "drop table if exists t" >/dev/null 2>&1
47+
$cdb2sql ${CDB2_OPTIONS} $a_dbn --host $replicant default "create table t (i int)" >/dev/null 2>&1
48+
i=1
49+
while [[ $(date +%s) -lt $end ]]; do
50+
$cdb2sql ${CDB2_OPTIONS} $a_dbn default "insert into t select * from generate_series( $(( $i+1 )) , $(( $i+1000 )) );" >/dev/null 2>&1
51+
[[ $? -ne 0 ]] && echo "Error inserting into t1" && exit 1
52+
i=$((i+1000))
53+
done
54+
echo "rows inserted into t: $i"
55+
56+
)
57+
58+
59+
$cdb2sql ${CDB2_OPTIONS} $a_dbn default --host $replicant "EXEC PROCEDURE sys.cmd.send('memstat util')" >> testapihistory.out 2>&1
60+
$cdb2sql ${CDB2_OPTIONS} $a_dbn default --host $replicant "SELECT SUM(used) FROM comdb2_memstats WHERE name='util'" >> testapihistory.out 2>&1
61+
62+
$cdb2sql ${CDB2_OPTIONS} $a_dbn default --host $replicant "PUT TUNABLE 'max_clientstats' '10000'"
63+
64+
select_1_in_loop() {
65+
66+
for i in $(seq 1 1000); do
67+
ln -sf "$cdb2sql" /tmp/client_$i
68+
/tmp/client_$i ${CDB2_OPTIONS} $a_dbn default "SELECT 1" > /dev/null 2>&1
69+
done
70+
71+
}
72+
73+
echo "Checking clientstats count..."
74+
$cdb2sql ${CDB2_OPTIONS} $a_dbn default --host $replicant "SELECT count(*) FROM comdb2_clientstats"
75+
76+
$cdb2sql ${CDB2_OPTIONS} $a_dbn default --host $replicant "EXEC PROCEDURE sys.cmd.send('memstat util')" >> testapihistory.out 2>&1
77+
$cdb2sql ${CDB2_OPTIONS} $a_dbn default --host $replicant "SELECT SUM(used) FROM comdb2_memstats WHERE name='util'" >> testapihistory.out 2>&1
78+
79+
echo "selecting from comdb2_api_history"
80+
81+
select_1_in_loop
82+
wait $!
83+
84+
$cdb2sql ${CDB2_OPTIONS} $a_dbn --host $replicant default "select count(*) from comdb2_api_history" &
85+
$cdb2sql ${CDB2_OPTIONS} $a_dbn --host $replicant default "select count(*) from comdb2_api_history" &
86+
87+
wait
88+
89+
select_1_in_loop &
90+
91+
$cdb2sql ${CDB2_OPTIONS} $a_dbn --host $replicant default "select distinct api_driver_name from comdb2_api_history" &
92+
$cdb2sql ${CDB2_OPTIONS} $a_dbn --host $replicant default "select count(*) from comdb2_api_history" &
93+
94+
wait
95+
if [[ $? -ne 0 ]]; then
96+
echo "couldn't select from comdb2_api_history"
97+
exit 1
1698
fi
1799

18-
exit $rc
100+
wait
101+
102+
$cdb2sql ${CDB2_OPTIONS} $a_dbn --host $replicant default "select * from comdb2_locks where object=''" >> testapihistory.out 2>&1
103+
$cdb2sql ${CDB2_OPTIONS} $a_dbn default "select 1" >> testapihistory.out 2>&1
104+
for node in $CLUSTER; do
105+
$cdb2sql ${CDB2_OPTIONS} $a_dbn default "select 1" >> testapihistory.out 2>&1
106+
if [[ $? -ne 0 ]]; then
107+
echo "couldn't select 1"
108+
exit 1
109+
fi
110+
done
111+
112+
echo "PASSED: Concurrent api_history test"
113+
exit 0

0 commit comments

Comments
 (0)