Skip to content

Commit 1993e00

Browse files
authored
Reworked append-table tests, improved coverage of snapshot isolation level, revisited serializable coverage. (#127)
* Added transaction isolation level to YSQL workloads and appended tests. * Rename RC_OL_TEST timeout variable and adjust concurrency for append-table test. * Updated `run-jepsen.py` to include `has_valid_unknown` support and adjusted concurrency for `append-table` tests. * Reduced YSQL serializable test coverage in `run-jepsen.py` to align with RC and SI focus. * Reduced YSQL serializable test coverage in `run-jepsen.py` to align with RC and SI focus. * Removed `sz.pl.geo.append` tests from YSQL workloads in `run-jepsen.py` to streamline test coverage. * Added dedicated table-specific workloads (`SI`, `RC`, `Serializable`) for `append-table` tests and updated core mappings in YSQL workloads. Removed unnecessary log statements. * Updated `append-table` to use deterministic primary key for consistent ordering, replaced `insert!` with `insert-using-count!`, and streamlined transaction handling logic. * Added logging to indicate whether queries use `IndexOnlyScan` or `SeqScan` across YSQL workloads. * Added detailed logging for table creation and indexing in `append_table` and fixed JVM options typo in `project.clj`. * Added detailed logging for table creation and indexing in `append_table` and fixed JVM options typo in `project.clj`. * Added detailed logging for table creation and indexing in `append_table` and fixed JVM options typo in `project.clj`. * Added detailed logging for table creation and indexing in `append_table` and fixed JVM options typo in `project.clj`. * Added detailed logging for table creation and indexing in `append_table` and fixed JVM options typo in `project.clj`. * WIP * WIP * WIP * WIP * WIP * WIP * Expanded YSQL test coverage by adding new Serializable isolation tests.
1 parent de56b99 commit 1993e00

File tree

15 files changed

+309
-111
lines changed

15 files changed

+309
-111
lines changed

yugabyte/project.clj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,6 @@
1414
[version-clj "2.0.2"]
1515
[clj-wallhack "1.0.1"]]
1616
:main yugabyte.runner
17-
:jvm-opts ["-Djava.awt.headless=true" "-Xms4g" "-Xmx8g"])
17+
:jvm-opts ["-Djava.awt.headless=true" "-Xms4g" "-Xmx10g"])
1818
; :aot [yugabyte.runner
1919
; clojure.tools.logging.impl])

yugabyte/run-jepsen.py

Lines changed: 67 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,8 @@
4444
'returncode',
4545
'timed_out',
4646
'everything_looks_good',
47-
'cycle_search_timeout_only'])
47+
'cycle_search_timeout_only',
48+
'has_valid_unknown'])
4849

4950

5051
def is_cycle_search_timeout_only(lines):
@@ -85,7 +86,7 @@ def is_cycle_search_timeout_only(lines):
8586
# The set test might time out if you let it run for 10 minutes and leave 10 more
8687
# minutes for analysis, so cut its running time in half.
8788
SINGLE_TEST_RUN_TIME_FOR_SET_TEST = 300
88-
SINGLE_TEST_RUN_TIME_FOR_RC_OL_TEST = 300
89+
SINGLE_TEST_RUN_TIME_FOR_RC_APPEND_TEST = 300
8990

9091
TEST_AND_ANALYSIS_TIMEOUT_SEC = 1200 # Includes test results analysis.
9192
DEFAULT_TARBALL_URL = "https://downloads.yugabyte.com/yugabyte-1.3.1.0-linux.tar.gz"
@@ -109,56 +110,56 @@ def is_cycle_search_timeout_only(lines):
109110
# YSQL serializable
110111
"ysql/sz.counter",
111112
"ysql/sz.set",
112-
"ysql/sz.bank",
113113
"ysql/sz.bank-contention",
114114
"ysql/sz.bank-multitable",
115115
"ysql/sz.long-fork",
116116
"ysql/sz.single-key-acid",
117117
"ysql/sz.multi-key-acid",
118118
"ysql/sz.default-value",
119-
"ysql/sz.ol.append",
120119

121120
# YSQL snapshot isolation
122-
"ysql/si.ol.append",
123-
"ysql/si.bank",
124121
"ysql/si.bank-contention",
125122
"ysql/si.bank-multitable",
123+
"ysql/si.counter",
124+
"ysql/si.set",
126125
]
127126
},
128127
{
129-
"start_version": "2.13.1.0-b1",
130-
"tests": [
131-
# YSQL read committed
132-
"ysql/rc.ol.append",
133-
]
134-
},
135-
{
128+
# RC pessimistic locking available since 2.15
136129
"start_version": "2.15.0.0-b1",
137130
"tests": [
138-
"ysql/rc.pl.append",
131+
"ysql/rc.append",
139132
]
140133
},
141134
{
135+
# SI pessimistic locking available since 2.17.2
142136
"start_version": "2.17.2.0-b1",
143137
"tests": [
144-
"ysql/si.pl.append",
138+
"ysql/si.append",
145139
]
146140
},
147141
{
148142
"start_version": "2.18.0.0-b1",
149143
"tests": [
150-
"ysql/rc.pl.geo.append",
151-
"ysql/si.pl.geo.append",
152-
"ysql/sz.pl.geo.append",
153-
"ysql/rc.ol.geo.append",
154-
"ysql/si.ol.geo.append",
155-
"ysql/sz.ol.geo.append",
144+
"ysql/rc.geo.append",
145+
"ysql/si.geo.append",
146+
"ysql/sz.geo.append",
156147
]
157148
},
158149
{
150+
# SZ pessimistic locking available since 2.20
159151
"start_version": "2.20.0.0-b1",
160152
"tests": [
161-
"ysql/sz.pl.append",
153+
"ysql/sz.append",
154+
]
155+
},
156+
{
157+
"start_version": "2.29.0.0-b500",
158+
"start_version_stable": "2026.1.0.0-b1",
159+
"tests": [
160+
"ysql/sz.append-table",
161+
"ysql/si.append-table",
162+
"ysql/rc.append-table",
162163
]
163164
}
164165
]
@@ -182,12 +183,24 @@ def is_cycle_search_timeout_only(lines):
182183
child_processes = []
183184

184185

185-
def get_workload_version(workload):
186+
def is_stable_version(version):
187+
"""Check if version uses the stable/production format (2024.x, 2025.x, etc.)
188+
Master versions use 2.x format (e.g. 2.29.0.0), stable use year-based (e.g. 2025.2.0.0)."""
189+
first = int(re.split(r'\.|-b', version)[0])
190+
return first >= 2024
191+
192+
193+
def get_workload_version(workload, target_version=None):
194+
"""Get the minimum version for a workload. When target_version is a stable/production
195+
release and the workload has a start_version_stable, use that instead of the master
196+
start_version."""
186197
for el in TEST_PER_VERSION:
187198
for tests in el["tests"]:
188199
if workload in tests:
200+
if target_version and is_stable_version(target_version) and "start_version_stable" in el:
201+
return el["start_version_stable"]
189202
return el["start_version"]
190-
raise EnvironmentError(f"Unanable to find workload in tests: {TESTS}")
203+
raise EnvironmentError(f"Unable to find workload in tests: {TESTS}")
191204

192205

193206
def is_version_at_least(v_least, v_actual):
@@ -290,13 +303,16 @@ def run_cmd(cmd,
290303
sys.exit(returncode)
291304
everything_looks_good = False
292305
cycle_search_timeout_only = False
306+
has_valid_unknown = False
293307
last_lines_of_output = []
294308
if stdout_path is not None and os.path.exists(stdout_path):
295309
last_lines_of_output, _ = get_last_lines(stdout_path, 50)
296310
everything_looks_good = any(
297311
line.startswith('Everything looks good!') for line in last_lines_of_output)
298312
if not everything_looks_good:
299313
cycle_search_timeout_only = is_cycle_search_timeout_only(last_lines_of_output)
314+
has_valid_unknown = any(
315+
':valid? :unknown' in line for line in last_lines_of_output)
300316
if everything_looks_good:
301317
keep_output_log_file = False
302318
return CmdResult(
@@ -305,7 +321,8 @@ def run_cmd(cmd,
305321
returncode=returncode,
306322
timed_out=timed_out,
307323
everything_looks_good=everything_looks_good,
308-
cycle_search_timeout_only=cycle_search_timeout_only)
324+
cycle_search_timeout_only=cycle_search_timeout_only,
325+
has_valid_unknown=has_valid_unknown)
309326

310327
finally:
311328
if stdout_file is not None:
@@ -395,6 +412,15 @@ def parse_args():
395412
'--iterations',
396413
type=int,
397414
help='Run each workload repeatedly for this many iterations.')
415+
parser.add_argument(
416+
'--locking',
417+
default=None,
418+
choices=['mixed', 'optimistic', 'pessimistic'],
419+
help='Locking mode for append workloads: mixed (default), optimistic, or pessimistic')
420+
parser.add_argument(
421+
'--stress-tuning',
422+
action='store_true',
423+
help='Enable stress-test flags with tiny thresholds for internal subsystems')
398424
return parser.parse_args()
399425

400426

@@ -448,13 +474,16 @@ def main():
448474
[os.path.join(os.environ["JAVA_HOME"], "bin", "java"), "-version"],
449475
stderr=subprocess.STDOUT).decode().strip()
450476
logging.info("Java version:\n%s", java_version)
477+
locking_flag = f"--locking {args.locking}" if args.locking else ""
478+
stress_flag = "--stress-tuning" if args.stress_tuning else ""
451479
lein_cmd = " ".join(["lein run test",
452480
"--os debian",
453481
f"--url {url}",
454482
f"--nemesis {nemeses}",
455483
f"--nodes {get_ip_from_dns()}",
456484
connection_manager_flag,
457-
f"--concurrency {args.concurrency}"])
485+
locking_flag,
486+
stress_flag])
458487

459488
if args.iterations:
460489
lein_cmd += " --test-count 1"
@@ -464,7 +493,7 @@ def main():
464493

465494
all_workloads = args.workloads.split(',')
466495
workloads_to_evaluate = [workload for workload in all_workloads
467-
if is_version_at_least(get_workload_version(workload),
496+
if is_version_at_least(get_workload_version(workload, version),
468497
version)]
469498
workloads_to_skip = set(all_workloads) - set(workloads_to_evaluate)
470499

@@ -496,11 +525,13 @@ def main():
496525
test_start_time_sec = time.time()
497526
if '/set' in test:
498527
test_run_time_limit_no_analysis_sec = SINGLE_TEST_RUN_TIME_FOR_SET_TEST if args.test_time_sec == 0 else args.test_time_sec
499-
elif '/rc.ol' in test:
500-
test_run_time_limit_no_analysis_sec = SINGLE_TEST_RUN_TIME_FOR_RC_OL_TEST if args.test_time_sec == 0 else args.test_time_sec
528+
elif '/rc.' in test and 'append' in test:
529+
test_run_time_limit_no_analysis_sec = SINGLE_TEST_RUN_TIME_FOR_RC_APPEND_TEST if args.test_time_sec == 0 else args.test_time_sec
501530
else:
502531
test_run_time_limit_no_analysis_sec = SINGLE_TEST_RUN_TIME if args.test_time_sec == 0 else args.test_time_sec
532+
concurrency = '3' if 'append-table' in test else args.concurrency
503533
full_cmd = lein_cmd + \
534+
f" --concurrency {concurrency}" + \
504535
" --time-limit " + str(test_run_time_limit_no_analysis_sec) + \
505536
" --workload " + test
506537
result = run_cmd(
@@ -534,16 +565,16 @@ def main():
534565
test_index, test_elapsed_time_sec, result.returncode,
535566
result.everything_looks_good)
536567

537-
# For rc.ol workloads, accept cycle-search-timeout as valid (no actual anomalies found)
538-
is_rc_ol_timeout_acceptable = (
539-
'/rc.ol' in test and
540-
result.cycle_search_timeout_only and
568+
# For read committed workloads, accept valid-unknown results (e.g. cycle-search-timeout)
569+
is_rc_unknown_acceptable = (
570+
'/rc.' in test and
571+
result.has_valid_unknown and
541572
not result.timed_out
542573
)
543574

544-
if result.everything_looks_good or is_rc_ol_timeout_acceptable:
545-
if is_rc_ol_timeout_acceptable:
546-
logging.info("Accepting rc.ol test with cycle-search-timeout (no anomalies found)")
575+
if result.everything_looks_good or is_rc_unknown_acceptable:
576+
if is_rc_unknown_acceptable:
577+
logging.info("Accepting read committed test with valid-unknown result")
547578
num_everything_looks_good += 1
548579

549580
if test_name not in test_cases:

yugabyte/sort-results.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@ find $STORE_DIR -name "jepsen.log" -printf "%T+\t%p\n" | sort | cut -f2 |
2222
if grep -q ':valid? false' "$log_path"; then
2323
category="invalid"
2424
elif grep -q ':valid? :unknown' "$log_path"; then
25-
# For rc.ol tests, :valid? :unknown with only cycle-search-timeout is acceptable
26-
if [[ "$rel_dir_path" == *"rc.ol"* ]] && grep -q ':cycle-search-timeout' "$log_path" && ! grep -qE ':G0|:G1a|:G1b|:G1c|:G2' "$log_path"; then
25+
# For rc tests, :valid? :unknown with only cycle-search-timeout is acceptable
26+
if [[ "$rel_dir_path" == *"_rc."* ]] && grep -q ':cycle-search-timeout' "$log_path" && ! grep -qE ':G0|:G1a|:G1b|:G1c|:G2' "$log_path"; then
2727
category="ok"
2828
else
2929
category="valid-unknown"

yugabyte/src/yugabyte/append.clj

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,31 @@
3434
; :consistency-models [:strict-serializable] ; default value
3535
:additional-graphs [elle/realtime-graph]})))
3636
; (update :generator (partial gen/stagger 1/5)))
37+
38+
; Append-table workloads use lower limits because each key is a separate table
39+
; and every read fetches all rows — O(n) per read instead of O(1).
40+
(defn workload-si-table
41+
[opts]
42+
(-> (append/test {:key-count 16
43+
:max-txn-length 4
44+
:max-writes-per-key 128
45+
:anomalies [:internal :G-nonadjacent :G1 :G-SI]
46+
:consistency-models [:snapshot-isolation]
47+
:additional-graphs [elle/realtime-graph]})))
48+
49+
(defn workload-rc-table
50+
[opts]
51+
(-> (append/test {:key-count 16
52+
:max-txn-length 4
53+
:max-writes-per-key 128
54+
:anomalies [:G0 :G1a :G1b]
55+
:consistency-models [:read-committed]
56+
:additional-graphs [elle/realtime-graph]})))
57+
58+
(defn workload-serializable-table
59+
[opts]
60+
(-> (append/test {:key-count 16
61+
:max-txn-length 4
62+
:max-writes-per-key 128
63+
:anomalies [:G1 :G2]
64+
:additional-graphs [elle/realtime-graph]})))

yugabyte/src/yugabyte/auto.clj

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -483,6 +483,45 @@
483483
:--rpc_connection_timeout_ms 1500]
484484
[]))
485485

486+
(defn master-tserver-stress-flags
487+
"Shared stress-test flags for master and tserver.
488+
Disabled flags are commented out, with the reason noted — re-enable after verifying startup."
489+
[test]
490+
(if (:stress-tuning test)
491+
[; WAL: 512KB segments — may be too small for catalog bootstrap
492+
; :--log_segment_size_bytes 524288
493+
; :--consensus_max_batch_size_bytes 65536 ; 64KB — smaller replication batches
494+
; :--bg_superblock_flush_interval_secs 5
495+
]
496+
[]))
497+
498+
(defn master-stress-flags
499+
"Stress-test flags for master: tablet splitting.
500+
Disabled — tiny thresholds cause split storms during bootstrap."
501+
[test]
502+
(if (:stress-tuning test)
503+
[; :--enable_automatic_tablet_splitting true
504+
; :--tablet_split_low_phase_size_threshold_bytes 1024
505+
; :--tablet_split_high_phase_size_threshold_bytes 4096
506+
; :--tablet_force_split_threshold_bytes 8192
507+
]
508+
[]))
509+
510+
(defn tserver-stress-flags
511+
"Stress-test flags for tserver — DocDB, RocksDB, MVCC, intent cleanup."
512+
[test]
513+
(if (:stress-tuning test)
514+
[:--txn_max_apply_batch_records 5
515+
; :--db_write_buffer_size 524288
516+
; :--db_block_cache_size_bytes 8388608
517+
; :--aborted_intent_cleanup_ms 1000
518+
; :--timestamp_history_retention_interval_sec 5
519+
; :--transaction_deadlock_detection_interval_usec 1000000
520+
:--backfill_index_write_batch_size 10
521+
; :--cdc_stream_records_threshold_size_bytes 1024
522+
]
523+
[]))
524+
486525
(def limits-conf
487526
"Ulimits, in the format for /etc/security/limits.conf."
488527
"
@@ -544,6 +583,8 @@
544583
(master-tserver-wait-on-conflict-flags test)
545584
(master-tserver-packed-columns test)
546585
(master-tserver-geo-partitioning-flags test node (:nodes test))
586+
(master-tserver-stress-flags test)
587+
(master-stress-flags test)
547588
(master-api-opts (:api test) node)
548589
)))
549590

@@ -565,6 +606,8 @@
565606
(master-tserver-wait-on-conflict-flags test)
566607
(master-tserver-packed-columns test)
567608
(master-tserver-geo-partitioning-flags test node (:nodes test))
609+
(master-tserver-stress-flags test)
610+
(tserver-stress-flags test)
568611
(tserver-api-opts test node)
569612
(tserver-connection-manager-preview test)
570613
(tserver-read-committed-flags test)

yugabyte/src/yugabyte/core.clj

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,8 @@
9292
"A map of workload names to functions that can take option maps and construct workloads."
9393
#:ysql{:none noop-test
9494
:sleep sleep-test
95-
:sz.counter (with-client counter/workload (yugabyte.ysql.counter/->YSQLCounterClient))
96-
:sz.set (with-client set/workload (yugabyte.ysql.set/->YSQLSetClient))
95+
:sz.counter (with-client counter/workload (yugabyte.ysql.counter/->YSQLCounterClient :serializable))
96+
:sz.set (with-client set/workload (yugabyte.ysql.set/->YSQLSetClient :serializable))
9797
; This one doesn't work because of https://github.com/YugaByte/yugabyte-db/issues/1554
9898
; :set-index (with-client set/workload (yugabyte.ysql.set/->YSQLSetIndexClient))
9999
; We'd rather allow negatives for now because it makes reproducing errors easier
@@ -103,25 +103,23 @@
103103
:sz.long-fork (with-client long-fork/workload (yugabyte.ysql.long-fork/->YSQLLongForkClient))
104104
:sz.single-key-acid (with-client single-key-acid/workload (yugabyte.ysql.single-key-acid/->YSQLSingleKeyAcidClient))
105105
:sz.multi-key-acid (with-client multi-key-acid/workload (yugabyte.ysql.multi-key-acid/->YSQLMultiKeyAcidClient))
106-
:sz.ol.geo.append (with-client append/workload-serializable (ysql.append/->Client :serializable :optimistic :geo))
107-
:sz.pl.geo.append (with-client append/workload-serializable (ysql.append/->Client :serializable :pessimistic :geo))
108-
:sz.ol.append (with-client append/workload-serializable (ysql.append/->Client :serializable :optimistic :no-geo))
109-
:sz.pl.append (with-client append/workload-serializable (ysql.append/->Client :serializable :pessimistic :no-geo))
110-
:sz.append-table (with-client append/workload-serializable (ysql.append-table/->Client :serializable))
106+
:sz.geo.append (with-client append/workload-serializable (ysql.append/->Client :serializable (or (:locking opts) :mixed) :geo))
107+
:sz.append (with-client append/workload-serializable (ysql.append/->Client :serializable (or (:locking opts) :mixed) :no-geo))
108+
:sz.append-table (with-client append/workload-serializable-table (ysql.append-table/->Client :serializable))
111109
:sz.default-value (with-client default-value/workload (ysql.default-value/->Client))
112-
:rc.ol.geo.append (with-client append/workload-rc (ysql.append/->Client :read-committed :optimistic :geo))
113-
:rc.pl.geo.append (with-client append/workload-rc (ysql.append/->Client :read-committed :pessimistic :geo))
114-
:rc.ol.append (with-client append/workload-rc (ysql.append/->Client :read-committed :optimistic :no-geo))
115-
:rc.pl.append (with-client append/workload-rc (ysql.append/->Client :read-committed :pessimistic :no-geo))
110+
:rc.geo.append (with-client append/workload-rc (ysql.append/->Client :read-committed (or (:locking opts) :mixed) :geo))
111+
:rc.append (with-client append/workload-rc (ysql.append/->Client :read-committed (or (:locking opts) :mixed) :no-geo))
116112
; See https://docs.yugabyte.com/latest/architecture/transactions/isolation-levels/
117113
; :snapshot-isolation maps to :repeatable_read SQL
118-
:si.ol.geo.append (with-client append/workload-si (ysql.append/->Client :repeatable-read :optimistic :geo))
119-
:si.pl.geo.append (with-client append/workload-si (ysql.append/->Client :repeatable-read :pessimistic :geo))
120-
:si.ol.append (with-client append/workload-si (ysql.append/->Client :repeatable-read :optimistic :no-geo))
121-
:si.pl.append (with-client append/workload-si (ysql.append/->Client :repeatable-read :pessimistic :no-geo))
114+
:si.geo.append (with-client append/workload-si (ysql.append/->Client :repeatable-read (or (:locking opts) :mixed) :geo))
115+
:si.append (with-client append/workload-si (ysql.append/->Client :repeatable-read (or (:locking opts) :mixed) :no-geo))
122116
:si.bank (with-client bank/workload-allow-neg (yugabyte.ysql.bank/->YSQLBankClient true :repeatable-read))
123117
:si.bank-multitable (with-client bank/workload-allow-neg (yugabyte.ysql.bank/->YSQLBankClient true :repeatable-read))
124-
:si.bank-contention (with-client bank-improved/workload-contention-keys (yugabyte.ysql.bank-improved/->YSQLBankContentionClient :repeatable-read))})
118+
:si.bank-contention (with-client bank-improved/workload-contention-keys (yugabyte.ysql.bank-improved/->YSQLBankContentionClient :repeatable-read))
119+
:si.append-table (with-client append/workload-si-table (ysql.append-table/->Client :repeatable-read))
120+
:si.counter (with-client counter/workload (yugabyte.ysql.counter/->YSQLCounterClient :repeatable-read))
121+
:si.set (with-client set/workload (yugabyte.ysql.set/->YSQLSetClient :repeatable-read))
122+
:rc.append-table (with-client append/workload-rc-table (ysql.append-table/->Client :read-committed))})
125123

126124
(def workloads
127125
(merge workloads-ycql workloads-ysql))

0 commit comments

Comments
 (0)