
Commit 5c23569

page_service: add benchmark for batching (#9820)
This PR adds two benchmarks to demonstrate the effect of the server-side getpage request batching added in #9321.

For CPU usage, I found that the `prometheus` crate's built-in CPU usage metric accounts for CPU time at integer-second granularity. That is not precise enough once you reduce the target benchmark runtime for local iteration. So, add a new `libmetrics` metric and report that instead.

The benchmarks are disabled because [on our benchmark nodes, timer resolution isn't high enough](https://neondb.slack.com/archives/C059ZC138NR/p1732264223207449). They work (no statement about quality) on my bare-metal devbox. They will be refined and enabled once we find a fix. Candidates at the time of writing are:

- #9822
- #9851

Refs:

- Epic: #9376
- Extracted from #9792
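To make the first benchmark's key derived metric concrete, here is a minimal sketch of how the batching factor is computed: getpage requests served per vectored read on the pageserver, taken as the ratio of counter deltas over the workload. `Counters` and `batching_factor` are illustrative names, not part of this PR; the commented counter names match the ones the test below queries.

# Minimal sketch (hypothetical helper, not part of this PR) of the derived
# "batching factor" metric the new benchmark records: getpage requests served
# per vectored read, computed from pageserver counter deltas.
from dataclasses import dataclass


@dataclass
class Counters:
    # pageserver_smgr_query_seconds_count{smgr_query_type="get_page_at_lsn"}
    getpage_count: float
    # pageserver_get_vectored_seconds_count{task_kind="PageRequestHandler"}
    vectored_get_count: float


def batching_factor(before: Counters, after: Counters) -> float:
    getpages = after.getpage_count - before.getpage_count
    vectored = after.vectored_get_count - before.vectored_get_count
    # 1.0 means no merging happened; higher is better.
    return getpages / vectored


# Example: 10_000 getpage requests served by 1_250 vectored reads -> factor 8.0
assert batching_factor(Counters(0, 0), Counters(10_000, 1_250)) == 8.0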
1 parent 441612c commit 5c23569

File tree

3 files changed: +347 −3 lines changed


libs/metrics/src/more_process_metrics.rs

Lines changed: 38 additions & 2 deletions
@@ -2,14 +2,28 @@
 
 // This module has heavy inspiration from the prometheus crate's `process_collector.rs`.
 
+use once_cell::sync::Lazy;
+use prometheus::Gauge;
+
 use crate::UIntGauge;
 
 pub struct Collector {
     descs: Vec<prometheus::core::Desc>,
     vmlck: crate::UIntGauge,
+    cpu_seconds_highres: Gauge,
 }
 
-const NMETRICS: usize = 1;
+const NMETRICS: usize = 2;
+
+static CLK_TCK_F64: Lazy<f64> = Lazy::new(|| {
+    let long = unsafe { libc::sysconf(libc::_SC_CLK_TCK) };
+    if long == -1 {
+        panic!("sysconf(_SC_CLK_TCK) failed");
+    }
+    let convertible_to_f64: i32 =
+        i32::try_from(long).expect("sysconf(_SC_CLK_TCK) is larger than i32");
+    convertible_to_f64 as f64
+});
 
 impl prometheus::core::Collector for Collector {
     fn desc(&self) -> Vec<&prometheus::core::Desc> {
@@ -27,6 +41,12 @@ impl prometheus::core::Collector for Collector {
                 mfs.extend(self.vmlck.collect())
             }
         }
+        if let Ok(stat) = myself.stat() {
+            let cpu_seconds = stat.utime + stat.stime;
+            self.cpu_seconds_highres
+                .set(cpu_seconds as f64 / *CLK_TCK_F64);
+            mfs.extend(self.cpu_seconds_highres.collect());
+        }
         mfs
     }
 }
@@ -43,7 +63,23 @@ impl Collector {
                 .cloned(),
         );
 
-        Self { descs, vmlck }
+        let cpu_seconds_highres = Gauge::new(
+            "libmetrics_process_cpu_seconds_highres",
+            "Total user and system CPU time spent in seconds.\
+             Sub-second resolution, hence better than `process_cpu_seconds_total`.",
+        )
+        .unwrap();
+        descs.extend(
+            prometheus::core::Collector::desc(&cpu_seconds_highres)
+                .into_iter()
+                .cloned(),
+        );
+
+        Self {
+            descs,
+            vmlck,
+            cpu_seconds_highres,
+        }
     }
 }
 
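For reference, the sub-second accounting above amounts to reading `utime` and `stime` (in clock ticks) from `/proc/<pid>/stat` and dividing by `_SC_CLK_TCK`. Below is a minimal Python equivalent of that calculation, assuming a Linux host; the function name is illustrative and not part of the patch.

# Minimal Python equivalent (illustrative, not part of the patch) of the
# calculation the new Rust collector performs: utime + stime from
# /proc/<pid>/stat, converted from clock ticks to seconds via _SC_CLK_TCK.
import os


def cpu_seconds_highres(pid: int | None = None) -> float:
    pid = os.getpid() if pid is None else pid
    with open(f"/proc/{pid}/stat") as f:
        stat = f.read()
    # The command name (field 2) may contain spaces or ')', so split on the
    # last ')' and count fields from there: utime and stime are fields 14 and
    # 15 overall, i.e. indices 11 and 12 of the remainder.
    rest = stat.rsplit(")", 1)[1].split()
    utime_ticks, stime_ticks = int(rest[11]), int(rest[12])
    return (utime_ticks + stime_ticks) / os.sysconf("SC_CLK_TCK")


print(f"{cpu_seconds_highres():.3f}s of CPU time used so far")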

test_runner/performance/README.md

Lines changed: 2 additions & 1 deletion
@@ -15,6 +15,7 @@ Some handy pytest flags for local development:
 - `-k` selects a test to run
 - `--timeout=0` disables our default timeout of 300s (see `setup.cfg`)
 - `--preserve-database-files` to skip cleanup
+- `--out-dir` to produce a JSON with the recorded test metrics
 
 # What performance tests do we have and how we run them
 
@@ -36,6 +37,6 @@ All tests run only once. Usually to obtain more consistent performance numbers,
 
 ## Results collection
 
-Local test results for main branch, and results of daily performance tests, are stored in a neon project deployed in production environment. There is a Grafana dashboard that visualizes the results. Here is the [dashboard](https://observer.zenith.tech/d/DGKBm9Jnz/perf-test-results?orgId=1). The main problem with it is the unavailability to point at particular commit, though the data for that is available in the database. Needs some tweaking from someone who knows Grafana tricks.
+Local test results for main branch, and results of daily performance tests, are stored in a [neon project](https://console.neon.tech/app/projects/withered-sky-69117821) deployed in production environment. There is a Grafana dashboard that visualizes the results. Here is the [dashboard](https://observer.zenith.tech/d/DGKBm9Jnz/perf-test-results?orgId=1). The main problem with it is the unavailability to point at particular commit, though the data for that is available in the database. Needs some tweaking from someone who knows Grafana tricks.
 
 There is also an inconsistency in test naming. Test name should be the same across platforms, and results can be differentiated by the platform field. But currently, platform is sometimes included in test name because of the way how parametrization works in pytest. I.e. there is a platform switch in the dashboard with neon-local-ci and neon-staging variants. I.e. some tests under neon-local-ci value for a platform switch are displayed as `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]` and `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]` which is highly confusing.
Lines changed: 307 additions & 0 deletions
@@ -0,0 +1,307 @@
import dataclasses
import json
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any

import pytest
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
from fixtures.utils import humantime_to_ms

TARGET_RUNTIME = 60


@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/9820#issue-2675856095")
@pytest.mark.parametrize(
    "tablesize_mib, batch_timeout, target_runtime, effective_io_concurrency, readhead_buffer_size, name",
    [
        # the next 4 cases demonstrate how not-batchable workloads suffer from batching timeout
        (50, None, TARGET_RUNTIME, 1, 128, "not batchable no batching"),
        (50, "10us", TARGET_RUNTIME, 1, 128, "not batchable 10us timeout"),
        (50, "1ms", TARGET_RUNTIME, 1, 128, "not batchable 1ms timeout"),
        # the next 4 cases demonstrate how batchable workloads benefit from batching
        (50, None, TARGET_RUNTIME, 100, 128, "batchable no batching"),
        (50, "10us", TARGET_RUNTIME, 100, 128, "batchable 10us timeout"),
        (50, "100us", TARGET_RUNTIME, 100, 128, "batchable 100us timeout"),
        (50, "1ms", TARGET_RUNTIME, 100, 128, "batchable 1ms timeout"),
    ],
)
def test_getpage_merge_smoke(
    neon_env_builder: NeonEnvBuilder,
    zenbenchmark: NeonBenchmarker,
    tablesize_mib: int,
    batch_timeout: str | None,
    target_runtime: int,
    effective_io_concurrency: int,
    readhead_buffer_size: int,
    name: str,
):
    """
    Do a bunch of sequential scans and ensure that the pageserver does some merging.
    """

    #
    # record perf-related parameters as metrics to simplify processing of results
    #
    params: dict[str, tuple[float | int, dict[str, Any]]] = {}

    params.update(
        {
            "tablesize_mib": (tablesize_mib, {"unit": "MiB"}),
            "batch_timeout": (
                -1 if batch_timeout is None else 1e3 * humantime_to_ms(batch_timeout),
                {"unit": "us"},
            ),
            # target_runtime is just a polite ask to the workload to run for this long
            "effective_io_concurrency": (effective_io_concurrency, {}),
            "readhead_buffer_size": (readhead_buffer_size, {}),
            # name is not a metric
        }
    )

    log.info("params: %s", params)

    for param, (value, kwargs) in params.items():
        zenbenchmark.record(
            param,
            metric_value=value,
            unit=kwargs.pop("unit", ""),
            report=MetricReport.TEST_PARAM,
            **kwargs,
        )

    #
    # Setup
    #

    env = neon_env_builder.init_start()
    ps_http = env.pageserver.http_client()
    endpoint = env.endpoints.create_start("main")
    conn = endpoint.connect()
    cur = conn.cursor()

    cur.execute("SET max_parallel_workers_per_gather=0")  # disable parallel backends
    cur.execute(f"SET effective_io_concurrency={effective_io_concurrency}")
    cur.execute(
        f"SET neon.readahead_buffer_size={readhead_buffer_size}"
    )  # this is the current default value, but let's hard-code that

    cur.execute("CREATE EXTENSION IF NOT EXISTS neon;")
    cur.execute("CREATE EXTENSION IF NOT EXISTS neon_test_utils;")

    log.info("Filling the table")
    cur.execute("CREATE TABLE t (data char(1000)) with (fillfactor=10)")
    tablesize = tablesize_mib * 1024 * 1024
    npages = tablesize // (8 * 1024)
    cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,))
    # TODO: can we force postgres to do sequential scans?

    #
    # Run the workload, collect `Metrics` before and after, calculate difference, normalize.
    #

    @dataclass
    class Metrics:
        time: float
        pageserver_getpage_count: float
        pageserver_vectored_get_count: float
        compute_getpage_count: float
        pageserver_cpu_seconds_total: float

        def __sub__(self, other: "Metrics") -> "Metrics":
            return Metrics(
                time=self.time - other.time,
                pageserver_getpage_count=self.pageserver_getpage_count
                - other.pageserver_getpage_count,
                pageserver_vectored_get_count=self.pageserver_vectored_get_count
                - other.pageserver_vectored_get_count,
                compute_getpage_count=self.compute_getpage_count - other.compute_getpage_count,
                pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total
                - other.pageserver_cpu_seconds_total,
            )

        def normalize(self, by) -> "Metrics":
            return Metrics(
                time=self.time / by,
                pageserver_getpage_count=self.pageserver_getpage_count / by,
                pageserver_vectored_get_count=self.pageserver_vectored_get_count / by,
                compute_getpage_count=self.compute_getpage_count / by,
                pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total / by,
            )

    def get_metrics() -> Metrics:
        with conn.cursor() as cur:
            cur.execute(
                "select value from neon_perf_counters where metric='getpage_wait_seconds_count';"
            )
            compute_getpage_count = cur.fetchall()[0][0]
            pageserver_metrics = ps_http.get_metrics()
            return Metrics(
                time=time.time(),
                pageserver_getpage_count=pageserver_metrics.query_one(
                    "pageserver_smgr_query_seconds_count", {"smgr_query_type": "get_page_at_lsn"}
                ).value,
                pageserver_vectored_get_count=pageserver_metrics.query_one(
                    "pageserver_get_vectored_seconds_count", {"task_kind": "PageRequestHandler"}
                ).value,
                compute_getpage_count=compute_getpage_count,
                pageserver_cpu_seconds_total=pageserver_metrics.query_one(
                    "libmetrics_process_cpu_seconds_highres"
                ).value,
            )

    def workload() -> Metrics:
        start = time.time()
        iters = 0
        while time.time() - start < target_runtime or iters < 2:
            log.info("Seqscan %d", iters)
            if iters == 1:
                # round zero for warming up
                before = get_metrics()
            cur.execute(
                "select clear_buffer_cache()"
            )  # TODO: what about LFC? doesn't matter right now because LFC isn't enabled by default in tests
            cur.execute("select sum(data::bigint) from t")
            assert cur.fetchall()[0][0] == npages * (npages + 1) // 2
            iters += 1
        after = get_metrics()
        return (after - before).normalize(iters - 1)

    env.pageserver.patch_config_toml_nonrecursive({"server_side_batch_timeout": batch_timeout})
    env.pageserver.restart()
    metrics = workload()

    log.info("Results: %s", metrics)

    #
    # Sanity-checks on the collected data
    #
    # assert that getpage counts roughly match between compute and ps
    assert metrics.pageserver_getpage_count == pytest.approx(
        metrics.compute_getpage_count, rel=0.01
    )

    #
    # Record the results
    #

    for metric, value in dataclasses.asdict(metrics).items():
        zenbenchmark.record(f"counters.{metric}", value, unit="", report=MetricReport.TEST_PARAM)

    zenbenchmark.record(
        "perfmetric.batching_factor",
        metrics.pageserver_getpage_count / metrics.pageserver_vectored_get_count,
        unit="",
        report=MetricReport.HIGHER_IS_BETTER,
    )


@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/9820#issue-2675856095")
@pytest.mark.parametrize(
    "batch_timeout", [None, "10us", "20us", "50us", "100us", "200us", "500us", "1ms"]
)
def test_timer_precision(
    neon_env_builder: NeonEnvBuilder,
    zenbenchmark: NeonBenchmarker,
    pg_bin: PgBin,
    batch_timeout: str | None,
):
    """
    Determine the batching timeout precision (mean latency) and tail latency impact.

    The baseline is `None`; an ideal batching timeout implementation would increase
    the mean latency by exactly `batch_timeout`.

    That is not the case with the current implementation, will be addressed in future changes.
    """

    #
    # Setup
    #

    def patch_ps_config(ps_config):
        ps_config["server_side_batch_timeout"] = batch_timeout

    neon_env_builder.pageserver_config_override = patch_ps_config

    env = neon_env_builder.init_start()
    endpoint = env.endpoints.create_start("main")
    conn = endpoint.connect()
    cur = conn.cursor()

    cur.execute("SET max_parallel_workers_per_gather=0")  # disable parallel backends
    cur.execute("SET effective_io_concurrency=1")

    cur.execute("CREATE EXTENSION IF NOT EXISTS neon;")
    cur.execute("CREATE EXTENSION IF NOT EXISTS neon_test_utils;")

    log.info("Filling the table")
    cur.execute("CREATE TABLE t (data char(1000)) with (fillfactor=10)")
    tablesize = 50 * 1024 * 1024
    npages = tablesize // (8 * 1024)
    cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,))
    # TODO: can we force postgres to do sequential scans?

    cur.close()
    conn.close()

    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)

    endpoint.stop()

    for sk in env.safekeepers:
        sk.stop()

    #
    # Run single-threaded pagebench (TODO: dedup with other benchmark code)
    #

    env.pageserver.allowed_errors.append(
        # https://github.com/neondatabase/neon/issues/6925
        r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*"
    )

    ps_http = env.pageserver.http_client()

    cmd = [
        str(env.neon_binpath / "pagebench"),
        "get-page-latest-lsn",
        "--mgmt-api-endpoint",
        ps_http.base_url,
        "--page-service-connstring",
        env.pageserver.connstr(password=None),
        "--num-clients",
        "1",
        "--runtime",
        "10s",
    ]
    log.info(f"command: {' '.join(cmd)}")
    basepath = pg_bin.run_capture(cmd, with_command_header=False)
    results_path = Path(basepath + ".stdout")
    log.info(f"Benchmark results at: {results_path}")

    with open(results_path) as f:
        results = json.load(f)
    log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")

    total = results["total"]

    metric = "latency_mean"
    zenbenchmark.record(
        metric,
        metric_value=humantime_to_ms(total[metric]),
        unit="ms",
        report=MetricReport.LOWER_IS_BETTER,
    )

    metric = "latency_percentiles"
    for k, v in total[metric].items():
        zenbenchmark.record(
            f"{metric}.{k}",
            metric_value=humantime_to_ms(v),
            unit="ms",
            report=MetricReport.LOWER_IS_BETTER,
        )
