
Commit 090a1c2

stefannica and schustmi authored
Idempotent POST requests, request caching and other performance improvements (#3738)
* Increase the file descriptor count limit on the ZenML server container and log more metrics
* Reduce logged metrics and add a parallel option to the stress test
* Lighter health check endpoints and more conservative k8s probe timeouts
* Fix linter errors
* Add event loop monitor and move user activity DB call outside the event loop
* Add endpoint debug stats and retry client POST requests
* Add log parser and metrics visualiser and more logs
* Some logging improvements
* Implement centralized request management with deduplication and caching
* Implement idempotency checks and request retries
* Implement SQLAlchemy idempotent Session
* Reverted all SQLZenStore changes
* Reimplemented transaction management with database caching
* Fix transaction result table type and encode result as base64 secret
* Enable deduplication for POST requests, but only for a selected set of endpoints
* Update src/zenml/models/v2/core/api_transaction.py
  Co-authored-by: Michael Schuster <[email protected]>
* Update src/zenml/zen_server/middleware.py
  Co-authored-by: Michael Schuster <[email protected]>
* Add transaction table indices and improve queries; increase auth threads to 5
* Fix the transaction finalization to include the result
* Fix long URL when fetching input steps
* Fix linter on Windows
* Add new configuration options to the Helm chart
* Fix linter errors
* Fix the resolve_step_inputs utility
* Move user activity global vars together
* Reorder DB migration versions
* Fix URL split in resolve_step_inputs and Windows unit tests
---------
Co-authored-by: Michael Schuster <[email protected]>
1 parent 3c4f77a commit 090a1c2

40 files changed: +3559 −666 lines
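A note on the deduplication and caching work listed in the commit message: the general pattern behind idempotent POST handling is that the client attaches a stable transaction (idempotency) identifier to selected POST requests and retries them on failure, while the server records the outcome of the first completed attempt and replays it for retries instead of creating duplicate entities. In this commit the server side revolves around the api_transaction model and database-backed transaction caching (results encoded as base64 secrets), with changes in src/zenml/zen_server/middleware.py. The sketch below is only a minimal, hypothetical illustration of the pattern as a FastAPI middleware with an in-memory store; the "Idempotency-Key" header, the IDEMPOTENT_PATHS set and the dictionary cache are assumptions for illustration, not the actual ZenML implementation.

# Hypothetical sketch: idempotency-keyed POST deduplication as FastAPI middleware.
# Header name, path set and in-memory storage are illustrative assumptions only.
from typing import Dict, Tuple

from fastapi import FastAPI, Request, Response

app = FastAPI()

# Deduplication is only enabled for a selected set of POST endpoints (assumed paths).
IDEMPOTENT_PATHS = {"/api/v1/runs", "/api/v1/pipeline_deployments"}
# transaction ID -> (status code, body, content type)
_results: Dict[str, Tuple[int, bytes, str]] = {}


@app.middleware("http")
async def deduplicate_posts(request: Request, call_next):
    key = request.headers.get("Idempotency-Key")
    if request.method != "POST" or not key or request.url.path not in IDEMPOTENT_PATHS:
        return await call_next(request)

    if key in _results:
        # A retry of a request that already completed: replay the stored result
        # instead of re-executing the handler and creating a duplicate entity.
        status, body, content_type = _results[key]
        return Response(content=body, status_code=status, media_type=content_type)

    response = await call_next(request)
    # Drain the response body so it can be stored and still returned to the caller.
    body = b"".join([chunk async for chunk in response.body_iterator])
    if 200 <= response.status_code < 300:
        # A real implementation would also mark in-flight transactions so that
        # concurrent duplicates wait or fail fast rather than both executing.
        _results[key] = (
            response.status_code,
            body,
            response.headers.get("content-type", "application/json"),
        )
    return Response(
        content=body,
        status_code=response.status_code,
        headers=dict(response.headers),
    )

The client-side counterpart is to generate the key once per logical request and reuse it on every retry, which is what makes retrying POSTs safe.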

docker/base.Dockerfile

Lines changed: 5 additions & 2 deletions
@@ -184,6 +184,8 @@ RUN groupadd --gid $USER_GID $USERNAME \
 COPY --chown=$USERNAME:$USER_GID --from=server-builder /opt/venv /opt/venv
 # Copy the requirements.txt file from the builder stage
 COPY --chown=$USERNAME:$USER_GID --from=server-builder /zenml/requirements.txt /zenml/requirements.txt
+COPY --chown=$USERNAME:$USER_GID scripts/docker-entrypoint.sh /entrypoint.sh
+RUN chmod +x /entrypoint.sh

 ENV PATH="$VIRTUAL_ENV/bin:/home/$USERNAME/.local/bin:$PATH"

@@ -192,5 +194,6 @@ USER $USERNAME

 # Start the ZenML server
 EXPOSE 8080
-ENTRYPOINT ["uvicorn", "zenml.zen_server.zen_server_api:app", "--log-level", "debug", "--no-server-header", "--proxy-headers", "--forwarded-allow-ips", "*"]
-CMD ["--port", "8080", "--host", "0.0.0.0"]
+ENTRYPOINT ["/entrypoint.sh"]
+
+CMD ["uvicorn", "zenml.zen_server.zen_server_api:app", "--no-server-header", "--proxy-headers", "--forwarded-allow-ips", "*", "--port", "8080", "--host", "0.0.0.0"]

docker/zenml-server-dev.Dockerfile

Lines changed: 6 additions & 4 deletions
@@ -138,6 +138,8 @@ COPY --chown=$USERNAME:$USER_GID --from=builder /zenml/requirements.txt /zenml/r
 # Copy source code
 COPY --chown=$USERNAME:$USER_GID README.md pyproject.toml ./
 COPY --chown=$USERNAME:$USER_GID src src
+COPY --chown=$USERNAME:$USER_GID scripts/docker-entrypoint.sh /entrypoint.sh
+RUN chmod +x /entrypoint.sh

 FROM common-runtime AS local-runtime

@@ -147,8 +149,8 @@ RUN pip install --no-deps --no-cache -e .[server,secrets-aws,secrets-gcp,secrets

 EXPOSE 8080

-ENTRYPOINT ["uvicorn", "zenml.zen_server.zen_server_api:app", "--log-level", "debug", "--no-server-header", "--proxy-headers", "--forwarded-allow-ips", "*", "--reload", "--access-log"]
-CMD ["--port", "8080", "--host", "0.0.0.0"]
+ENTRYPOINT ["/entrypoint.sh"]
+CMD ["uvicorn", "zenml.zen_server.zen_server_api:app", "--log-level", "debug", "--no-server-header", "--proxy-headers", "--forwarded-allow-ips", "*", "--reload", "--port", "8080", "--host", "0.0.0.0"]


 FROM common-runtime AS runtime
@@ -160,5 +162,5 @@ RUN pip install --no-deps --no-cache .[server,secrets-aws,secrets-gcp,secrets-az

 EXPOSE 8080

-ENTRYPOINT ["uvicorn", "zenml.zen_server.zen_server_api:app", "--log-level", "debug", "--no-server-header", "--proxy-headers", "--forwarded-allow-ips", "*", "--access-log"]
-CMD ["--port", "8080", "--host", "0.0.0.0"]
+ENTRYPOINT ["/entrypoint.sh"]
+CMD ["uvicorn", "zenml.zen_server.zen_server_api:app", "--log-level", "debug", "--no-server-header", "--proxy-headers", "--forwarded-allow-ips", "*", "--port", "8080", "--host", "0.0.0.0"]

examples/stress-test/results.ipynb

Lines changed: 147 additions & 0 deletions
New file. The notebook source is a markdown cell titled "# Stress Test Results" followed by five code cells (kernel: .venv, Python 3.10.12):

Cell 1:
from utils import LogFile

Cell 2:
# log_file = LogFile.parse_logs("../../../../docs/aiml/zenml/stress-tests/server-logs-040-40th-1000-steps-30s-50-batch-no-rbac-endpoint-metrics.txt")
# log_file = LogFile.parse_logs("../../../../docs/aiml/zenml/stress-tests/server-logs-041-40th-1000-steps-30s-50-batch-no-rbac-no-service-connector.txt")
# log_file = LogFile.parse_logs("../../../../docs/aiml/zenml/stress-tests/server-logs-042-20th-1000-steps-30s-50-batch-no-rbac-no-service-connector.txt")
# log_file = LogFile.parse_logs("../../../../docs/aiml/zenml/stress-tests/server-logs-043-2th-1000-steps-30s-50-batch-no-rbac-no-service-connector.txt")
# log_file = LogFile.parse_logs("../../../../docs/aiml/zenml/stress-tests/server-logs-044-4th-1000-steps-30s-50-batch-no-rbac-no-service-connector.txt")
# log_file = LogFile.parse_logs("../../../../docs/aiml/zenml/stress-tests/server-logs-044-4th-1000-steps-30s-50-batch-no-rbac-no-service-connector-take-2.txt")
# log_file = LogFile.parse_logs("../../../../docs/aiml/zenml/stress-tests/server-logs-045-8th-1000-steps-30s-50-batch-no-rbac-no-service-connector.txt")
# log_file = LogFile.parse_logs("../../../../docs/aiml/zenml/stress-tests/server-logs-046-8th-1000-steps-30s-50-batch-no-rbac.txt")
# log_file = LogFile.parse_logs("../../../../docs/aiml/zenml/stress-tests/server-logs-047-2th-1000-steps-30s-50-batch-no-rbac.txt")
# log_file = LogFile.parse_logs("../../../../docs/aiml/zenml/stress-tests/server-logs-048-2th-1000-steps-30s-50-batch.txt")
# log_file = LogFile.parse_logs("../../../../docs/aiml/zenml/stress-tests/server-logs-049-4th-1000-steps-30s-50-batch.txt")
# log_file = LogFile.parse_logs("../../../../docs/aiml/zenml/stress-tests/server-logs-050-8th-1000-steps-30s-50-batch.txt")
# log_file = LogFile.parse_logs("../../../../docs/aiml/zenml/stress-tests/server-logs-050-8th-1000-steps-30s-50-batch-take-2.txt")
# log_file = LogFile.parse_logs("../../../../docs/aiml/zenml/stress-tests/server-logs-050-8th-1000-steps-30s-50-batch-take-3.txt")
# log_file = LogFile.parse_logs("../../../../docs/aiml/zenml/stress-tests/server-logs-051-8th-1000-steps-30s-50-batch-deduplication.txt")
# log_file = LogFile.parse_logs("../../../../docs/aiml/zenml/stress-tests/server-logs-052-8th-1000-steps-30s-50-batch-deduplication-take-2.txt")
# log_file = LogFile.parse_logs("../../../../docs/aiml/zenml/stress-tests/server-logs-053-2th-1000-steps-30s-50-batch-deduplication.txt")
log_file = LogFile.parse_logs(
    "../../../../docs/aiml/zenml/stress-tests/server-logs-054-2th-1000-steps-30s-50-batch-deduplication-async-auth.txt"
)

Cell 3:
# log_file.request_flows["b170dbe2"]

Cell 4:
log_file.plot_request_flows(
    start_time=200,
    end_time=500,
    max_requests=5000,
    # pps=10,
    # width=2000,
    height=1000,
    width=1900,
    # pod=1,
    hide_legend=True,
    hide_y_axis=True,
    # api_call_filter=[
    #     ".*/api/v1/service_connectors/<uuid>/client",
    #     ".*/api/v1/runs$",
    #     ".*/api/v1/pipeline_deployments",
    # ],
    # min_duration=1,
    # group_retry_requests=False,
    # filter_states=[LogType.SQL_STARTED, LogType.SQL_COMPLETED],
    request_id="88ddf46e",
)

Cell 5:
log_file.plot(
    log_file.get_plot_fn(
        y_metric="threads",
        # y_metric="memory_usage",
        # y_metric="active_requests",
        label_attribute="pod",
    ),
)

examples/stress-test/run.py

Lines changed: 87 additions & 40 deletions
@@ -16,7 +16,7 @@
 #

 import time
-from typing import Annotated, Any, Dict, Tuple
+from typing import Annotated, Any, Dict, Optional, Tuple

 import click

@@ -29,45 +29,63 @@
 )
 from zenml.integrations.kubernetes.pod_settings import KubernetesPodSettings

-kubernetes_settings = KubernetesOrchestratorSettings(
-    pod_startup_timeout=600,
-    pod_settings=KubernetesPodSettings(
-        resources={
-            "requests": {"cpu": "100m", "memory": "500Mi"},
-            # "limits": {"memory": "500Mi"}, -> grows linearly with number of steps
-        },
-        node_selectors={"pool": "workloads"},
-        tolerations=[
-            {
-                "key": "pool",
-                "operator": "Equal",
-                "value": "workloads",
-                "effect": "NoSchedule",
-            }
-        ],
-        env=[{"name": "ZENML_LOGGING_VERBOSITY", "value": "debug"}],
-    ),
-    orchestrator_pod_settings=KubernetesPodSettings(
-        resources={
-            "requests": {"cpu": "100m", "memory": "500Mi"},
-            # "limits": {"memory": "500Mi"}, # -> grows linearly with number of steps
-        },
-        node_selectors={"pool": "workloads"},
-        tolerations=[
-            {
-                "key": "pool",
-                "operator": "Equal",
-                "value": "workloads",
-                "effect": "NoSchedule",
-            }
-        ],
-    ),
-)
+
+def get_kubernetes_settings(
+    max_parallelism: Optional[int],
+) -> KubernetesOrchestratorSettings:
+    """Get the Kubernetes settings for the ZenML server.
+
+    Args:
+        max_parallelism: The maximum number of parallel steps to run.
+
+    Returns:
+        The Kubernetes settings for the ZenML server.
+    """
+    return KubernetesOrchestratorSettings(
+        service_account_name="zenml-service-account",
+        pod_startup_timeout=600,
+        max_parallelism=max_parallelism,
+        pod_settings=KubernetesPodSettings(
+            resources={
+                "requests": {"cpu": "100m", "memory": "500Mi"},
+                # "limits": {"memory": "500Mi"}, -> grows linearly with number of steps
+            },
+            node_selectors={"pool": "workloads"},
+            tolerations=[
+                {
+                    "key": "pool",
+                    "operator": "Equal",
+                    "value": "workloads",
+                    "effect": "NoSchedule",
+                }
+            ],
+            env=[
+                {"name": "ZENML_LOGGING_VERBOSITY", "value": "debug"},
+                {"name": "ZENML_ENABLE_RICH_TRACEBACK", "value": "false"},
+            ],
+        ),
+        orchestrator_pod_settings=KubernetesPodSettings(
+            resources={
+                "requests": {"cpu": "100m", "memory": "500Mi"},
+                # "limits": {"memory": "500Mi"}, # -> grows linearly with number of steps
+            },
+            node_selectors={"pool": "workloads"},
+            tolerations=[
+                {
+                    "key": "pool",
+                    "operator": "Equal",
+                    "value": "workloads",
+                    "effect": "NoSchedule",
+                }
+            ],
+        ),
+    )
+

 docker_settings = DockerSettings(
     python_package_installer=PythonPackageInstaller.UV,
 )
-settings = {"docker": docker_settings, "orchestrator": kubernetes_settings}
+settings = {"docker": docker_settings}


 @step
@@ -243,6 +261,7 @@ def load_step(
 # The report results step is beefier than the load step because it has to fetch
 # all the artifacts from the run.
 report_kubernetes_settings = KubernetesOrchestratorSettings(
+    service_account_name="zenml-service-account",
     pod_settings=KubernetesPodSettings(
         resources={
             "requests": {"cpu": "100m", "memory": "800Mi"},
@@ -257,7 +276,10 @@ def load_step(
                 "effect": "NoSchedule",
             }
         ],
-        env=[{"name": "ZENML_LOGGING_VERBOSITY", "value": "debug"}],
+        env=[
+            {"name": "ZENML_LOGGING_VERBOSITY", "value": "debug"},
+            {"name": "ZENML_ENABLE_RICH_TRACEBACK", "value": "false"},
+        ],
     ),
 )

@@ -297,7 +319,7 @@ def report_results() -> None:
     print(f"Number of steps: {len(results)}")


-@pipeline(enable_cache=False, settings=settings)
+@pipeline(enable_cache=False)
 def load_test_pipeline(
     num_parallel_steps: int, duration: int, sleep_interval: float
 ) -> None:
@@ -360,8 +382,20 @@ def load_test_pipeline(
     type=int,
     show_default=True,
 )
+@click.option(
+    "--max-parallel-steps",
+    "-m",
+    help="Maximum number of parallel steps to run",
+    required=False,
+    default=None,
+    type=int,
+)
 def main(
-    parallel_steps: int, duration: int, sleep_interval: float, num_tags: int
+    parallel_steps: int,
+    duration: int,
+    sleep_interval: float,
+    num_tags: int,
+    max_parallel_steps: Optional[int] = None,
 ) -> None:
     """Execute a ZenML load test with configurable parallel steps.

@@ -373,12 +407,25 @@
         duration: The duration of the load test in seconds.
         sleep_interval: The interval to sleep between API calls in seconds.
         num_tags: The number of tags to add to the pipeline.
+        max_parallel_steps: The maximum number of parallel steps to run.
     """
-    click.echo(f"Starting load test with {parallel_steps} parallel steps...")
+    if max_parallel_steps:
+        click.echo(
+            f"Starting load test with {parallel_steps} parallel steps with "
+            f"max {max_parallel_steps} running steps at a time..."
+        )
+    else:
+        click.echo(
+            f"Starting load test with {parallel_steps} parallel steps..."
+        )
     click.echo(f"Duration: {duration}s, Sleep Interval: {sleep_interval}s")

+    kubernetes_settings = get_kubernetes_settings(max_parallel_steps)
+    settings["orchestrator"] = kubernetes_settings
+
     load_test_pipeline.configure(
         tags=[Tag(name=f"tag_{i}", cascade=True) for i in range(num_tags)],
+        settings=settings,
     )

     load_test_pipeline(