Skip to content

Commit fe89cc4

Browse files
committed
wb | move "latency" and "voting" to a new profile "workloads" stanza
1 parent 63eae3f commit fe89cc4

File tree

18 files changed

+765
-653
lines changed

18 files changed

+765
-653
lines changed

nix/workbench/backend/backend.sh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ usage_backend() {
2424
2525
wait-pools-stopped RUNDIR
2626
Wait until all pools are stopped
27+
wait-workloads-stopped RUNDIR
28+
Wait until all workloads are stopped
2729
stop-cluster RUNDIR
2830
cleanup-cluster RUNDIR
2931
Wipe cluster state to pristine
@@ -50,16 +52,16 @@ case "${op}" in
5052
start-tracers ) backend_$WB_BACKEND "$@";;
5153
start-nodes ) backend_$WB_BACKEND "$@";;
5254
start-generator ) backend_$WB_BACKEND "$@";;
55+
start-workloads ) backend_$WB_BACKEND "$@";;
5356
start-healthchecks ) backend_$WB_BACKEND "$@";;
54-
start-latencies ) backend_$WB_BACKEND "$@";;
5557
# Fine grained
5658
start-node ) backend_$WB_BACKEND "$@";;
5759
stop-node ) backend_$WB_BACKEND "$@";;
5860
wait-node ) backend_$WB_BACKEND "$@";;
5961
wait-node-stopped ) backend_$WB_BACKEND "$@";;
6062
get-node-socket-path ) backend_$WB_BACKEND "$@";;
6163
wait-pools-stopped ) backend_$WB_BACKEND "$@";;
62-
wait-latencies-stopped ) backend_$WB_BACKEND "$@";;
64+
wait-workloads-stopped ) backend_$WB_BACKEND "$@";;
6365
# Stop functions
6466
stop-all ) backend_$WB_BACKEND "$@";;
6567
fetch-logs ) backend_$WB_BACKEND "$@";;

nix/workbench/backend/nomad-job.nix

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -902,28 +902,27 @@ let
902902
}
903903
])
904904
++
905-
# healthcheck
906-
[
907-
## healthcheck start.sh script.
905+
# workloads
906+
(builtins.map (workload:
907+
## workload start.sh script.
908908
{
909909
env = false;
910-
destination = "local/${stateDir}/healthcheck/start.sh";
911-
data = escapeTemplate
912-
profileData.healthcheck-service.start.value;
910+
destination = "local/${stateDir}/workloads/${workload.name}/start.sh";
911+
data = escapeTemplate workload.start.value;
913912
change_mode = "noop";
914913
error_on_missing_key = true;
915914
perms = "744"; # Only for every "start.sh" script. Default: "644"
916915
}
917-
]
916+
) profileData.workloads-service)
918917
++
919-
# latency
918+
# healthcheck
920919
[
921-
## Latency start.sh script.
920+
## healthcheck start.sh script.
922921
{
923922
env = false;
924-
destination = "local/${stateDir}/latency/start.sh";
923+
destination = "local/${stateDir}/healthcheck/start.sh";
925924
data = escapeTemplate
926-
profileData.latency-service.start.value;
925+
profileData.healthcheck-service.start.value;
927926
change_mode = "noop";
928927
error_on_missing_key = true;
929928
perms = "744"; # Only for every "start.sh" script. Default: "644"

nix/workbench/backend/nomad.sh

Lines changed: 295 additions & 262 deletions
Large diffs are not rendered by default.

nix/workbench/backend/nomad/cloud.sh

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -81,13 +81,13 @@ backend_nomadcloud() {
8181
backend_nomad wait-pools-stopped 60 "$@"
8282
;;
8383

84-
wait-latencies-stopped )
84+
wait-workloads-stopped )
8585
# It passes the sleep time (in seconds) required argument.
8686
# This time is different between local and cloud backends to avoid
8787
# unnecessary Nomad specific traffic (~99% happens waiting for node-0, the
8888
# first one it waits to stop inside a loop) and at the same time be less
8989
# sensitive to network failures.
90-
backend_nomad wait-latencies-stopped 60 "$@"
90+
backend_nomad wait-workloads-stopped 60 "$@"
9191
;;
9292

9393
fetch-logs )
@@ -146,12 +146,12 @@ backend_nomadcloud() {
146146
backend_nomad start-generator "$@"
147147
;;
148148

149-
start-healthchecks )
150-
backend_nomad start-healthchecks "$@"
149+
start-workloads )
150+
backend_nomad start-workloads "$@"
151151
;;
152152

153-
start-latencies )
154-
backend_nomad start-latencies "$@"
153+
start-healthchecks )
154+
backend_nomad start-healthchecks "$@"
155155
;;
156156

157157
start-node )
@@ -998,18 +998,6 @@ fetch-logs-ssh-node() {
998998
local ssh_config_path ssh_command
999999
ssh_config_path="$(wb nomad ssh config)"
10001000
ssh_command="ssh -F ${ssh_config_path} -p 32000 -l nobody"
1001-
# Download latency(ies) logs. ################################################
1002-
##############################################################################
1003-
msg "$(blue "Fetching") $(yellow "program \"latency\"") run files from $(yellow "\"${node}\" (\"${public_ipv4}\")") ..."
1004-
if ! rsync -e "${ssh_command}" -au \
1005-
-f'- start.sh' \
1006-
"${public_ipv4}":/local/run/current/latency/ \
1007-
"${dir}"/latency/"${node}"/
1008-
then
1009-
node_ok="false"
1010-
touch "${dir}"/nomad/"${node}"/download_failed
1011-
msg "$(red Error fetching) $(yellow "program \"latency\"") $(red "run files from") $(yellow "\"${node}\" (\"${public_ipv4}\")") ..."
1012-
fi
10131001
# Download healthcheck(s) logs. ##############################################
10141002
##############################################################################
10151003
msg "$(blue "Fetching") $(yellow "program \"healthcheck\"") run files from $(yellow "\"${node}\" (\"${public_ipv4}\")") ..."
@@ -1022,6 +1010,22 @@ fetch-logs-ssh-node() {
10221010
touch "${dir}"/nomad/"${node}"/download_failed
10231011
msg "$(red Error fetching) $(yellow "program \"healthcheck\"") $(red "run files from") $(yellow "\"${node}\" (\"${public_ipv4}\")") ..."
10241012
fi
1013+
# Download workload(s) logs. #################################################
1014+
##############################################################################
1015+
# For every workload
1016+
for workload in $(jq_tolist '.workloads | map(.name)' "$dir"/profile.json)
1017+
do
1018+
msg "$(blue "Fetching") $(yellow "program \"${workload}\" workload") run files from $(yellow "\"${node}\" (\"${public_ipv4}\")") ..."
1019+
if ! rsync -e "${ssh_command}" -au \
1020+
-f'- start.sh' \
1021+
"${public_ipv4}":/local/run/current/workloads/"${workload}"/ \
1022+
"${dir}"/workloads/"${workload}"/"${node}"/
1023+
then
1024+
node_ok="false"
1025+
touch "${dir}"/nomad/"${node}"/download_failed
1026+
msg "$(red Error fetching) $(yellow "program \"${workload}\" workload") $(red "run files from") $(yellow "\"${node}\" (\"${public_ipv4}\")") ..."
1027+
fi
1028+
done
10251029
# Download generator logs. ###################################################
10261030
##############################################################################
10271031
if test "${node}" = "explorer"

nix/workbench/backend/nomad/exec.sh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -61,13 +61,13 @@ backend_nomadexec() {
6161
backend_nomad wait-pools-stopped 1 "$@"
6262
;;
6363

64-
wait-latencies-stopped )
64+
wait-workloads-stopped )
6565
# It passes the sleep time (in seconds) required argument.
6666
# This time is different between local and cloud backends to avoid
6767
# unnecessary Nomad specific traffic (~99% happens waiting for node-0, the
6868
# first one it waits to stop inside a loop) and at the same time be less
6969
# sensitive to network failures.
70-
backend_nomad wait-latencies-stopped 1 "$@"
70+
backend_nomad wait-workloads-stopped 1 "$@"
7171
;;
7272

7373
# All or clean up everything!
@@ -107,12 +107,12 @@ backend_nomadexec() {
107107
backend_nomad start-generator "$@"
108108
;;
109109

110-
start-healthchecks )
111-
backend_nomad start-healthchecks "$@"
110+
start-workloads )
111+
backend_nomad start-workloads "$@"
112112
;;
113113

114-
start-latencies )
115-
backend_nomad start-latencies "$@"
114+
start-healthchecks )
115+
backend_nomad start-healthchecks "$@"
116116
;;
117117

118118
start-node )

nix/workbench/backend/supervisor-conf.nix

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -184,14 +184,18 @@ let
184184
startsecs = 5;
185185
};
186186
}
187+
188+
189+
187190
//
188-
{
189-
"program:latency" = {
191+
(builtins.listToAttrs (builtins.map (workload: {
192+
name = "program:${workload.name}";
193+
value = {
190194
# "command" below assumes "directory" is set accordingly.
191-
directory = "${stateDir}/latency";
195+
directory = "${stateDir}/workloads/${workload.name}";
192196
command = "${command}";
193-
stdout_logfile = "${stateDir}/latency/stdout";
194-
stderr_logfile = "${stateDir}/latency/stderr";
197+
stdout_logfile = "${stateDir}/workloads/${workload.name}/stdout";
198+
stderr_logfile = "${stateDir}/workloads/${workload.name}/stderr";
195199
# Set these values to 0 to indicate an unlimited log size / no rotation.
196200
stdout_logfile_maxbytes = 0;
197201
stderr_logfile_maxbytes = 0;
@@ -204,7 +208,10 @@ let
204208
# Seconds it needs to stay running to consider the start successful
205209
startsecs = 5;
206210
};
207-
}
211+
}) profileData.workloads))
212+
213+
214+
208215
//
209216
lib.attrsets.optionalAttrs withSsh
210217
{

nix/workbench/backend/supervisor.sh

Lines changed: 68 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ case "$op" in
5858

5959
local svcs=$dir/profile/node-services.json
6060
local gtor=$dir/profile/generator-service.json
61+
local work=$dir/profile/workloads-service.json
6162
local trac=$dir/profile/tracer-service.json
6263
local hche=$dir/profile/healthcheck-service.json
6364

@@ -76,6 +77,15 @@ case "$op" in
7677
cp $(jq '."plutus-redeemer"' -r $gtor) "$gen_dir"/plutus-redeemer.json
7778
cp $(jq '."plutus-datum"' -r $gtor) "$gen_dir"/plutus-datum.json
7879

80+
local work_dir="$dir"/workloads
81+
mkdir -p "$work_dir"
82+
for workload in $(jq_tolist 'map(.name)' "$work")
83+
do
84+
mkdir -p "$work_dir"/"${workload}"
85+
cp $(jq "map(select(.name == \"${workload}\"))[0] | .start" -r $work) \
86+
"$work_dir"/"${workload}"/start.sh
87+
done
88+
7989
local trac_dir="$dir"/tracer
8090
mkdir -p "$trac_dir"
8191
cp $(jq '."start"' -r $trac) "$trac_dir"/start.sh
@@ -84,8 +94,6 @@ case "$op" in
8494
local hche_dir="$dir"/healthcheck
8595
mkdir -p "$hche_dir"
8696
cp $(jq '."start"' -r $hche) "$hche_dir"/start.sh
87-
88-
mkdir -p "$dir"/latency
8997
;;
9098

9199
deploy-genesis )
@@ -274,6 +282,30 @@ EOF
274282
fi
275283
backend_supervisor save-child-pids "$dir";;
276284

285+
start-workloads )
286+
local usage="USAGE: wb backend $op RUN-DIR"
287+
local dir=${1:?$usage}; shift
288+
289+
while test $# -gt 0
290+
do case "$1" in
291+
--* ) msg "FATAL: unknown flag '$1'"; usage_supervisor;;
292+
* ) break;; esac; shift; done
293+
294+
# For every workload
295+
for workload in $(jq_tolist '.workloads | map(.name)' "$dir"/profile.json)
296+
do
297+
if ! supervisorctl start "${workload}"
298+
then progress "supervisor" "$(red fatal: failed to start) $(white "${workload} workload")"
299+
echo "$(red "${workload}" workload stdout) ----------------------" >&2
300+
cat "$dir"/workloads/"${workload}"/stdout
301+
echo "$(red "${workload}" workload stderr) ----------------------" >&2
302+
cat "$dir"/workloads/"${workload}"/stderr
303+
echo "$(white -------------------------------------------------)" >&2
304+
fatal "could not start $(white "${workload} workload")"
305+
fi
306+
done
307+
backend_supervisor save-child-pids "$dir";;
308+
277309
wait-node-stopped )
278310
local usage="USAGE: wb backend $op RUN-DIR NODE"
279311
local dir=${1:?$usage}; shift
@@ -322,6 +354,40 @@ EOF
322354
fi
323355
;;
324356

357+
wait-workloads-stopped )
358+
local usage="USAGE: wb backend $op RUN-DIR"
359+
local dir=${1:?$usage}; shift
360+
361+
local start_time=$(date +%s)
362+
msg_ne "supervisor: waiting until all workloads are stopped: 000000"
363+
for workload in $(jq_tolist '.workloads | map(.name)' "$dir"/profile.json)
364+
do
365+
while \
366+
! test -f "${dir}"/flag/cluster-stopping \
367+
&& \
368+
supervisorctl status "${workload}" > /dev/null
369+
do
370+
echo -ne "\b\b\b\b\b\b"
371+
printf "%6d" "$(($(date +%s) - start_time))"
372+
sleep 1
373+
done
374+
if ! test -f "${dir}"/flag/cluster-stopping
375+
then
376+
echo -ne "\b\b\b\b\b\b"
377+
echo -n "${workload} 000000"
378+
fi
379+
done >&2
380+
echo -ne "\b\b\b\b\b\b"
381+
local elapsed=$(($(date +%s) - start_time))
382+
if test -f "${dir}"/flag/cluster-stopping
383+
then
384+
echo " Termination requested -- after $(yellow ${elapsed})s" >&2
385+
else
386+
touch "${dir}"/flag/cluster-stopping
387+
echo " All workloads exited -- after $(yellow ${elapsed})s" >&2
388+
fi
389+
;;
390+
325391
stop-all )
326392
local usage="USAGE: wb backend $op RUN-DIR"
327393
local dir=${1:?$usage}; shift

nix/workbench/genesis/genesis.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -735,7 +735,7 @@ genesis-create-testnet-data() {
735735
link_keys utxo-keys utxo-keys
736736

737737
local is_voting
738-
is_voting=$(jq --raw-output '.generator.drep_voting' "$profile_json")
738+
is_voting=$(jq --raw-output '.workloads | any( .name == "voting")' "$profile_json")
739739
if [[ "$is_voting" == "true" ]];
740740
then
741741
info genesis "voting workload specified - keeping one stake key per producer"

nix/workbench/profile/prof0-defaults.jq

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ def era_defaults($era):
6363
}
6464
}
6565

66+
, workloads: []
67+
6668
, node:
6769
{ rts_flags_override: []
6870
, heap_limit: null ## optional: heap limit in MB (translates to RTS flag -M)

0 commit comments

Comments
 (0)