1919 case $opt in
2020 a) FORCE_ALL=true; continue ;;
2121 f) FORCE=$OPTARG ; continue ;;
22- s) export GUIDEBOOK_STORE=$OPTARG ; echo " Using store=$GUIDEBOOK_STORE " ; continue ;;
22+ s) export GUIDEBOOK_STORE=$OPTARG ; echo " [Test] Using store=$GUIDEBOOK_STORE " ; continue ;;
2323 b) GUIDEBOOK=$OPTARG ; continue ;;
2424 i) NO_KIND=true; continue ;;
2525 * ) continue ;;
@@ -29,13 +29,20 @@ shift $((OPTIND-1))
2929
3030if [ -z " $NO_KIND " ]; then
3131 export KUBECONFIG=$( " $SCRIPTDIR " /setup.sh)
32- echo " Using KUBECONFIG=$KUBECONFIG "
32+ echo " [Test] Using KUBECONFIG=$KUBECONFIG "
3333fi
3434
3535# We get this for free from github actions. Add it generally. This
3636# informs the guidebooks to adjust their resource demands.
3737export CI=true
3838
39+ # If you find that dangling processes (mostly kubectl) linger, this
40+ # may help with debugging; you may also set it to "*" to get
41+ # everything. Word of warning: kubectl seems to respond to DEBUG being
42+ # set to *anything*, so expect to get a bunch of low-level go logs, no
43+ # matter what you set this to.
44+ # export DEBUG=madwizard/cleanup
45+
3946# build docker image of log aggregator just for this test and load it
4047# into kind
4148function build {
@@ -63,12 +70,12 @@ function run {
6370
6471 local PRE=" $MWPROFILES_PATH_BASE " /../profiles.d/$profile /pre
6572 if [ -f " $PRE " ]; then
66- echo " Running pre guidebooks for profile=$profile "
73+ echo " [Test] Running pre guidebooks for profile=$profile "
6774 cat " $PRE " | xargs -n1 " $ROOT " /bin/codeflare -p $profile $yes
6875 fi
6976
70- echo " Running with variant=$variant profile=$profile yes=$yes "
71- " $ROOT " /bin/codeflare -V -p $profile $yes $guidebook
77+ echo " [Test] Running with variant=$variant profile=$profile yes=$yes "
78+ GUIDEBOOK_NAME= " main-job-run " " $ROOT " /bin/codeflare -V -p $profile $yes $guidebook
7279}
7380
7481# Undeploy any prior log aggregator
@@ -78,8 +85,8 @@ function cleanup {
7885 local profile=$( basename $profileFull )
7986 export MWPROFILES_PATH=" $MWPROFILES_PATH_BASE " /$variant
8087
81- echo " Undeploying any prior log aggregator"
82- (" $ROOT " /bin/codeflare -p $profile -y ml/ray/aggregator/in-cluster/client-side/undeploy \
88+ echo " [Test] Undeploying any prior log aggregator"
89+ (GUIDEBOOK_NAME= " log-aggregator-undeploy " " $ROOT " /bin/codeflare -p $profile -y ml/ray/aggregator/in-cluster/client-side/undeploy \
8390 || exit 0)
8491}
8592
@@ -96,8 +103,10 @@ function attach {
96103
97104 local jobId=$2
98105
99- echo " Attaching variant=$variant profile=$profile jobId=$jobId "
100- " $ROOT " /bin/codeflare -V -p $profile attach -a $jobId
106+ echo " [Test] Attaching variant=$variant profile=$profile jobId=$jobId "
107+ GUIDEBOOK_NAME=" log-aggregator-attach" " $ROOT " /bin/codeflare -V -p $profile attach -a $jobId --wait &
108+ ATTACH_PID=$!
109+ echo " [Test] Attach underway"
101110}
102111
103112# @return path to locally captured logs for the given jobId, run in the given profile
@@ -121,23 +130,23 @@ function validateAttach {
121130 RUNDIR=$( localpath $profile $jobId )
122131
123132 if [ ! -d " $RUNDIR " ]; then
124- echo " ❌ Logs were not captured locally: missing logdir"
133+ echo " [Test] ❌ Logs were not captured locally: missing logdir"
125134 exit 1
126135 elif [ ! -f " $RUNDIR /jobid.txt" ]; then
127- echo " ❌ Logs were not captured locally: missing jobid.txt"
136+ echo " [Test] ❌ Logs were not captured locally: missing jobid.txt"
128137 exit 1
129138 elif [ ! -f " $RUNDIR /logs/job.txt" ]; then
130- echo " ❌ Logs were not captured locally: missing logs/job.txt"
139+ echo " [Test] ❌ Logs were not captured locally: missing logs/job.txt"
131140 exit 1
132141 elif [ ! -s " $RUNDIR /logs/job.txt" ]; then
133- echo " ❌ Logs were not captured locally: empty logs/job.txt"
142+ echo " [Test] ❌ Logs were not captured locally: empty logs/job.txt"
134143 exit 1
135144 fi
136145
137146 # TODO the expected output is going to be profile-specific
138147 grep -q ' Final result' " $RUNDIR /logs/job.txt" \
139- && echo " ✅ Logs seem good!" \
140- || (echo " ❌ Logs were not captured locally: job logs incomplete" && exit 1)
148+ && echo " [Test] ✅ Logs seem good!" \
149+ || (echo " [Test] ❌ Logs were not captured locally: job logs incomplete" && exit 1)
141150}
142151
143152function logpoller {
@@ -152,18 +161,24 @@ function logpoller {
152161# clean up after ourselves before we exit
153162#
154163function onexit {
164+ if [ -n " $ATTACH_PID " ]; then
165+ echo " !!!!!!!!!KILL ATTACH $ATTACH_PID "
166+ (pkill -P $ATTACH_PID || exit 0)
167+ fi
155168 if [ -n " $HEAD_POLLER_PID " ]; then
156- (kill $HEAD_POLLER_PID || exit 0)
169+ (pkill -P $HEAD_POLLER_PID || exit 0)
157170 fi
158171 if [ -n " $WORKER_POLLER_PID " ]; then
159- (kill $WORKER_POLLER_PID || exit 0)
172+ (pkill -P $WORKER_POLLER_PID || exit 0)
160173 fi
161174 if [ -n " $EVENTS_PID " ]; then
162- (kill $EVENTS_PID || exit 0)
175+ (pkill -P $EVENTS_PID || exit 0)
163176 fi
164177 if [ -n " $AGGREGATOR_POLLER_PID " ]; then
165- (kill $AGGREGATOR_POLLER_PID || exit 0)
178+ (pkill -P $AGGREGATOR_POLLER_PID || exit 0)
166179 fi
180+
181+ pkill -P $$
167182}
168183
169184#
@@ -199,7 +214,7 @@ function test {
199214 # allocate JOB_ID (requires node and `uuid` npm; but we should
200215 # have both for codeflare-cli dev)
201216 export JOB_ID=$( node -e ' console.log(require("uuid").v4())' )
202- echo " Using JOB_ID=$JOB_ID "
217+ echo " [Test] Using JOB_ID=$JOB_ID "
203218
204219 # 1. launch codeflare guidebook run
205220 run " $1 " | tee $OUTPUT &
@@ -216,11 +231,11 @@ function test {
216231 # done
217232 sleep 10
218233
219- echo " About to attach"
220234 attach " $1 " " $JOB_ID "
221235 fi
222236
223237 wait $RUN_PID
238+ echo " [Test] Run has finished"
224239 # the job should be done now
225240
226241 # 3. if asked, now validate the log aggregator
0 commit comments