@@ -27,6 +27,7 @@ import (
2727 "io"
2828 "math"
2929 "strings"
30+ "sync"
3031 "time"
3132
3233 "github.com/gofrs/uuid"
@@ -207,7 +208,6 @@ func runScript(ctx context.Context, conns []*Connector, execScript *script.Execu
207208 return tw , err
208209}
209210
210- // RunScript runs the script and return the data channel
211211func RunScript (ctx context.Context , conns []* Connector , execScript * script.ExecutableScript , encOpts * vizierpb.ExecuteScriptRequest_EncryptionOptions ) (chan * ExecData , error ) {
212212 // TODO(zasgar): Refactor this when we change to the new API to make analytics cleaner.
213213 _ = pxanalytics .Client ().Enqueue (& analytics.Track {
@@ -269,16 +269,16 @@ func RunScript(ctx context.Context, conns []*Connector, execScript *script.Execu
269269 return mergedResponses , nil
270270}
271271
272- type healthCheckWarning struct {
272+ type HealthCheckWarning struct {
273273 message string
274274}
275275
276- func (h * healthCheckWarning ) Error () string {
276+ func (h * HealthCheckWarning ) Error () string {
277277 return h .message
278278}
279279
280280func newHealthCheckWarning (message string ) error {
281- return & healthCheckWarning {message }
281+ return & HealthCheckWarning {message }
282282}
283283
284284func evaluateHealthCheckResult (output string ) error {
@@ -288,6 +288,7 @@ func evaluateHealthCheckResult(output string) error {
288288 if err != nil {
289289 return err
290290 }
291+ log .Warnf ("Health check output: %v\n " , jsonData )
291292
292293 if v , ok := jsonData [headersInstalledPercColumn ]; ok {
293294 switch t := v .(type ) {
@@ -303,20 +304,8 @@ func evaluateHealthCheckResult(output string) error {
303304 return nil
304305}
305306
306- // RunSimpleHealthCheckScript runs a diagnostic pxl script to verify query serving works.
307- // For newer viziers, it performs additional checks to ensure that the cluster is healthy
308- // and that common issues are detected.
309- func RunSimpleHealthCheckScript (br * script.BundleManager , cloudAddr string , clusterID uuid.UUID ) (chan string , error ) {
307+ func runHealthCheckScript (v * Connector , execScript * script.ExecutableScript ) (chan string , error ) {
310308 output := make (chan string , 1 )
311- v , err := ConnectionToVizierByID (cloudAddr , clusterID )
312- if err != nil {
313- return output , err
314- }
315- execScript , err := br .GetScript (script .AgentStatusDiagnosticsScript )
316- if err != nil {
317- execScript = br .MustGetScript (script .AgentStatusScript )
318- }
319-
320309 ctx , cancel := context .WithTimeout (context .Background (), 5 * time .Second )
321310 defer cancel ()
322311
@@ -335,25 +324,31 @@ func RunSimpleHealthCheckScript(br *script.BundleManager, cloudAddr string, clus
335324 tw := NewStreamOutputAdapterWithFactory (ctx , resp , "json" , decOpts , factoryFunc )
336325
337326 bufReader := bufio .NewReader (reader )
338- errCh := make (chan error , 1 )
327+ errCh := make (chan error , 2 )
328+ var wg sync.WaitGroup
329+ wg .Add (1 )
339330 go func () {
331+ defer wg .Done ()
340332 defer writer .Close ()
341333 err = tw .WaitForCompletion ()
342334
343335 if err != nil {
336+ log .Warnf ("Error on tw.WaitForCompletion: %v" , err )
344337 errCh <- err
345338 return
346339 }
347340 }()
348341
342+ wg .Add (1 )
349343 go func () {
344+ defer wg .Done ()
350345 defer close (output )
351346 var prevLine string
352347 for {
353348 select {
354349 case <- ctx .Done ():
355350 errCh <- ctx .Err ()
356- break
351+ return
357352 default :
358353 if ctx .Err () != nil {
359354 errCh <- ctx .Err ()
@@ -362,32 +357,67 @@ func RunSimpleHealthCheckScript(br *script.BundleManager, cloudAddr string, clus
362357 line , err := bufReader .ReadString ('\n' )
363358 if err != nil {
364359 if err == io .EOF {
360+ log .Warn ("EOF reached while reading health check script output" )
365361 output <- prevLine
366- errCh <- nil
362+ errCh <- err
363+ return
367364 }
368365 errCh <- err
369- break
366+ return
370367 }
371368 // Capture the last line of output. This ensures
372369 // that the EOF case returns the actual output instead of
373370 // an EOF string.
374371 prevLine = line
375372 err = evaluateHealthCheckResult (line )
376373 if err != nil {
377- if _ , ok := err .(* healthCheckWarning ); ok {
378- log .Errorf (err .Error ())
379- output <- prevLine
380- errCh <- nil
381- } else {
382- errCh <- err
383- }
374+ log .Warn ("evaluateHealthCheckResult err" )
375+ output <- prevLine
376+ errCh <- err
384377 return
385378 }
386379 }
387380 }
388381 }()
389382
383+ go func () {
384+ wg .Wait ()
385+ close (errCh )
386+ }()
387+
390388 err = <- errCh
391389
392390 return output , err
393391}
392+
393+ // RunSimpleHealthCheckScript runs a diagnostic pxl script to verify query serving works.
394+ // For newer viziers, it performs additional checks to ensure that the cluster is healthy
395+ // and that common issues are detected.
396+ func RunSimpleHealthCheckScript (br * script.BundleManager , cloudAddr string , clusterID uuid.UUID ) (chan string , error ) {
397+ v , err := ConnectionToVizierByID (cloudAddr , clusterID )
398+ if err != nil {
399+ return nil , err
400+ }
401+ execScript , err := br .GetScript (script .AgentStatusDiagnosticsScript )
402+
403+ if err != nil {
404+ execScript , err = br .GetScript (script .AgentStatusScript )
405+ if err != nil {
406+ return nil , err
407+ }
408+ }
409+
410+ resp , err := runHealthCheckScript (v , execScript )
411+ if scriptErr , ok := err .(* ScriptExecutionError ); ok {
412+ if scriptErr .Code () == CodeCompilerError {
413+ log .Warn ("Detected an older vizier running. Please upgrade to the latest version." )
414+ // If the script compilation failed, we fall back to the old health check script.
415+ execScript , err = br .GetScript (script .AgentStatusScript )
416+ if err != nil {
417+ return nil , err
418+ }
419+ return runHealthCheckScript (v , execScript )
420+ }
421+ }
422+ return resp , err
423+ }
0 commit comments