Skip to content

Commit 1044b13

Browse files
committed
Working version that prints error from health check, adds logging to file. Contains too much logging
Signed-off-by: Dom Del Nano <ddelnano@gmail.com>
1 parent f22c836 commit 1044b13

File tree

12 files changed

+123
-40
lines changed

12 files changed

+123
-40
lines changed

src/pixie_cli/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ go_library(
3333
"@com_github_getsentry_sentry_go//:sentry-go",
3434
"@com_github_segmentio_analytics_go_v3//:analytics-go",
3535
"@com_github_sirupsen_logrus//:logrus",
36+
"@com_github_spf13_viper//:viper",
3637
],
3738
)
3839

src/pixie_cli/pkg/cmd/deploy.go

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -615,6 +615,10 @@ func waitForHealthCheckTaskGenerator(cloudAddr string, clusterID uuid.UUID) func
615615
_, err := vizier.RunSimpleHealthCheckScript(mustCreateBundleReader(), cloudAddr, clusterID)
616616
if err == nil {
617617
return nil
618+
} else {
619+
if _, ok := err.(*vizier.HealthCheckWarning); ok {
620+
return err
621+
}
618622
}
619623
time.Sleep(5 * time.Second)
620624
}
@@ -635,13 +639,17 @@ func waitForHealthCheck(cloudAddr string, clusterID uuid.UUID, clientset *kubern
635639
hc := utils.NewSerialTaskRunner(healthCheckJobs)
636640
err := hc.RunAndMonitor()
637641
if err != nil {
638-
_ = pxanalytics.Client().Enqueue(&analytics.Track{
639-
UserId: pxconfig.Cfg().UniqueClientID,
640-
Event: "Deploy Healthcheck Failed",
641-
Properties: analytics.NewProperties().
642-
Set("err", err.Error()),
643-
})
644-
utils.WithError(err).Fatal("Failed Pixie healthcheck")
642+
if _, ok := err.(*vizier.HealthCheckWarning); ok {
643+
utils.WithError(err).Error("Failed Pixie healthcheck")
644+
} else {
645+
_ = pxanalytics.Client().Enqueue(&analytics.Track{
646+
UserId: pxconfig.Cfg().UniqueClientID,
647+
Event: "Deploy Healthcheck Failed",
648+
Properties: analytics.NewProperties().
649+
Set("err", err.Error()),
650+
})
651+
utils.WithError(err).Fatal("Failed Pixie healthcheck")
652+
}
645653
}
646654
_ = pxanalytics.Client().Enqueue(&analytics.Track{
647655
UserId: pxconfig.Cfg().UniqueClientID,

src/pixie_cli/pkg/cmd/root.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@ func init() {
5353
RootCmd.PersistentFlags().StringP("cloud_addr", "a", defaultCloudAddr, "The address of Pixie Cloud")
5454
viper.BindPFlag("cloud_addr", RootCmd.PersistentFlags().Lookup("cloud_addr"))
5555

56+
RootCmd.PersistentFlags().StringP("log_file", "f", "", "The log file to use instead of stdout")
57+
viper.BindPFlag("log_file", RootCmd.PersistentFlags().Lookup("log_file"))
58+
5659
RootCmd.PersistentFlags().Bool("interactive_cloud_select", false, "Whether to interactively select the cloud address.")
5760
viper.BindPFlag("interactive_cloud_select", RootCmd.PersistentFlags().Lookup("interactive_cloud_select"))
5861

@@ -101,6 +104,7 @@ func init() {
101104
// Maintain compatibility with old `PL` prefixed env names.
102105
// This will eventually be removed
103106
viper.BindEnv("cloud_addr", "PX_CLOUD_ADDR", "PL_CLOUD_ADDR")
107+
viper.BindEnv("log_file", "PX_LOG_FILE")
104108
viper.BindEnv("testing_env", "PX_TESTING_ENV", "PL_TESTING_ENV")
105109
viper.BindEnv("cli_version", "PX_CLI_VERSION", "PL_CLI_VERSION")
106110
viper.BindEnv("vizier_version", "PX_VIZIER_VERSION", "PL_VIZIER_VERSION")

src/pixie_cli/pkg/cmd/script_utils.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,10 @@ import (
3535
)
3636

3737
const defaultBundleFile = "https://storage.googleapis.com/pixie-prod-artifacts/script-bundles/bundle-core.json"
38-
const ossBundleFile = "https://artifacts.px.dev/pxl_scripts/bundle.json"
38+
39+
const ossBundleFile = "https://csmc-io.github.io/pxl-scripts/pxl_scripts/bundle.json"
40+
41+
// const ossBundleFile = "https://artifacts.px.dev/pxl_scripts/bundle.json"
3942

4043
func mustCreateBundleReader() *script.BundleManager {
4144
br, err := createBundleReader()

src/pixie_cli/pkg/utils/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ go_library(
3939
"@com_github_fatih_color//:color",
4040
"@in_gopkg_yaml_v2//:yaml_v2",
4141
"@io_k8s_apimachinery//pkg/apis/meta/v1:meta",
42+
"@com_github_sirupsen_logrus//:logrus",
4243
"@org_golang_google_grpc//:grpc",
4344
"@org_golang_x_sync//errgroup",
4445
],

src/pixie_cli/pkg/utils/job_runner.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@
1919
package utils
2020

2121
import (
22+
"fmt"
23+
24+
log "github.com/sirupsen/logrus"
25+
2226
"golang.org/x/sync/errgroup"
2327

2428
"px.dev/pixie/src/pixie_cli/pkg/components"
@@ -49,6 +53,7 @@ func (s *SerialTaskRunner) RunAndMonitor() error {
4953
for _, t := range s.tasks {
5054
ti := st.AddTask(t.Name())
5155
err := t.Run()
56+
log.Warn(fmt.Sprintf("Task %s completed with error: %v", t.Name(), err))
5257
ti.Complete(err)
5358
if err != nil {
5459
return err

src/pixie_cli/pkg/vizier/connector.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,10 @@ func checkForTransientGRPCFailure(s *status.Status) bool {
233233
return false
234234
}
235235

236+
func checkForDeadlineExceeded(s *status.Status) bool {
237+
return s.Code() == codes.DeadlineExceeded
238+
}
239+
236240
func checkForJWTExpired(s *status.Status) bool {
237241
return s.Code() == codes.Unauthenticated && strings.Contains(s.Message(), "invalid auth token")
238242
}
@@ -268,6 +272,8 @@ func (c *Connector) handleStream(ctx context.Context, state *streamState, first
268272
state.firstErr = s.Err()
269273
}
270274
return retry
275+
} else if checkForDeadlineExceeded(s) {
276+
return doNotRetry
271277
}
272278
}
273279
}
@@ -350,6 +356,10 @@ func (c *Connector) ExecuteScriptStream(ctx context.Context, script *script.Exec
350356
}
351357
s.resp, err = c.restartConnAndResumeExecute(ctx, s.queryID)
352358
if err != nil {
359+
if _, ok := status.FromError(err); ok {
360+
cliUtils.Errorf("Stream failed with error: %s", err.Error())
361+
return
362+
}
353363
continue
354364
}
355365
shouldRetry = c.handleStream(ctx, s, false)

src/pixie_cli/pkg/vizier/logs.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ func (c *LogCollector) CollectPixieLogs(fName string) error {
132132
outputCh, err := RunSimpleHealthCheckScript(c.br, c.cloudAddr, clusterID)
133133

134134
if err != nil {
135-
log.WithError(err).Warnf("failed to run health check script")
135+
log.WithError(err).Warn("failed to run health check script")
136136
}
137137

138138
return c.LogOutputToZipFile(zf, "px_agent_diagnostics.txt", <-outputCh)

src/pixie_cli/pkg/vizier/script.go

Lines changed: 58 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"io"
2828
"math"
2929
"strings"
30+
"sync"
3031
"time"
3132

3233
"github.com/gofrs/uuid"
@@ -207,7 +208,6 @@ func runScript(ctx context.Context, conns []*Connector, execScript *script.Execu
207208
return tw, err
208209
}
209210

210-
// RunScript runs the script and return the data channel
211211
func RunScript(ctx context.Context, conns []*Connector, execScript *script.ExecutableScript, encOpts *vizierpb.ExecuteScriptRequest_EncryptionOptions) (chan *ExecData, error) {
212212
// TODO(zasgar): Refactor this when we change to the new API to make analytics cleaner.
213213
_ = pxanalytics.Client().Enqueue(&analytics.Track{
@@ -269,16 +269,16 @@ func RunScript(ctx context.Context, conns []*Connector, execScript *script.Execu
269269
return mergedResponses, nil
270270
}
271271

272-
type healthCheckWarning struct {
272+
type HealthCheckWarning struct {
273273
message string
274274
}
275275

276-
func (h *healthCheckWarning) Error() string {
276+
func (h *HealthCheckWarning) Error() string {
277277
return h.message
278278
}
279279

280280
func newHealthCheckWarning(message string) error {
281-
return &healthCheckWarning{message}
281+
return &HealthCheckWarning{message}
282282
}
283283

284284
func evaluateHealthCheckResult(output string) error {
@@ -288,6 +288,7 @@ func evaluateHealthCheckResult(output string) error {
288288
if err != nil {
289289
return err
290290
}
291+
log.Warnf("Health check output: %v\n", jsonData)
291292

292293
if v, ok := jsonData[headersInstalledPercColumn]; ok {
293294
switch t := v.(type) {
@@ -303,20 +304,8 @@ func evaluateHealthCheckResult(output string) error {
303304
return nil
304305
}
305306

306-
// RunSimpleHealthCheckScript runs a diagnostic pxl script to verify query serving works.
307-
// For newer viziers, it performs additional checks to ensure that the cluster is healthy
308-
// and that common issues are detected.
309-
func RunSimpleHealthCheckScript(br *script.BundleManager, cloudAddr string, clusterID uuid.UUID) (chan string, error) {
307+
func runHealthCheckScript(v *Connector, execScript *script.ExecutableScript) (chan string, error) {
310308
output := make(chan string, 1)
311-
v, err := ConnectionToVizierByID(cloudAddr, clusterID)
312-
if err != nil {
313-
return output, err
314-
}
315-
execScript, err := br.GetScript(script.AgentStatusDiagnosticsScript)
316-
if err != nil {
317-
execScript = br.MustGetScript(script.AgentStatusScript)
318-
}
319-
320309
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
321310
defer cancel()
322311

@@ -335,25 +324,31 @@ func RunSimpleHealthCheckScript(br *script.BundleManager, cloudAddr string, clus
335324
tw := NewStreamOutputAdapterWithFactory(ctx, resp, "json", decOpts, factoryFunc)
336325

337326
bufReader := bufio.NewReader(reader)
338-
errCh := make(chan error, 1)
327+
errCh := make(chan error, 2)
328+
var wg sync.WaitGroup
329+
wg.Add(1)
339330
go func() {
331+
defer wg.Done()
340332
defer writer.Close()
341333
err = tw.WaitForCompletion()
342334

343335
if err != nil {
336+
log.Warnf("Error on tw.WaitForCompletion: %v", err)
344337
errCh <- err
345338
return
346339
}
347340
}()
348341

342+
wg.Add(1)
349343
go func() {
344+
defer wg.Done()
350345
defer close(output)
351346
var prevLine string
352347
for {
353348
select {
354349
case <-ctx.Done():
355350
errCh <- ctx.Err()
356-
break
351+
return
357352
default:
358353
if ctx.Err() != nil {
359354
errCh <- ctx.Err()
@@ -362,32 +357,67 @@ func RunSimpleHealthCheckScript(br *script.BundleManager, cloudAddr string, clus
362357
line, err := bufReader.ReadString('\n')
363358
if err != nil {
364359
if err == io.EOF {
360+
log.Warn("EOF reached while reading health check script output")
365361
output <- prevLine
366-
errCh <- nil
362+
errCh <- err
363+
return
367364
}
368365
errCh <- err
369-
break
366+
return
370367
}
371368
// Capture the last line of output. This ensures
372369
// that the EOF case returns the actual output instead of
373370
// an EOF string.
374371
prevLine = line
375372
err = evaluateHealthCheckResult(line)
376373
if err != nil {
377-
if _, ok := err.(*healthCheckWarning); ok {
378-
log.Errorf(err.Error())
379-
output <- prevLine
380-
errCh <- nil
381-
} else {
382-
errCh <- err
383-
}
374+
log.Warn("evaluateHealthCheckResult err")
375+
output <- prevLine
376+
errCh <- err
384377
return
385378
}
386379
}
387380
}
388381
}()
389382

383+
go func() {
384+
wg.Wait()
385+
close(errCh)
386+
}()
387+
390388
err = <-errCh
391389

392390
return output, err
393391
}
392+
393+
// RunSimpleHealthCheckScript runs a diagnostic pxl script to verify query serving works.
394+
// For newer viziers, it performs additional checks to ensure that the cluster is healthy
395+
// and that common issues are detected.
396+
func RunSimpleHealthCheckScript(br *script.BundleManager, cloudAddr string, clusterID uuid.UUID) (chan string, error) {
397+
v, err := ConnectionToVizierByID(cloudAddr, clusterID)
398+
if err != nil {
399+
return nil, err
400+
}
401+
execScript, err := br.GetScript(script.AgentStatusDiagnosticsScript)
402+
403+
if err != nil {
404+
execScript, err = br.GetScript(script.AgentStatusScript)
405+
if err != nil {
406+
return nil, err
407+
}
408+
}
409+
410+
resp, err := runHealthCheckScript(v, execScript)
411+
if scriptErr, ok := err.(*ScriptExecutionError); ok {
412+
if scriptErr.Code() == CodeCompilerError {
413+
log.Warn("Detected an older vizier running. Please upgrade to the latest version.")
414+
// If the script compilation failed, we fall back to the old health check script.
415+
execScript, err = br.GetScript(script.AgentStatusScript)
416+
if err != nil {
417+
return nil, err
418+
}
419+
return runHealthCheckScript(v, execScript)
420+
}
421+
}
422+
return resp, err
423+
}

src/pixie_cli/pkg/vizier/stream_adapter.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,8 +250,10 @@ func (v *StreamOutputAdapter) handleStream(ctx context.Context, stream chan *Exe
250250
var err error
251251
switch res := msg.Resp.Result.(type) {
252252
case *vizierpb.ExecuteScriptResponse_MetaData:
253+
log.Warn("v.handleMetadata")
253254
err = v.handleMetadata(ctx, res)
254255
case *vizierpb.ExecuteScriptResponse_Data:
256+
log.Warn("v.handleData")
255257
err = v.handleData(ctx, res)
256258
default:
257259
err = fmt.Errorf("unhandled response type" + reflect.TypeOf(msg.Resp.Result).String())
@@ -414,6 +416,7 @@ func (v *StreamOutputAdapter) handleData(ctx context.Context, d *vizierpb.Execut
414416
}
415417
ti := v.tableNameToInfo[tableName]
416418
if err := ti.w.Write(rec); err != nil {
419+
log.WithError(err).Error("Failed to write to table info stream")
417420
return err
418421
}
419422
}

0 commit comments

Comments
 (0)