Skip to content

Commit e12309d

Browse files
craig[bot]iskettaneh
andcommitted
Merge #155180
155180: roachtest: add profile collection step at the end of tpcc r=iskettaneh a=iskettaneh Similar to what we do in sysbench, this commit performs a small 75s run at the end of tpcc roachtest runs, and collects the different profiles. This will be useful to update our PGO process to use profiles taken from tpcc runs instead of sysbench. References: #147400 Release note: None Co-authored-by: Ibrahim Kettaneh <[email protected]>
2 parents 7d15c71 + 60ed59d commit e12309d

File tree

1 file changed

+137
-0
lines changed

1 file changed

+137
-0
lines changed

pkg/cmd/roachtest/tests/tpcc.go

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ import (
4242
"github.com/cockroachdb/ttycolor"
4343
"github.com/cockroachdb/version"
4444
"github.com/codahale/hdrhistogram"
45+
"github.com/google/pprof/profile"
4546
"github.com/lib/pq"
4647
promapi "github.com/prometheus/client_golang/api"
4748
promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
@@ -469,6 +470,98 @@ func runTPCC(
469470
}
470471
m.Wait()
471472

473+
// Collect profiles by running a short workload
474+
t.Status("running 75 second workload to collect profiles")
475+
476+
// Capture existing workload log files before collecting profiles. This will
477+
// allow us to clean up any new workload log files that are created during
478+
// the profile collection run.
479+
logFileNamePattern := "run_*_n*_cockroach-workload-*.log"
480+
existingWorkloadLogs := make(map[string]struct{})
481+
if matches, _ := filepath.Glob(filepath.Join(t.ArtifactsDir(),
482+
logFileNamePattern)); matches != nil {
483+
for _, logPath := range matches {
484+
existingWorkloadLogs[logPath] = struct{}{}
485+
}
486+
}
487+
488+
profilesDir := filepath.Join(t.ArtifactsDir(), "1.perf", "profiles")
489+
if err := os.MkdirAll(profilesDir, 0755); err != nil {
490+
t.L().Errorf("Failed to create profiles directory %s: %v", profilesDir, err)
491+
} else {
492+
// Start a short TPCC test in order to collect the profiles from an
493+
// active cluster.
494+
profileM := t.NewErrorGroup(task.WithContext(ctx))
495+
profileM.Go(
496+
func(ctx context.Context, l *logger.Logger) error {
497+
// Run workload for 75 seconds
498+
profileDuration := 75 * time.Second
499+
500+
fileName := roachtestutil.GetBenchmarkMetricsFileName(t)
501+
histogramsPath := fmt.Sprintf("%s/profile_%s", t.PerfArtifactsDir(), fileName)
502+
503+
cmd := roachtestutil.NewCommand("%s workload run %s",
504+
test.DefaultCockroachPath, opts.getWorkloadCmd()).
505+
MaybeFlag(opts.DB != "", "db", opts.DB).
506+
Flag("warehouses", opts.Warehouses).
507+
MaybeFlag(!opts.DisableHistogram, "histograms", histogramsPath).
508+
Flag("ramp", 0*time.Second). // No ramp for profile collection
509+
Flag("duration", profileDuration).
510+
Arg("%s", opts.ExtraRunArgs).
511+
Arg("%s", pgURLs[0])
512+
return c.RunE(ctx, option.WithNodes(c.WorkloadNode()), cmd.String())
513+
},
514+
)
515+
516+
// Wait for 30 seconds to give a chance to the workload to start, and then
517+
// collect CPU, mutex, allocs profiles.
518+
t.L().Printf("waiting 30 seconds for workload to ramp up before collecting profiles")
519+
time.Sleep(30 * time.Second)
520+
521+
collectionDuration := 30 * time.Second
522+
t.L().Printf("starting profile collection from %d nodes for %s",
523+
len(c.CRDBNodes()), collectionDuration)
524+
525+
// Collect the profiles.
526+
profiles := map[string][]*profile.Profile{"cpu": {}, "allocs": {}, "mutex": {}}
527+
for typ := range profiles {
528+
typ := typ // Capture for goroutine
529+
profileM.Go(
530+
func(ctx context.Context, l *logger.Logger) error {
531+
var err error
532+
profiles[typ], err = roachtestutil.GetProfile(ctx, t, c, typ,
533+
collectionDuration, c.CRDBNodes())
534+
return err
535+
},
536+
)
537+
}
538+
539+
// If there is a problem executing the workload or there is a problem
540+
// collecting the profiles we need to clean up the directory and log the error.
541+
if err := profileM.WaitE(); err != nil {
542+
t.L().Errorf("failed to collect profiles: %v", err)
543+
_ = os.RemoveAll(profilesDir)
544+
} else {
545+
// At this point we know that the workload has not crashed, and we have
546+
// collected all the individual profiles. We can now merge and export them.
547+
if err := mergeAndExportTPCCProfiles(t, c, collectionDuration, profiles, profilesDir); err != nil {
548+
t.L().Errorf("failed to merge and export profiles: %v", err)
549+
_ = os.RemoveAll(profilesDir)
550+
}
551+
}
552+
553+
// Clean up the profile collection workload log file to reduce clutter.
554+
// Delete any workload-r logs that were created after the main test run.
555+
if matches, _ := filepath.Glob(filepath.Join(t.ArtifactsDir(), logFileNamePattern)); matches != nil {
556+
for _, logPath := range matches {
557+
if _, ok := existingWorkloadLogs[logPath]; !ok {
558+
// This log was created by the profile collection run, remove it
559+
_ = os.Remove(logPath)
560+
}
561+
}
562+
}
563+
}
564+
472565
if !opts.SkipPostRunCheck {
473566
cmd := roachtestutil.NewCommand("%s workload check %s", test.DefaultCockroachPath, opts.getWorkloadCmd()).
474567
MaybeFlag(opts.DB != "", "db", opts.DB).
@@ -488,6 +581,50 @@ func runTPCC(
488581
}
489582
}
490583

584+
// mergeAndExportTPCCProfiles accepts a map of individual profiles of each
585+
// node of different types (cpu, allocs, mutex), and exports them to the
586+
// specified directory. Also, it merges them and exports the merged profiles
587+
// to the same directory.
588+
func mergeAndExportTPCCProfiles(
589+
t test.Test,
590+
c cluster.Cluster,
591+
duration time.Duration,
592+
profiles map[string][]*profile.Profile,
593+
profilesDir string,
594+
) error {
595+
// Merge the profiles.
596+
mergedProfiles := map[string]*profile.Profile{"cpu": {}, "allocs": {}, "mutex": {}}
597+
for typ := range mergedProfiles {
598+
var err error
599+
if mergedProfiles[typ], err = profile.Merge(profiles[typ]); err != nil {
600+
return errors.Wrapf(err, "failed to merge profiles type: %s", typ)
601+
}
602+
}
603+
604+
// Export the merged profiles.
605+
for typ := range mergedProfiles {
606+
filename := fmt.Sprintf("merged.%s.pb.gz", typ)
607+
if err := roachtestutil.ExportProfile(mergedProfiles[typ], profilesDir, filename); err != nil {
608+
return errors.Wrapf(err, "failed to export merged profiles: %s", typ)
609+
}
610+
t.L().Printf("successfully exported merged profile: %s of type %s", filename, typ)
611+
}
612+
613+
// Export the individual profiles as well.
614+
numNodes := len(c.CRDBNodes())
615+
for i := range numNodes {
616+
for typ := range profiles {
617+
filename := fmt.Sprintf("n%d.%s%s.pb.gz", i+1, typ, duration)
618+
if err := roachtestutil.ExportProfile(profiles[typ][i], profilesDir, filename); err != nil {
619+
return errors.Wrapf(err, "failed to export individual profile type: %s for node %d", typ, i+1)
620+
}
621+
t.L().Printf("successfully exported individual profile %s of type %s for node %d", filename, typ, i+1)
622+
}
623+
}
624+
625+
return nil
626+
}
627+
491628
// tpccSupportedWarehouses returns our claim for the maximum number of tpcc
492629
// warehouses we support for a given hardware configuration.
493630
//

0 commit comments

Comments
 (0)