@@ -42,6 +42,7 @@ import (
4242 "github.com/cockroachdb/ttycolor"
4343 "github.com/cockroachdb/version"
4444 "github.com/codahale/hdrhistogram"
45+ "github.com/google/pprof/profile"
4546 "github.com/lib/pq"
4647 promapi "github.com/prometheus/client_golang/api"
4748 promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
@@ -469,6 +470,98 @@ func runTPCC(
469470 }
470471 m .Wait ()
471472
473+ // Collect profiles by running a short workload
474+ t .Status ("running 75 second workload to collect profiles" )
475+
476+ // Capture existing workload log files before collecting profiles. This will
477+ // allow us to clean up any new workload log files that are created during
478+ // the profile collection run.
479+ logFileNamePattern := "run_*_n*_cockroach-workload-*.log"
480+ existingWorkloadLogs := make (map [string ]struct {})
481+ if matches , _ := filepath .Glob (filepath .Join (t .ArtifactsDir (),
482+ logFileNamePattern )); matches != nil {
483+ for _ , logPath := range matches {
484+ existingWorkloadLogs [logPath ] = struct {}{}
485+ }
486+ }
487+
488+ profilesDir := filepath .Join (t .ArtifactsDir (), "1.perf" , "profiles" )
489+ if err := os .MkdirAll (profilesDir , 0755 ); err != nil {
490+ t .L ().Errorf ("Failed to create profiles directory %s: %v" , profilesDir , err )
491+ } else {
492+ // Start a short TPCC test in order to collect the profiles from an
493+ // active cluster.
494+ profileM := t .NewErrorGroup (task .WithContext (ctx ))
495+ profileM .Go (
496+ func (ctx context.Context , l * logger.Logger ) error {
497+ // Run workload for 75 seconds
498+ profileDuration := 75 * time .Second
499+
500+ fileName := roachtestutil .GetBenchmarkMetricsFileName (t )
501+ histogramsPath := fmt .Sprintf ("%s/profile_%s" , t .PerfArtifactsDir (), fileName )
502+
503+ cmd := roachtestutil .NewCommand ("%s workload run %s" ,
504+ test .DefaultCockroachPath , opts .getWorkloadCmd ()).
505+ MaybeFlag (opts .DB != "" , "db" , opts .DB ).
506+ Flag ("warehouses" , opts .Warehouses ).
507+ MaybeFlag (! opts .DisableHistogram , "histograms" , histogramsPath ).
508+ Flag ("ramp" , 0 * time .Second ). // No ramp for profile collection
509+ Flag ("duration" , profileDuration ).
510+ Arg ("%s" , opts .ExtraRunArgs ).
511+ Arg ("%s" , pgURLs [0 ])
512+ return c .RunE (ctx , option .WithNodes (c .WorkloadNode ()), cmd .String ())
513+ },
514+ )
515+
516+ // Wait for 30 seconds to give a chance to the workload to start, and then
517+ // collect CPU, mutex, allocs profiles.
518+ t .L ().Printf ("waiting 30 seconds for workload to ramp up before collecting profiles" )
519+ time .Sleep (30 * time .Second )
520+
521+ collectionDuration := 30 * time .Second
522+ t .L ().Printf ("starting profile collection from %d nodes for %s" ,
523+ len (c .CRDBNodes ()), collectionDuration )
524+
525+ // Collect the profiles.
526+ profiles := map [string ][]* profile.Profile {"cpu" : {}, "allocs" : {}, "mutex" : {}}
527+ for typ := range profiles {
528+ typ := typ // Capture for goroutine
529+ profileM .Go (
530+ func (ctx context.Context , l * logger.Logger ) error {
531+ var err error
532+ profiles [typ ], err = roachtestutil .GetProfile (ctx , t , c , typ ,
533+ collectionDuration , c .CRDBNodes ())
534+ return err
535+ },
536+ )
537+ }
538+
539+ // If there is a problem executing the workload or there is a problem
540+ // collecting the profiles we need to clean up the directory and log the error.
541+ if err := profileM .WaitE (); err != nil {
542+ t .L ().Errorf ("failed to collect profiles: %v" , err )
543+ _ = os .RemoveAll (profilesDir )
544+ } else {
545+ // At this point we know that the workload has not crashed, and we have
546+ // collected all the individual profiles. We can now merge and export them.
547+ if err := mergeAndExportTPCCProfiles (t , c , collectionDuration , profiles , profilesDir ); err != nil {
548+ t .L ().Errorf ("failed to merge and export profiles: %v" , err )
549+ _ = os .RemoveAll (profilesDir )
550+ }
551+ }
552+
553+ // Clean up the profile collection workload log file to reduce clutter.
554+ // Delete any workload-r logs that were created after the main test run.
555+ if matches , _ := filepath .Glob (filepath .Join (t .ArtifactsDir (), logFileNamePattern )); matches != nil {
556+ for _ , logPath := range matches {
557+ if _ , ok := existingWorkloadLogs [logPath ]; ! ok {
558+ // This log was created by the profile collection run, remove it
559+ _ = os .Remove (logPath )
560+ }
561+ }
562+ }
563+ }
564+
472565 if ! opts .SkipPostRunCheck {
473566 cmd := roachtestutil .NewCommand ("%s workload check %s" , test .DefaultCockroachPath , opts .getWorkloadCmd ()).
474567 MaybeFlag (opts .DB != "" , "db" , opts .DB ).
@@ -488,6 +581,50 @@ func runTPCC(
488581 }
489582}
490583
584+ // mergeAndExportTPCCProfiles accepts a map of individual profiles of each
585+ // node of different types (cpu, allocs, mutex), and exports them to the
586+ // specified directory. Also, it merges them and exports the merged profiles
587+ // to the same directory.
588+ func mergeAndExportTPCCProfiles (
589+ t test.Test ,
590+ c cluster.Cluster ,
591+ duration time.Duration ,
592+ profiles map [string ][]* profile.Profile ,
593+ profilesDir string ,
594+ ) error {
595+ // Merge the profiles.
596+ mergedProfiles := map [string ]* profile.Profile {"cpu" : {}, "allocs" : {}, "mutex" : {}}
597+ for typ := range mergedProfiles {
598+ var err error
599+ if mergedProfiles [typ ], err = profile .Merge (profiles [typ ]); err != nil {
600+ return errors .Wrapf (err , "failed to merge profiles type: %s" , typ )
601+ }
602+ }
603+
604+ // Export the merged profiles.
605+ for typ := range mergedProfiles {
606+ filename := fmt .Sprintf ("merged.%s.pb.gz" , typ )
607+ if err := roachtestutil .ExportProfile (mergedProfiles [typ ], profilesDir , filename ); err != nil {
608+ return errors .Wrapf (err , "failed to export merged profiles: %s" , typ )
609+ }
610+ t .L ().Printf ("successfully exported merged profile: %s of type %s" , filename , typ )
611+ }
612+
613+ // Export the individual profiles as well.
614+ numNodes := len (c .CRDBNodes ())
615+ for i := range numNodes {
616+ for typ := range profiles {
617+ filename := fmt .Sprintf ("n%d.%s%s.pb.gz" , i + 1 , typ , duration )
618+ if err := roachtestutil .ExportProfile (profiles [typ ][i ], profilesDir , filename ); err != nil {
619+ return errors .Wrapf (err , "failed to export individual profile type: %s for node %d" , typ , i + 1 )
620+ }
621+ t .L ().Printf ("successfully exported individual profile %s of type %s for node %d" , filename , typ , i + 1 )
622+ }
623+ }
624+
625+ return nil
626+ }
627+
491628// tpccSupportedWarehouses returns our claim for the maximum number of tpcc
492629// warehouses we support for a given hardware configuration.
493630//
0 commit comments