@@ -11,6 +11,7 @@ import (
1111 "sync/atomic"
1212 "time"
1313
14+ "github.com/cenkalti/backoff/v4"
1415 "golang.org/x/exp/maps"
1516 "golang.org/x/sync/errgroup"
1617
@@ -64,6 +65,9 @@ type processRegistryMetrics struct {
6465 mappingsFailedScheduleUpload metrics.Counter
6566 mappingsFailedNameToHandleAt metrics.Counter
6667 mappingsFailedELFVaddrRetrieval metrics.Counter
68+
69+ processesWithEmptyEnvironment metrics.Counter
70+ processEnvironmentWaitDelay metrics.Counter
6771}
6872
6973type mappingImpl struct {
@@ -215,6 +219,8 @@ func NewProcessRegistry(
215219 mappingsFailedScheduleUpload : m .WithTags (map [string ]string {"kind" : "failed_schedule_upload" }).Counter ("mappings.count" ),
216220 mappingsFailedNameToHandleAt : m .WithTags (map [string ]string {"kind" : "failed_name_to_handle_at" }).Counter ("mappings.count" ),
217221 mappingsFailedELFVaddrRetrieval : m .WithTags (map [string ]string {"kind" : "failed_elf_vaddr_retrieval" }).Counter ("mappings.count" ),
222+ processesWithEmptyEnvironment : m .Counter ("processes.with_empty_environment.count" ),
223+ processEnvironmentWaitDelay : m .Counter ("environment.wait_delay.total.milliseconds" ),
218224 },
219225 processScanner : processScanner ,
220226 listeners : listeners ,
@@ -896,13 +902,52 @@ func iterateMappingLPMSegments(m Mapping, callback func(address uint64, prefix u
896902////////////////////////////////////////////////////////////////////////////////
897903
898904func (a * processAnalyzer ) loadEnvs (ctx context.Context ) error {
899- envs , err := procfs .Process (a .proc .currentNamespaceID ).ListEnvs ()
900- if err != nil {
901- return err
902- }
905+ proc := procfs .Process (a .proc .currentNamespaceID )
906+ backoff := backoff .NewExponentialBackOff (
907+ backoff .WithInitialInterval (1 * time .Millisecond ),
908+ backoff .WithMultiplier (2 ),
909+ backoff .WithMaxElapsedTime (1 * time .Second ),
910+ )
911+ backoff .Reset ()
912+ defer func () {
913+ a .reg .metrics .processEnvironmentWaitDelay .Add (backoff .GetElapsedTime ().Milliseconds ())
914+ }()
915+ // TODO(PERFORATOR-1102): loop here is hacky attempt to work around some
916+ // race conditions when we fail to observe correct process environment shortly after process creation.
917+ for i := 0 ; ; i ++ {
918+ envs , err := proc .ListEnvs ()
919+ if err != nil {
920+ return err
921+ }
903922
904- a .log .Debug (ctx , "Put process envs" , log .Int ("env_count" , len (envs )))
905- a .proc .setEnvs (envs )
923+ if len (envs ) > 0 {
924+ a .log .Debug (
925+ ctx ,
926+ "Put process envs" ,
927+ log .Int ("env_count" , len (envs )),
928+ log .Int ("attempts" , i ),
929+ )
930+ a .proc .setEnvs (envs )
931+ break
932+ }
933+
934+ // we read empty environment.
935+ // While this is technically possible, it is more likely a race
936+ // with a newly created process.
937+ sleepFor := backoff .NextBackOff ()
938+ if sleepFor == backoff .Stop {
939+ // Level is not DEBUG because it is the only sign of a possible race
940+ // and processes with actually empty environment are likely to be rare.
941+ a .log .Info (ctx , "Process seems to have empty environment" )
942+ a .reg .metrics .processesWithEmptyEnvironment .Inc ()
943+ break
944+ }
945+ select {
946+ case <- ctx .Done ():
947+ return fmt .Errorf ("canceled while obtaining process environment: %w" , context .Cause (ctx ))
948+ case <- time .After (sleepFor ):
949+ }
950+ }
906951 return nil
907952}
908953
0 commit comments