@@ -764,32 +764,69 @@ func (c *Manager) MemoryEventFD() (int, uint32, error) {
764764 return fd , uint32 (wd ), nil
765765}
766766
767- func (c * Manager ) EventChan () (<- chan Event , <- chan error ) {
768- ec := make (chan Event )
769- errCh := make (chan error , 1 )
770- go c .waitForEvents (ec , errCh )
767+ // memoryEventNonBlockFD returns a non-blocking inotify file descriptor that watches
768+ // memory.events and cgroup.events.
769+ //
770+ // NOTE: A blocking FD is expensive because unix.Read blocks the calling thread until
770+ // data is available to read. In high-scale scenarios, this creates a lot of threads.
771+ func (c * Manager ) memoryEventNonBlockFD () (_ * os.File , retErr error ) {
771772
772- return ec , errCh
773+ rawFd , err := unix .InotifyInit1 (unix .IN_CLOEXEC | unix .IN_NONBLOCK )
774+ if err != nil {
775+ return nil , fmt .Errorf ("failed to create inotify fd: %w" , err )
776+ }
777+
778+ fd := os .NewFile (uintptr (rawFd ), "inotifyfd" )
779+ defer func () {
780+ if retErr != nil {
781+ fd .Close ()
782+ }
783+ }()
784+
785+ fpath := filepath .Join (c .path , "memory.events" )
786+ if _ , err := unix .InotifyAddWatch (rawFd , fpath , unix .IN_MODIFY ); err != nil {
787+ return nil , fmt .Errorf ("failed to add inotify watch for %q: %w" , fpath , err )
788+ }
789+
790+ // Also watch cgroup.events to detect process exit or cgroup deletion.
791+ evpath := filepath .Join (c .path , "cgroup.events" )
792+ if _ , err = unix .InotifyAddWatch (rawFd , evpath , unix .IN_MODIFY ); err != nil {
793+ return nil , fmt .Errorf ("failed to add inotify watch for %q: %w" , evpath , err )
794+ }
795+ return fd , nil
773796}
774797
775- func (c * Manager ) waitForEvents (ec chan <- Event , errCh chan <- error ) {
776- defer close (errCh )
798+ func (c * Manager ) EventChan () (<- chan Event , <- chan error ) {
799+ ec := make (chan Event , 1 )
800+ errCh := make (chan error , 1 )
777801
778- fd , _ , err := c .MemoryEventFD ()
802+ fd , err := c .memoryEventNonBlockFD ()
779803 if err != nil {
780804 errCh <- err
781- return
805+ return ec , errCh
782806 }
783- defer unix .Close (fd )
784807
785- for {
786- buffer := make ([]byte , unix .SizeofInotifyEvent * 10 )
787- bytesRead , err := unix .Read (fd , buffer )
788- if err != nil {
789- errCh <- err
790- return
791- }
792- if bytesRead >= unix .SizeofInotifyEvent {
808+ go func () {
809+ defer close (errCh )
810+ defer fd .Close ()
811+
812+ for {
813+ buffer := make ([]byte , unix .SizeofInotifyEvent * 10 )
814+ bytesRead , err := fd .Read (buffer )
815+ if err != nil {
816+ errCh <- err
817+ return
818+ }
819+
820+ if bytesRead < unix .SizeofInotifyEvent {
821+ continue
822+ }
823+
824+ // Check cgroup.events first
825+ shouldExit := false
826+ if c .isCgroupEmpty () {
827+ shouldExit = true
828+ }
829+
793830 out := make (map [string ]uint64 )
794831 if err := readKVStatsFile (c .path , "memory.events" , out ); err != nil {
795832 // When cgroup is deleted read may return -ENODEV instead of -ENOENT from open.
@@ -798,18 +835,21 @@ func (c *Manager) waitForEvents(ec chan<- Event, errCh chan<- error) {
798835 }
799836 return
800837 }
838+
801839 ec <- Event {
802840 Low : out ["low" ],
803841 High : out ["high" ],
804842 Max : out ["max" ],
805843 OOM : out ["oom" ],
806844 OOMKill : out ["oom_kill" ],
807845 }
808- if c .isCgroupEmpty () {
846+
847+ if shouldExit {
809848 return
810849 }
811850 }
812- }
851+ }()
852+ return ec , errCh
813853}
814854
815855func setDevices (path string , devices []specs.LinuxDeviceCgroup ) error {
0 commit comments