@@ -36,7 +36,7 @@ export class MongoLogManager {
3636 }
3737
3838 /** Clean up log files older than `retentionDays`. */
39- async cleanupOldLogFiles ( maxDurationMs = 5_000 ) : Promise < void > {
39+ async cleanupOldLogFiles ( maxDurationMs = 5_000 , remainingRetries = 1 ) : Promise < void > {
4040 const dir = this . _options . directory ;
4141 let dirHandle ;
4242 try {
@@ -56,43 +56,54 @@ export class MongoLogManager {
5656 fullPath : string ;
5757 } > ( ( a , b ) => a . fileTimestamp - b . fileTimestamp ) ;
5858
59- for await ( const dirent of dirHandle ) {
60- // Cap the overall time spent inside this function. Consider situations like
61- // a large number of machines using a shared network-mounted $HOME directory
62- // where lots and lots of log files end up and filesystem operations happen
63- // with network latency.
64- if ( Date . now ( ) - deletionStartTimestamp > maxDurationMs ) break ;
65-
66- if ( ! dirent . isFile ( ) ) continue ;
67- const { id } =
68- / ^ (?< id > [ a - f 0 - 9 ] { 24 } ) _ l o g ( \. g z ) ? $ / i. exec ( dirent . name ) ?. groups ?? { } ;
69- if ( ! id ) continue ;
70- const fileTimestamp = + new ObjectId ( id ) . getTimestamp ( ) ;
71- const fullPath = path . join ( dir , dirent . name ) ;
72- let toDelete : string | undefined ;
73-
74- // If the file is older than expected, delete it. If the file is recent,
75- // add it to the list of seen files, and if that list is too large, remove
76- // the least recent file we've seen so far.
77- if ( fileTimestamp < deletionCutoffTimestamp ) {
78- toDelete = fullPath ;
79- } else if ( this . _options . maxLogFileCount ) {
80- leastRecentFileHeap . push ( { fullPath, fileTimestamp } ) ;
81- if ( leastRecentFileHeap . size ( ) > this . _options . maxLogFileCount ) {
82- toDelete = leastRecentFileHeap . pop ( ) ?. fullPath ;
59+ try {
60+ for await ( const dirent of dirHandle ) {
61+ // Cap the overall time spent inside this function. Consider situations like
62+ // a large number of machines using a shared network-mounted $HOME directory
63+ // where lots and lots of log files end up and filesystem operations happen
64+ // with network latency.
65+ if ( Date . now ( ) - deletionStartTimestamp > maxDurationMs ) break ;
66+
67+ if ( ! dirent . isFile ( ) ) continue ;
68+ const { id } =
69+ / ^ (?< id > [ a - f 0 - 9 ] { 24 } ) _ l o g ( \. g z ) ? $ / i. exec ( dirent . name ) ?. groups ?? { } ;
70+ if ( ! id ) continue ;
71+ const fileTimestamp = + new ObjectId ( id ) . getTimestamp ( ) ;
72+ const fullPath = path . join ( dir , dirent . name ) ;
73+ let toDelete : string | undefined ;
74+
75+ // If the file is older than expected, delete it. If the file is recent,
76+ // add it to the list of seen files, and if that list is too large, remove
77+ // the least recent file we've seen so far.
78+ if ( fileTimestamp < deletionCutoffTimestamp ) {
79+ toDelete = fullPath ;
80+ } else if ( this . _options . maxLogFileCount ) {
81+ leastRecentFileHeap . push ( { fullPath, fileTimestamp } ) ;
82+ if ( leastRecentFileHeap . size ( ) > this . _options . maxLogFileCount ) {
83+ toDelete = leastRecentFileHeap . pop ( ) ?. fullPath ;
84+ }
8385 }
84- }
85-
86- if ( ! toDelete ) continue ;
87- try {
88- await fs . unlink ( toDelete ) ;
89- // eslint-disable-next-line @typescript-eslint/no-explicit- any
90- } catch ( err : any ) {
91- if ( err ?. code !== 'ENOENT' ) {
92- // eslint-disable-next-line @typescript-eslint/no-unsafe-argument
93- this . _options . onerror ( err , fullPath ) ;
86+
87+ if ( ! toDelete ) continue ;
88+ try {
89+ await fs . unlink ( toDelete ) ;
90+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
91+ } catch ( err : any ) {
92+ if ( err . code !== 'ENOENT' ) {
93+ // eslint-disable-next-line @typescript-eslint/no-unsafe-argument
94+ this . _options . onerror ( err , fullPath ) ;
95+ }
9496 }
9597 }
98+ } catch ( statErr : any ) {
99+ // Multiple processes may attempt to clean up log files in parallel.
100+ // A situation can arise where one process tries to read a file
101+ // that another process has already unlinked (see MONGOSH-1914).
102+ // To handle such scenarios, we will catch lstat errors and retry cleaning up
103+ // to let different processes reach out to different log files.
104+ if ( statErr . code === 'ENOENT' && remainingRetries > 0 ) {
105+ await this . cleanupOldLogFiles ( maxDurationMs , remainingRetries - 1 ) ;
106+ }
96107 }
97108 }
98109
0 commit comments