@@ -62,7 +62,12 @@ export class MongoLogManager {
6262 }
6363
6464 /** Clean up log files older than `retentionDays`. */
65- async cleanupOldLogFiles ( maxDurationMs = 5_000 ) : Promise < void > {
65+ async cleanupOldLogFiles ( maxDurationMs = 5_000 , remainingRetries = 1 ) : Promise < void > {
66+ const deletionStartTimestamp = Date . now ( ) ;
67+ // Delete files older than N days
68+ const deletionCutoffTimestamp =
69+ deletionStartTimestamp - this . _options . retentionDays * 86400 * 1000 ;
70+
6671 const dir = this . _options . directory ;
6772 let dirHandle ;
6873 try {
@@ -71,10 +76,6 @@ export class MongoLogManager {
7176 return ;
7277 }
7378
74- const deletionStartTimestamp = Date . now ( ) ;
75- // Delete files older than N days
76- const deletionCutoffTimestamp =
77- deletionStartTimestamp - this . _options . retentionDays * 86400 * 1000 ;
7879 // Store the known set of least recent files in a heap in order to be able to
7980 // delete all but the most recent N files.
8081 const leastRecentFileHeap = new Heap < {
@@ -85,55 +86,66 @@ export class MongoLogManager {
8586
8687 let usedStorageSize = this . _options . retentionGB ? 0 : - Infinity ;
8788
88- for await ( const dirent of dirHandle ) {
89- // Cap the overall time spent inside this function. Consider situations like
90- // a large number of machines using a shared network-mounted $HOME directory
91- // where lots and lots of log files end up and filesystem operations happen
92- // with network latency.
93- if ( Date . now ( ) - deletionStartTimestamp > maxDurationMs ) break ;
94-
95- if ( ! dirent . isFile ( ) ) continue ;
96- const logRegExp = new RegExp (
97- `^${ this . prefix } (?<id>[a-f0-9]{24})_log(\\.gz)?$` ,
98- 'i'
99- ) ;
100- const { id } = logRegExp . exec ( dirent . name ) ?. groups ?? { } ;
101- if ( ! id ) continue ;
102-
103- const fileTimestamp = + new ObjectId ( id ) . getTimestamp ( ) ;
104- const fullPath = path . join ( dir , dirent . name ) ;
105-
106- // If the file is older than expected, delete it. If the file is recent,
107- // add it to the list of seen files, and if that list is too large, remove
108- // the least recent file we've seen so far.
109- if ( fileTimestamp < deletionCutoffTimestamp ) {
110- await this . deleteFile ( fullPath ) ;
111- continue ;
112- }
113-
114- let fileSize : number | undefined ;
115- if ( this . _options . retentionGB ) {
116- try {
117- fileSize = ( await fs . stat ( fullPath ) ) . size ;
118- usedStorageSize += fileSize ;
119- } catch ( err ) {
120- this . _options . onerror ( err as Error , fullPath ) ;
89+ try {
90+ for await ( const dirent of dirHandle ) {
91+ // Cap the overall time spent inside this function. Consider situations like
92+ // a large number of machines using a shared network-mounted $HOME directory
93+ // where lots and lots of log files end up and filesystem operations happen
94+ // with network latency.
95+ if ( Date . now ( ) - deletionStartTimestamp > maxDurationMs ) break ;
96+
97+ if ( ! dirent . isFile ( ) ) continue ;
98+ const logRegExp = new RegExp (
99+ `^${ this . prefix } (?<id>[a-f0-9]{24})_log(\\.gz)?$` ,
100+ 'i'
101+ ) ;
102+ const { id } = logRegExp . exec ( dirent . name ) ?. groups ?? { } ;
103+ if ( ! id ) continue ;
104+
105+ const fileTimestamp = + new ObjectId ( id ) . getTimestamp ( ) ;
106+ const fullPath = path . join ( dir , dirent . name ) ;
107+
108+ // If the file is older than expected, delete it. If the file is recent,
109+ // add it to the list of seen files, and if that list is too large, remove
110+ // the least recent file we've seen so far.
111+ if ( fileTimestamp < deletionCutoffTimestamp ) {
112+ await this . deleteFile ( fullPath ) ;
121113 continue ;
122114 }
115+
116+ let fileSize : number | undefined ;
117+ if ( this . _options . retentionGB ) {
118+ try {
119+ fileSize = ( await fs . stat ( fullPath ) ) . size ;
120+ usedStorageSize += fileSize ;
121+ } catch ( err ) {
122+ this . _options . onerror ( err as Error , fullPath ) ;
123+ continue ;
124+ }
125+ }
126+
127+ if ( this . _options . maxLogFileCount || this . _options . retentionGB ) {
128+ leastRecentFileHeap . push ( { fullPath, fileTimestamp, fileSize } ) ;
129+ }
130+
131+ if (
132+ this . _options . maxLogFileCount &&
133+ leastRecentFileHeap . size ( ) > this . _options . maxLogFileCount
134+ ) {
135+ const toDelete = leastRecentFileHeap . pop ( ) ;
136+ if ( ! toDelete ) continue ;
137+ await this . deleteFile ( toDelete . fullPath ) ;
138+ usedStorageSize -= toDelete . fileSize ?? 0 ;
139+ }
123140 }
124-
125- if ( this . _options . maxLogFileCount || this . _options . retentionGB ) {
126- leastRecentFileHeap . push ( { fullPath, fileTimestamp, fileSize } ) ;
127- }
128-
129- if (
130- this . _options . maxLogFileCount &&
131- leastRecentFileHeap . size ( ) > this . _options . maxLogFileCount
132- ) {
133- const toDelete = leastRecentFileHeap . pop ( ) ;
134- if ( ! toDelete ) continue ;
135- await this . deleteFile ( toDelete . fullPath ) ;
136- usedStorageSize -= toDelete . fileSize ?? 0 ;
141+ } catch ( statErr : any ) {
142+ // Multiple processes may attempt to clean up log files in parallel.
143+ // A situation can arise where one process tries to read a file
144+ // that another process has already unlinked (see MONGOSH-1914).
145+ // To handle such scenarios, we will catch lstat errors and retry cleaning up
146+ // to let different processes reach out to different log files.
147+ if ( statErr . code === 'ENOENT' && remainingRetries > 0 ) {
148+ await this . cleanupOldLogFiles ( maxDurationMs - ( Date . now ( ) - deletionStartTimestamp ) , remainingRetries - 1 ) ;
137149 }
138150 }
139151
0 commit comments