@@ -36,7 +36,7 @@ export class MongoLogManager {
36
36
}
37
37
38
38
/** Clean up log files older than `retentionDays`. */
39
- async cleanupOldLogFiles ( maxDurationMs = 5_000 ) : Promise < void > {
39
+ async cleanupOldLogFiles ( maxDurationMs = 5_000 , remainingRetries = 1 ) : Promise < void > {
40
40
const dir = this . _options . directory ;
41
41
let dirHandle ;
42
42
try {
@@ -56,43 +56,54 @@ export class MongoLogManager {
56
56
fullPath : string ;
57
57
} > ( ( a , b ) => a . fileTimestamp - b . fileTimestamp ) ;
58
58
59
- for await ( const dirent of dirHandle ) {
60
- // Cap the overall time spent inside this function. Consider situations like
61
- // a large number of machines using a shared network-mounted $HOME directory
62
- // where lots and lots of log files end up and filesystem operations happen
63
- // with network latency.
64
- if ( Date . now ( ) - deletionStartTimestamp > maxDurationMs ) break ;
65
-
66
- if ( ! dirent . isFile ( ) ) continue ;
67
- const { id } =
68
- / ^ (?< id > [ a - f 0 - 9 ] { 24 } ) _ l o g ( \. g z ) ? $ / i. exec ( dirent . name ) ?. groups ?? { } ;
69
- if ( ! id ) continue ;
70
- const fileTimestamp = + new ObjectId ( id ) . getTimestamp ( ) ;
71
- const fullPath = path . join ( dir , dirent . name ) ;
72
- let toDelete : string | undefined ;
73
-
74
- // If the file is older than expected, delete it. If the file is recent,
75
- // add it to the list of seen files, and if that list is too large, remove
76
- // the least recent file we've seen so far.
77
- if ( fileTimestamp < deletionCutoffTimestamp ) {
78
- toDelete = fullPath ;
79
- } else if ( this . _options . maxLogFileCount ) {
80
- leastRecentFileHeap . push ( { fullPath, fileTimestamp } ) ;
81
- if ( leastRecentFileHeap . size ( ) > this . _options . maxLogFileCount ) {
82
- toDelete = leastRecentFileHeap . pop ( ) ?. fullPath ;
59
+ try {
60
+ for await ( const dirent of dirHandle ) {
61
+ // Cap the overall time spent inside this function. Consider situations like
62
+ // a large number of machines using a shared network-mounted $HOME directory
63
+ // where lots and lots of log files end up and filesystem operations happen
64
+ // with network latency.
65
+ if ( Date . now ( ) - deletionStartTimestamp > maxDurationMs ) break ;
66
+
67
+ if ( ! dirent . isFile ( ) ) continue ;
68
+ const { id } =
69
+ / ^ (?< id > [ a - f 0 - 9 ] { 24 } ) _ l o g ( \. g z ) ? $ / i. exec ( dirent . name ) ?. groups ?? { } ;
70
+ if ( ! id ) continue ;
71
+ const fileTimestamp = + new ObjectId ( id ) . getTimestamp ( ) ;
72
+ const fullPath = path . join ( dir , dirent . name ) ;
73
+ let toDelete : string | undefined ;
74
+
75
+ // If the file is older than expected, delete it. If the file is recent,
76
+ // add it to the list of seen files, and if that list is too large, remove
77
+ // the least recent file we've seen so far.
78
+ if ( fileTimestamp < deletionCutoffTimestamp ) {
79
+ toDelete = fullPath ;
80
+ } else if ( this . _options . maxLogFileCount ) {
81
+ leastRecentFileHeap . push ( { fullPath, fileTimestamp } ) ;
82
+ if ( leastRecentFileHeap . size ( ) > this . _options . maxLogFileCount ) {
83
+ toDelete = leastRecentFileHeap . pop ( ) ?. fullPath ;
84
+ }
83
85
}
84
- }
85
-
86
- if ( ! toDelete ) continue ;
87
- try {
88
- await fs . unlink ( toDelete ) ;
89
- // eslint-disable-next-line @typescript-eslint/no-explicit- any
90
- } catch ( err : any ) {
91
- if ( err ?. code !== 'ENOENT' ) {
92
- // eslint-disable-next-line @typescript-eslint/no-unsafe-argument
93
- this . _options . onerror ( err , fullPath ) ;
86
+
87
+ if ( ! toDelete ) continue ;
88
+ try {
89
+ await fs . unlink ( toDelete ) ;
90
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
91
+ } catch ( err : any ) {
92
+ if ( err . code !== 'ENOENT' ) {
93
+ // eslint-disable-next-line @typescript-eslint/no-unsafe-argument
94
+ this . _options . onerror ( err , fullPath ) ;
95
+ }
94
96
}
95
97
}
98
+ } catch ( statErr : any ) {
99
+ // Multiple processes may attempt to clean up log files in parallel.
100
+ // A situation can arise where one process tries to read a file
101
+ // that another process has already unlinked (see MONGOSH-1914).
102
+ // To handle such scenarios, we will catch lstat errors and retry cleaning up
103
+ // to let different processes reach out to different log files.
104
+ if ( statErr . code === 'ENOENT' && remainingRetries > 0 ) {
105
+ await this . cleanupOldLogFiles ( maxDurationMs , remainingRetries - 1 ) ;
106
+ }
96
107
}
97
108
}
98
109
0 commit comments