Skip to content

Commit 7f867ac

Browse files for this commit
fix(mongodb-log-writer): issues with parallel mongosh executions MONGOSH-1914 (#506)
* fix(mongodb-log-writer): issues with parallel mongosh executions MONGOSH-1914 * test: clean up * feat: adjust maxDurationMs * test: remove extra sinon restore after rebase * Update packages/mongodb-log-writer/src/mongo-log-manager.ts Co-authored-by: Anna Henningsen <[email protected]> * refactor: compute deletionStartTimestamp and deletionCutoffTimestamp as the first thing --------- Co-authored-by: Anna Henningsen <[email protected]>
1 parent 65bf3a1 commit 7f867ac

File tree

2 files changed

+85
-51
lines changed

2 files changed

+85
-51
lines changed

packages/mongodb-log-writer/src/mongo-log-manager.spec.ts

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ describe('MongoLogManager', function () {
2626
);
2727
await fs.mkdir(directory, { recursive: true });
2828
});
29+
2930
afterEach(async function () {
3031
await fs.rmdir(directory, { recursive: true });
3132
sinon.restore();
@@ -568,4 +569,25 @@ describe('MongoLogManager', function () {
568569
writer.end();
569570
await once(writer, 'finish');
570571
});
572+
573+
it('retries cleaning up old log files', async function () {
574+
const fakeDirHandle = {
575+
[Symbol.asyncIterator]: () => {
576+
throw Object.assign(new Error('File not found'), { code: 'ENOENT' });
577+
},
578+
close: sinon.stub().resolves(),
579+
};
580+
const opendirStub = sinon.stub(fs, 'opendir').resolves(fakeDirHandle as any);
581+
582+
retentionDays = 0.000001; // 86.4 ms
583+
const manager = new MongoLogManager({
584+
directory,
585+
retentionDays,
586+
onwarn,
587+
onerror,
588+
});
589+
590+
await manager.cleanupOldLogFiles();
591+
expect(opendirStub).to.have.been.calledTwice;
592+
});
571593
});

packages/mongodb-log-writer/src/mongo-log-manager.ts

Lines changed: 63 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,12 @@ export class MongoLogManager {
6262
}
6363

6464
/** Clean up log files older than `retentionDays`. */
65-
async cleanupOldLogFiles(maxDurationMs = 5_000): Promise<void> {
65+
async cleanupOldLogFiles(maxDurationMs = 5_000, remainingRetries = 1): Promise<void> {
66+
const deletionStartTimestamp = Date.now();
67+
// Delete files older than N days
68+
const deletionCutoffTimestamp =
69+
deletionStartTimestamp - this._options.retentionDays * 86400 * 1000;
70+
6671
const dir = this._options.directory;
6772
let dirHandle;
6873
try {
@@ -71,10 +76,6 @@ export class MongoLogManager {
7176
return;
7277
}
7378

74-
const deletionStartTimestamp = Date.now();
75-
// Delete files older than N days
76-
const deletionCutoffTimestamp =
77-
deletionStartTimestamp - this._options.retentionDays * 86400 * 1000;
7879
// Store the known set of least recent files in a heap in order to be able to
7980
// delete all but the most recent N files.
8081
const leastRecentFileHeap = new Heap<{
@@ -85,55 +86,66 @@ export class MongoLogManager {
8586

8687
let usedStorageSize = this._options.retentionGB ? 0 : -Infinity;
8788

88-
for await (const dirent of dirHandle) {
89-
// Cap the overall time spent inside this function. Consider situations like
90-
// a large number of machines using a shared network-mounted $HOME directory
91-
// where lots and lots of log files end up and filesystem operations happen
92-
// with network latency.
93-
if (Date.now() - deletionStartTimestamp > maxDurationMs) break;
94-
95-
if (!dirent.isFile()) continue;
96-
const logRegExp = new RegExp(
97-
`^${this.prefix}(?<id>[a-f0-9]{24})_log(\\.gz)?$`,
98-
'i'
99-
);
100-
const { id } = logRegExp.exec(dirent.name)?.groups ?? {};
101-
if (!id) continue;
102-
103-
const fileTimestamp = +new ObjectId(id).getTimestamp();
104-
const fullPath = path.join(dir, dirent.name);
105-
106-
// If the file is older than expected, delete it. If the file is recent,
107-
// add it to the list of seen files, and if that list is too large, remove
108-
// the least recent file we've seen so far.
109-
if (fileTimestamp < deletionCutoffTimestamp) {
110-
await this.deleteFile(fullPath);
111-
continue;
112-
}
113-
114-
let fileSize: number | undefined;
115-
if (this._options.retentionGB) {
116-
try {
117-
fileSize = (await fs.stat(fullPath)).size;
118-
usedStorageSize += fileSize;
119-
} catch (err) {
120-
this._options.onerror(err as Error, fullPath);
89+
try {
90+
for await (const dirent of dirHandle) {
91+
// Cap the overall time spent inside this function. Consider situations like
92+
// a large number of machines using a shared network-mounted $HOME directory
93+
// where lots and lots of log files end up and filesystem operations happen
94+
// with network latency.
95+
if (Date.now() - deletionStartTimestamp > maxDurationMs) break;
96+
97+
if (!dirent.isFile()) continue;
98+
const logRegExp = new RegExp(
99+
`^${this.prefix}(?<id>[a-f0-9]{24})_log(\\.gz)?$`,
100+
'i'
101+
);
102+
const { id } = logRegExp.exec(dirent.name)?.groups ?? {};
103+
if (!id) continue;
104+
105+
const fileTimestamp = +new ObjectId(id).getTimestamp();
106+
const fullPath = path.join(dir, dirent.name);
107+
108+
// If the file is older than expected, delete it. If the file is recent,
109+
// add it to the list of seen files, and if that list is too large, remove
110+
// the least recent file we've seen so far.
111+
if (fileTimestamp < deletionCutoffTimestamp) {
112+
await this.deleteFile(fullPath);
121113
continue;
122114
}
115+
116+
let fileSize: number | undefined;
117+
if (this._options.retentionGB) {
118+
try {
119+
fileSize = (await fs.stat(fullPath)).size;
120+
usedStorageSize += fileSize;
121+
} catch (err) {
122+
this._options.onerror(err as Error, fullPath);
123+
continue;
124+
}
125+
}
126+
127+
if (this._options.maxLogFileCount || this._options.retentionGB) {
128+
leastRecentFileHeap.push({ fullPath, fileTimestamp, fileSize });
129+
}
130+
131+
if (
132+
this._options.maxLogFileCount &&
133+
leastRecentFileHeap.size() > this._options.maxLogFileCount
134+
) {
135+
const toDelete = leastRecentFileHeap.pop();
136+
if (!toDelete) continue;
137+
await this.deleteFile(toDelete.fullPath);
138+
usedStorageSize -= toDelete.fileSize ?? 0;
139+
}
123140
}
124-
125-
if (this._options.maxLogFileCount || this._options.retentionGB) {
126-
leastRecentFileHeap.push({ fullPath, fileTimestamp, fileSize });
127-
}
128-
129-
if (
130-
this._options.maxLogFileCount &&
131-
leastRecentFileHeap.size() > this._options.maxLogFileCount
132-
) {
133-
const toDelete = leastRecentFileHeap.pop();
134-
if (!toDelete) continue;
135-
await this.deleteFile(toDelete.fullPath);
136-
usedStorageSize -= toDelete.fileSize ?? 0;
141+
} catch (statErr: any) {
142+
// Multiple processes may attempt to clean up log files in parallel.
143+
// A situation can arise where one process tries to read a file
144+
// that another process has already unlinked (see MONGOSH-1914).
145+
// To handle such scenarios, we will catch lstat errors and retry cleaning up
146+
// to let different processes reach out to different log files.
147+
if (statErr.code === 'ENOENT' && remainingRetries > 0) {
148+
await this.cleanupOldLogFiles(maxDurationMs - (Date.now() - deletionStartTimestamp), remainingRetries - 1);
137149
}
138150
}
139151

0 commit comments

Comments (0)