Skip to content

Commit e064de7

Browse files
authored
fix: recover VM availability after reconnect (#1947)
## Summary - make VM operations re-establish libvirt on demand instead of trusting a stale availability flag - retry `getDomains()` once when the libvirt connection drops mid-request - add unit coverage for stale availability state and dropped libvirt connections ## Root cause The VM service cached availability in `isVmsAvailable` and only updated that state from bootstrap and PID-file watcher events. During reconnect/bootstrap races, that flag could stay false even after libvirt was healthy again, so the dashboard surfaced `Failed to retrieve VM domains: VMs are not available` instead of self-healing. ## Testing - `pnpm exec vitest run src/unraid-api/graph/resolvers/vms/vms.service.unit.spec.ts src/unraid-api/graph/resolvers/vms/vms.resolver.spec.ts` - `pnpm exec tsc --noEmit -p tsconfig.json` <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **Bug Fixes** * VM operations now recover more reliably from hypervisor connection errors (automatic single retry and consistent cleanup), improving stability during unexpected disconnects and shutdowns. * **Refactor** * Serialized hypervisor initialization and centralized access to prevent duplicate startups and ensure consistent cleanup across VM workflows. * **Tests** * Added comprehensive unit tests covering connection recovery, concurrent initialization, shutdown retrying, and cleanup scenarios. <!-- end of auto-generated comment: release notes by coderabbit.ai -->
1 parent d3e0b95 commit e064de7

File tree

2 files changed

+380
-85
lines changed

2 files changed

+380
-85
lines changed

api/src/unraid-api/graph/resolvers/vms/vms.service.ts

Lines changed: 133 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import { VmDomain, VmState } from '@app/unraid-api/graph/resolvers/vms/vms.model
1515
export class VmsService implements OnApplicationBootstrap, OnModuleDestroy {
1616
private readonly logger = new Logger(VmsService.name);
1717
private hypervisor: InstanceType<typeof HypervisorClass> | null = null;
18+
private hypervisorInitialization: Promise<InstanceType<typeof HypervisorClass>> | null = null;
1819
private isVmsAvailable: boolean = false;
1920
private watcher: FSWatcher | null = null;
2021
private uri: string;
@@ -62,25 +63,16 @@ export class VmsService implements OnApplicationBootstrap, OnModuleDestroy {
6263
async onModuleDestroy() {
6364
this.logger.debug('Closing file watcher...');
6465
await this.watcher?.close();
65-
this.logger.debug('Closing hypervisor connection...');
66-
try {
67-
await this.hypervisor?.connectClose();
68-
} catch (error) {
69-
this.logger.warn(`Error closing hypervisor connection: ${(error as Error).message}`);
70-
}
71-
this.hypervisor = null;
72-
this.isVmsAvailable = false;
66+
await this.resetHypervisorConnection();
7367
this.logger.debug('VMs service cleanup complete.');
7468
}
7569

7670
private async attemptHypervisorInitializationAndWatch(): Promise<void> {
7771
try {
78-
await this.initializeHypervisor();
79-
this.isVmsAvailable = true;
72+
await this.initializeHypervisorOnce();
8073
this.logger.debug(`VMs service initialized successfully with URI: ${this.uri}`);
8174
await this.setupWatcher();
8275
} catch (error) {
83-
this.isVmsAvailable = false;
8476
this.logger.warn(
8577
`Initial hypervisor connection failed: ${error instanceof Error ? error.message : 'Unknown error'}. Setting up watcher.`
8678
);
@@ -107,13 +99,11 @@ export class VmsService implements OnApplicationBootstrap, OnModuleDestroy {
10799
`Libvirt PID file detected at ${this.pidPath}. Attempting connection...`
108100
);
109101
try {
110-
await this.initializeHypervisor();
111-
this.isVmsAvailable = true;
102+
await this.initializeHypervisorOnce();
112103
this.logger.log(
113104
'Hypervisor connection established successfully after PID file detection.'
114105
);
115106
} catch (error) {
116-
this.isVmsAvailable = false;
117107
this.logger.error(
118108
`Failed to initialize hypervisor after PID file detection: ${error instanceof Error ? error.message : 'Unknown error'}`
119109
);
@@ -123,18 +113,8 @@ export class VmsService implements OnApplicationBootstrap, OnModuleDestroy {
123113
this.logger.warn(
124114
`Libvirt PID file removed from ${this.pidPath}. Hypervisor likely stopped.`
125115
);
126-
this.isVmsAvailable = false;
127-
try {
128-
if (this.hypervisor) {
129-
await this.hypervisor.connectClose();
130-
this.logger.debug('Hypervisor connection closed due to PID file removal.');
131-
}
132-
} catch (closeError) {
133-
this.logger.error(
134-
`Error closing hypervisor connection after PID unlink: ${closeError instanceof Error ? closeError.message : 'Unknown error'}`
135-
);
136-
}
137-
this.hypervisor = null;
116+
await this.resetHypervisorConnection();
117+
this.logger.debug('Hypervisor connection closed due to PID file removal.');
138118
})
139119
.on('error', (error: unknown) => {
140120
this.logger.error(
@@ -143,10 +123,10 @@ export class VmsService implements OnApplicationBootstrap, OnModuleDestroy {
143123
});
144124
}
145125

146-
private async initializeHypervisor(): Promise<void> {
126+
private async initializeHypervisor(): Promise<InstanceType<typeof HypervisorClass>> {
147127
if (this.hypervisor && this.isVmsAvailable) {
148128
this.logger.debug('Hypervisor connection assumed active based on availability flag.');
149-
return;
129+
return this.hypervisor;
150130
}
151131

152132
this.logger.debug('Checking if libvirt process is running via PID file...');
@@ -174,16 +154,77 @@ export class VmsService implements OnApplicationBootstrap, OnModuleDestroy {
174154
if (!this.hypervisor) {
175155
throw new Error('Failed to connect to hypervisor');
176156
}
157+
158+
return this.hypervisor;
177159
}
178160

179-
public async setVmState(uuid: string, targetState: VmState): Promise<boolean> {
180-
if (!this.isVmsAvailable || !this.hypervisor) {
161+
private initializeHypervisorOnce(): Promise<InstanceType<typeof HypervisorClass>> {
162+
if (this.hypervisor && this.isVmsAvailable) {
163+
return Promise.resolve(this.hypervisor);
164+
}
165+
166+
if (this.hypervisorInitialization) {
167+
this.logger.debug('Waiting for in-flight hypervisor initialization to finish.');
168+
return this.hypervisorInitialization;
169+
}
170+
171+
this.hypervisorInitialization = (async () => {
172+
try {
173+
const hypervisor = await this.initializeHypervisor();
174+
this.isVmsAvailable = true;
175+
return hypervisor;
176+
} catch (error) {
177+
await this.resetHypervisorConnection();
178+
throw error;
179+
} finally {
180+
this.hypervisorInitialization = null;
181+
}
182+
})();
183+
184+
return this.hypervisorInitialization;
185+
}
186+
187+
private async resetHypervisorConnection(): Promise<void> {
188+
const hypervisor = this.hypervisor;
189+
this.hypervisor = null;
190+
this.isVmsAvailable = false;
191+
192+
this.logger.debug('Closing hypervisor connection...');
193+
try {
194+
await hypervisor?.connectClose();
195+
} catch (error) {
196+
this.logger.warn(`Error closing hypervisor connection: ${(error as Error).message}`);
197+
}
198+
}
199+
200+
private async ensureHypervisorAvailable(): Promise<InstanceType<typeof HypervisorClass>> {
201+
if (this.isVmsAvailable && this.hypervisor) {
202+
return this.hypervisor;
203+
}
204+
205+
try {
206+
return await this.initializeHypervisorOnce();
207+
} catch (error) {
181208
throw new GraphQLError('VMs are not available');
182209
}
210+
}
211+
212+
private isConnectionError(error: unknown): error is Error {
213+
if (!(error instanceof Error)) {
214+
return false;
215+
}
216+
217+
return /virConnect|libvirt|socket is closed|connection (is|was) closed|not connected/i.test(
218+
error.message
219+
);
220+
}
221+
222+
public async setVmState(uuid: string, targetState: VmState): Promise<boolean> {
223+
const hypervisor = await this.ensureHypervisorAvailable();
183224

184225
try {
185226
this.logger.debug(`Looking up domain with UUID: ${uuid}`);
186-
const domain = await this.hypervisor.domainLookupByUUIDString(uuid);
227+
const domain = await hypervisor.domainLookupByUUIDString(uuid);
187228
this.logger.debug(`Found domain, getting info...`);
188229
const info = await domain.getInfo();
189230
this.logger.debug(`Current domain state: ${info.state}`);
@@ -209,7 +250,7 @@ export class VmsService implements OnApplicationBootstrap, OnModuleDestroy {
209250
this.logger.debug(`Initiating graceful shutdown for domain...`);
210251
await domain.shutdown();
211252

212-
const shutdownSuccess = await this.waitForDomainShutdown(domain);
253+
const shutdownSuccess = await this.waitForDomainShutdown(domain, hypervisor);
213254
if (!shutdownSuccess) {
214255
this.logger.debug('Graceful shutdown failed, forcing domain stop...');
215256
await domain.destroy();
@@ -287,13 +328,11 @@ export class VmsService implements OnApplicationBootstrap, OnModuleDestroy {
287328
}
288329

289330
public async forceStopVm(uuid: string): Promise<boolean> {
290-
if (!this.isVmsAvailable || !this.hypervisor) {
291-
throw new GraphQLError('VMs are not available');
292-
}
331+
const hypervisor = await this.ensureHypervisorAvailable();
293332

294333
try {
295334
this.logger.debug(`Looking up domain with UUID: ${uuid}`);
296-
const domain = await this.hypervisor.domainLookupByUUIDString(uuid);
335+
const domain = await hypervisor.domainLookupByUUIDString(uuid);
297336
this.logger.debug(`Found domain, force stopping...`);
298337

299338
await domain.destroy();
@@ -306,18 +345,16 @@ export class VmsService implements OnApplicationBootstrap, OnModuleDestroy {
306345
}
307346

308347
public async rebootVm(uuid: string): Promise<boolean> {
309-
if (!this.isVmsAvailable || !this.hypervisor) {
310-
throw new GraphQLError('VMs are not available');
311-
}
348+
const hypervisor = await this.ensureHypervisorAvailable();
312349

313350
try {
314351
this.logger.debug(`Looking up domain with UUID: ${uuid}`);
315-
const domain = await this.hypervisor.domainLookupByUUIDString(uuid);
352+
const domain = await hypervisor.domainLookupByUUIDString(uuid);
316353
this.logger.debug(`Found domain, rebooting...`);
317354

318355
await domain.shutdown();
319356

320-
const shutdownSuccess = await this.waitForDomainShutdown(domain);
357+
const shutdownSuccess = await this.waitForDomainShutdown(domain, hypervisor);
321358
if (!shutdownSuccess) {
322359
throw new Error('Graceful shutdown failed, please force stop the VM and try again');
323360
}
@@ -332,13 +369,11 @@ export class VmsService implements OnApplicationBootstrap, OnModuleDestroy {
332369
}
333370

334371
public async resetVm(uuid: string): Promise<boolean> {
335-
if (!this.isVmsAvailable || !this.hypervisor) {
336-
throw new GraphQLError('VMs are not available');
337-
}
372+
const hypervisor = await this.ensureHypervisorAvailable();
338373

339374
try {
340375
this.logger.debug(`Looking up domain with UUID: ${uuid}`);
341-
const domain = await this.hypervisor.domainLookupByUUIDString(uuid);
376+
const domain = await hypervisor.domainLookupByUUIDString(uuid);
342377
this.logger.debug(`Found domain, resetting...`);
343378

344379
await domain.destroy();
@@ -353,61 +388,74 @@ export class VmsService implements OnApplicationBootstrap, OnModuleDestroy {
353388
}
354389

355390
public async getDomains(): Promise<Array<VmDomain>> {
356-
if (!this.isVmsAvailable) {
357-
throw new GraphQLError('VMs are not available');
358-
}
359-
if (!this.hypervisor) {
360-
throw new GraphQLError('Libvirt is not initialized');
361-
}
362-
391+
let hypervisor = await this.ensureHypervisorAvailable();
363392
try {
364-
const hypervisor = this.hypervisor;
365-
this.logger.debug('Getting all domains...');
366-
const domains = await hypervisor.connectListAllDomains(
367-
ConnectListAllDomainsFlags.ACTIVE | ConnectListAllDomainsFlags.INACTIVE
368-
);
369-
this.logger.debug(`Found ${domains.length} domains`);
370-
371-
const resolvedDomains: Array<VmDomain> = await Promise.all(
372-
domains.map(async (domain) => {
373-
const info = await domain.getInfo();
374-
const name = await domain.getName();
375-
const uuid = await domain.getUUIDString();
376-
const state = this.mapDomainStateToVmState(info.state);
377-
378-
return {
379-
id: uuid,
380-
uuid,
381-
name,
382-
state,
383-
};
384-
})
385-
);
386-
387-
return resolvedDomains;
393+
return await this.listDomains(hypervisor);
388394
} catch (error: unknown) {
389-
if (error instanceof Error && error.message.includes('virConnectListAllDomains')) {
395+
let finalError = error;
396+
397+
if (this.isConnectionError(finalError)) {
398+
this.logger.warn(
399+
`VM domain lookup lost its libvirt connection: ${finalError.message}. Resetting and retrying once.`
400+
);
401+
await this.resetHypervisorConnection();
402+
hypervisor = await this.ensureHypervisorAvailable();
403+
404+
try {
405+
return await this.listDomains(hypervisor);
406+
} catch (retryError) {
407+
finalError = retryError;
408+
}
409+
}
410+
411+
if (finalError instanceof Error && finalError.message.includes('virConnectListAllDomains')) {
390412
this.logger.error(
391-
`Failed to list domains, possibly due to connection issue: ${error.message}`
413+
`Failed to list domains, possibly due to connection issue: ${finalError.message}`
392414
);
393415
} else {
394416
this.logger.error(
395-
`Failed to get domains: ${error instanceof Error ? error.message : 'Unknown error'}`
417+
`Failed to get domains: ${finalError instanceof Error ? finalError.message : 'Unknown error'}`
396418
);
397419
}
398420
throw new GraphQLError(
399-
`Failed to get domains: ${error instanceof Error ? error.message : 'Unknown error'}`
421+
`Failed to get domains: ${finalError instanceof Error ? finalError.message : 'Unknown error'}`
400422
);
401423
}
402424
}
403425

404-
private async waitForDomainShutdown(domain: Domain, maxRetries: number = 10): Promise<boolean> {
405-
if (!this.hypervisor) {
406-
throw new Error('Hypervisor is not initialized');
407-
}
426+
private async listDomains(
427+
hypervisor: InstanceType<typeof HypervisorClass>
428+
): Promise<Array<VmDomain>> {
429+
this.logger.debug('Getting all domains...');
430+
const domains = await hypervisor.connectListAllDomains(
431+
ConnectListAllDomainsFlags.ACTIVE | ConnectListAllDomainsFlags.INACTIVE
432+
);
433+
this.logger.debug(`Found ${domains.length} domains`);
434+
435+
return Promise.all(
436+
domains.map(async (domain) => {
437+
const info = await domain.getInfo();
438+
const name = await domain.getName();
439+
const uuid = await domain.getUUIDString();
440+
const state = this.mapDomainStateToVmState(info.state);
441+
442+
return {
443+
id: uuid,
444+
uuid,
445+
name,
446+
state,
447+
};
448+
})
449+
);
450+
}
408451

452+
private async waitForDomainShutdown(
453+
domain: Domain,
454+
hypervisor: InstanceType<typeof HypervisorClass>,
455+
maxRetries: number = 10
456+
): Promise<boolean> {
409457
for (let i = 0; i < maxRetries; i++) {
410-
const currentInfo = await this.hypervisor.domainGetInfo(domain);
458+
const currentInfo = await hypervisor.domainGetInfo(domain);
411459
if (currentInfo.state === DomainState.SHUTOFF) {
412460
this.logger.debug('Domain shutdown completed successfully');
413461
return true;

0 commit comments

Comments
 (0)