Skip to content

Commit 2bb182c

Browse files
authored
KVM Host HA enhancement for StorPool storage (#8045)
Extends the current KVM Host HA functionality to the StorPool storage plugin and adds an option for easy integration of the remaining storage plugins that want to support Host HA. This extension works like the current NFS storage implementation. It can be used simultaneously with NFS and StorPool storage, or with StorPool primary storage only. If it is used with different primary storages, such as NFS and StorPool, and the health check fails for one of the storages, there is an option to report the failure to the management server through the global config kvm.ha.fence.on.storage.heartbeat.failure. By default this option is disabled; when it is enabled, the Host HA service will continue with the checks on the host and will eventually fence the host.
1 parent 0caf18b commit 2bb182c

File tree

39 files changed

+866
-244
lines changed

39 files changed

+866
-244
lines changed

api/src/main/java/com/cloud/agent/api/to/HostTO.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ public class HostTO {
2424
private NetworkTO publicNetwork;
2525
private NetworkTO storageNetwork1;
2626
private NetworkTO storageNetwork2;
27+
private String parent;
2728

2829
protected HostTO() {
2930
}
@@ -40,6 +41,9 @@ public HostTO(Host vo) {
4041
if (vo.getStorageIpAddressDeux() != null) {
4142
storageNetwork2 = new NetworkTO(vo.getStorageIpAddressDeux(), vo.getStorageNetmaskDeux(), vo.getStorageMacAddressDeux());
4243
}
44+
if (vo.getParent() != null) {
45+
parent = vo.getParent();
46+
}
4347
}
4448

4549
public String getGuid() {
@@ -81,4 +85,12 @@ public NetworkTO getStorageNetwork2() {
8185
public void setStorageNetwork2(NetworkTO storageNetwork2) {
8286
this.storageNetwork2 = storageNetwork2;
8387
}
88+
89+
public String getParent() {
90+
return parent;
91+
}
92+
93+
public void setParent(String parent) {
94+
this.parent = parent;
95+
}
8496
}

core/src/main/java/com/cloud/agent/api/CheckOnHostCommand.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
public class CheckOnHostCommand extends Command {
2626
HostTO host;
27+
boolean reportCheckFailureIfOneStorageIsDown;
2728

2829
protected CheckOnHostCommand() {
2930
}
@@ -33,10 +34,20 @@ public CheckOnHostCommand(Host host) {
3334
setWait(20);
3435
}
3536

37+
public CheckOnHostCommand(Host host, boolean reportCheckFailureIfOneStorageIsDown) {
38+
super();
39+
this.host = new HostTO(host);
40+
this.reportCheckFailureIfOneStorageIsDown = reportCheckFailureIfOneStorageIsDown;
41+
}
42+
3643
public HostTO getHost() {
3744
return host;
3845
}
3946

47+
public boolean isCheckFailedOnOneStorage() {
48+
return reportCheckFailureIfOneStorageIsDown;
49+
}
50+
4051
@Override
4152
public boolean executeInSequence() {
4253
return false;

core/src/main/java/com/cloud/agent/api/FenceCommand.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
package com.cloud.agent.api;
2121

22+
import com.cloud.agent.api.to.HostTO;
2223
import com.cloud.host.Host;
2324
import com.cloud.vm.VirtualMachine;
2425

@@ -32,13 +33,16 @@ public FenceCommand() {
3233
String hostGuid;
3334
String hostIp;
3435
boolean inSeq;
36+
HostTO host;
37+
boolean reportCheckFailureIfOneStorageIsDown;
3538

3639
public FenceCommand(VirtualMachine vm, Host host) {
3740
super();
3841
vmName = vm.getInstanceName();
3942
hostGuid = host.getGuid();
4043
hostIp = host.getPrivateIpAddress();
4144
inSeq = false;
45+
this.host = new HostTO(host);
4246
}
4347

4448
public void setSeq(boolean inseq) {
@@ -61,4 +65,16 @@ public String getHostIp() {
6165
public boolean executeInSequence() {
6266
return inSeq;
6367
}
68+
69+
public HostTO getHost() {
70+
return host;
71+
}
72+
73+
public boolean isReportCheckFailureIfOneStorageIsDown() {
74+
return reportCheckFailureIfOneStorageIsDown;
75+
}
76+
77+
public void setReportCheckFailureIfOneStorageIsDown(boolean reportCheckFailureIfOneStorageIsDown) {
78+
this.reportCheckFailureIfOneStorageIsDown = reportCheckFailureIfOneStorageIsDown;
79+
}
6480
}

engine/api/src/main/java/org/apache/cloudstack/engine/subsystem/api/storage/PrimaryDataStoreDriver.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323

2424
import com.cloud.host.Host;
2525
import com.cloud.storage.StoragePool;
26+
import com.cloud.storage.Volume;
27+
import com.cloud.storage.Storage.StoragePoolType;
2628
import com.cloud.utils.Pair;
2729

2830
public interface PrimaryDataStoreDriver extends DataStoreDriver {
@@ -132,4 +134,8 @@ enum QualityOfServiceState { MIGRATION, NO_MIGRATION }
132134
* @param tagValue The value of the VM's tag
133135
*/
134136
void provideVmTags(long vmId, long volumeId, String tagValue);
137+
138+
boolean isStorageSupportHA(StoragePoolType type);
139+
140+
void detachVolumeFromAllStorageNodes(Volume volume);
135141
}

engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,9 @@ public interface HighAvailabilityManager extends Manager {
7272
+ " which are registered for the HA event that were successful and are now ready to be purged.",
7373
true, Cluster);
7474

75+
public static final ConfigKey<Boolean> KvmHAFenceHostIfHeartbeatFailsOnStorage = new ConfigKey<>("Advanced", Boolean.class, "kvm.ha.fence.on.storage.heartbeat.failure", "false",
76+
"Proceed fencing the host even the heartbeat failed for only one storage pool", false, ConfigKey.Scope.Zone);
77+
7578
public enum WorkType {
7679
Migration, // Migrating VMs off of a host.
7780
Stop, // Stops a VM for storage pool migration purposes. This should be obsolete now.

plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java

Lines changed: 29 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,12 @@
2727
import com.cloud.host.dao.HostDao;
2828
import com.cloud.hypervisor.Hypervisor;
2929
import com.cloud.resource.ResourceManager;
30-
import com.cloud.storage.Storage.StoragePoolType;
3130
import com.cloud.utils.component.AdapterBase;
31+
32+
import org.apache.cloudstack.engine.subsystem.api.storage.DataStoreDriver;
33+
import org.apache.cloudstack.engine.subsystem.api.storage.DataStoreProvider;
34+
import org.apache.cloudstack.engine.subsystem.api.storage.DataStoreProviderManager;
35+
import org.apache.cloudstack.engine.subsystem.api.storage.PrimaryDataStoreDriver;
3236
import org.apache.cloudstack.ha.HAManager;
3337
import org.apache.cloudstack.storage.datastore.db.PrimaryDataStoreDao;
3438
import org.apache.cloudstack.storage.datastore.db.StoragePoolVO;
@@ -49,6 +53,8 @@ public class KVMInvestigator extends AdapterBase implements Investigator {
4953
private PrimaryDataStoreDao _storagePoolDao;
5054
@Inject
5155
private HAManager haManager;
56+
@Inject
57+
private DataStoreProviderManager dataStoreProviderMgr;
5258

5359
@Override
5460
public boolean isVmAlive(com.cloud.vm.VirtualMachine vm, Host host) throws UnknownVM {
@@ -78,31 +84,21 @@ public Status isAgentAlive(Host agent) {
7884
}
7985

8086
List<StoragePoolVO> clusterPools = _storagePoolDao.listPoolsByCluster(agent.getClusterId());
81-
boolean hasNfs = false;
82-
for (StoragePoolVO pool : clusterPools) {
83-
if (pool.getPoolType() == StoragePoolType.NetworkFilesystem) {
84-
hasNfs = true;
85-
break;
86-
}
87-
}
88-
if (!hasNfs) {
87+
boolean storageSupportHA = storageSupportHa(clusterPools);
88+
if (!storageSupportHA) {
8989
List<StoragePoolVO> zonePools = _storagePoolDao.findZoneWideStoragePoolsByHypervisor(agent.getDataCenterId(), agent.getHypervisorType());
90-
for (StoragePoolVO pool : zonePools) {
91-
if (pool.getPoolType() == StoragePoolType.NetworkFilesystem) {
92-
hasNfs = true;
93-
break;
94-
}
95-
}
90+
storageSupportHA = storageSupportHa(zonePools);
9691
}
97-
if (!hasNfs) {
92+
if (!storageSupportHA) {
9893
s_logger.warn(
9994
"Agent investigation was requested on host " + agent + ", but host does not support investigation because it has no NFS storage. Skipping investigation.");
10095
return Status.Disconnected;
10196
}
10297

10398
Status hostStatus = null;
10499
Status neighbourStatus = null;
105-
CheckOnHostCommand cmd = new CheckOnHostCommand(agent);
100+
boolean reportFailureIfOneStorageIsDown = HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.value();
101+
CheckOnHostCommand cmd = new CheckOnHostCommand(agent, reportFailureIfOneStorageIsDown);
106102

107103
try {
108104
Answer answer = _agentMgr.easySend(agent.getId(), cmd);
@@ -145,4 +141,20 @@ public Status isAgentAlive(Host agent) {
145141
s_logger.debug("HA: HOST is ineligible legacy state " + hostStatus + " for host " + agent.getId());
146142
return hostStatus;
147143
}
144+
145+
private boolean storageSupportHa(List<StoragePoolVO> pools) {
146+
boolean storageSupportHA = false;
147+
for (StoragePoolVO pool : pools) {
148+
DataStoreProvider storeProvider = dataStoreProviderMgr.getDataStoreProvider(pool.getStorageProviderName());
149+
DataStoreDriver storeDriver = storeProvider.getDataStoreDriver();
150+
if (storeDriver instanceof PrimaryDataStoreDriver) {
151+
PrimaryDataStoreDriver primaryStoreDriver = (PrimaryDataStoreDriver)storeDriver;
152+
if (primaryStoreDriver.isStorageSupportHA(pool.getPoolType())) {
153+
storageSupportHA = true;
154+
break;
155+
}
156+
}
157+
}
158+
return storageSupportHA;
159+
}
148160
}

plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHABase.java

Lines changed: 68 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import org.libvirt.StoragePoolInfo;
2525
import org.libvirt.StoragePoolInfo.StoragePoolState;
2626

27+
import com.cloud.hypervisor.kvm.storage.KVMStoragePool;
2728
import com.cloud.utils.script.OutputInterpreter;
2829
import com.cloud.utils.script.OutputInterpreter.AllLinesParser;
2930
import com.cloud.utils.script.Script;
@@ -41,26 +42,76 @@ public static enum PoolType {
4142
PrimaryStorage, SecondaryStorage
4243
}
4344

44-
public static class NfsStoragePool {
45-
String _poolUUID;
46-
String _poolIp;
47-
String _poolMountSourcePath;
48-
String _mountDestPath;
49-
PoolType _type;
45+
public static class HAStoragePool {
46+
String poolUuid;
47+
String poolIp;
48+
String poolMountSourcePath;
49+
String mountDestPath;
50+
PoolType poolType;
51+
KVMStoragePool pool;
52+
53+
public HAStoragePool(KVMStoragePool pool, String host, String path, PoolType type) {
54+
this.pool = pool;
55+
this.poolUuid = pool.getUuid();
56+
this.mountDestPath = pool.getLocalPath();
57+
this.poolIp = host;
58+
this.poolMountSourcePath = path;
59+
this.poolType = type;
60+
}
61+
62+
public String getPoolUUID() {
63+
return poolUuid;
64+
}
65+
66+
public void setPoolUUID(String poolUuid) {
67+
this.poolUuid = poolUuid;
68+
}
69+
70+
public String getPoolIp() {
71+
return poolIp;
72+
}
73+
74+
public void setPoolIp(String poolIp) {
75+
this.poolIp = poolIp;
76+
}
77+
78+
public String getPoolMountSourcePath() {
79+
return poolMountSourcePath;
80+
}
81+
82+
public void setPoolMountSourcePath(String poolMountSourcePath) {
83+
this.poolMountSourcePath = poolMountSourcePath;
84+
}
85+
86+
public String getMountDestPath() {
87+
return mountDestPath;
88+
}
89+
90+
public void setMountDestPath(String mountDestPath) {
91+
this.mountDestPath = mountDestPath;
92+
}
93+
94+
public PoolType getType() {
95+
return poolType;
96+
}
97+
98+
public void setType(PoolType type) {
99+
this.poolType = type;
100+
}
101+
102+
public KVMStoragePool getPool() {
103+
return pool;
104+
}
50105

51-
public NfsStoragePool(String poolUUID, String poolIp, String poolSourcePath, String mountDestPath, PoolType type) {
52-
_poolUUID = poolUUID;
53-
_poolIp = poolIp;
54-
_poolMountSourcePath = poolSourcePath;
55-
_mountDestPath = mountDestPath;
56-
_type = type;
106+
public void setPool(KVMStoragePool pool) {
107+
this.pool = pool;
57108
}
58109
}
59110

60-
protected String checkingMountPoint(NfsStoragePool pool, String poolName) {
61-
String mountSource = pool._poolIp + ":" + pool._poolMountSourcePath;
111+
protected String checkingMountPoint(HAStoragePool pool, String poolName) {
112+
String mountSource = pool.getPoolIp() + ":" + pool.getPoolMountSourcePath();
62113
String mountPaths = Script.runSimpleBashScript("cat /proc/mounts | grep " + mountSource);
63-
String destPath = pool._mountDestPath;
114+
String destPath = pool.getMountDestPath();
64115

65116
if (mountPaths != null) {
66117
String token[] = mountPaths.split(" ");
@@ -100,12 +151,12 @@ protected String checkingMountPoint(NfsStoragePool pool, String poolName) {
100151
return destPath;
101152
}
102153

103-
protected String getMountPoint(NfsStoragePool storagePool) {
154+
protected String getMountPoint(HAStoragePool storagePool) {
104155

105156
StoragePool pool = null;
106157
String poolName = null;
107158
try {
108-
pool = LibvirtConnection.getConnection().storagePoolLookupByUUIDString(storagePool._poolUUID);
159+
pool = LibvirtConnection.getConnection().storagePoolLookupByUUIDString(storagePool.getPoolUUID());
109160
if (pool != null) {
110161
StoragePoolInfo spi = pool.getInfo();
111162
if (spi.state != StoragePoolState.VIR_STORAGE_POOL_RUNNING) {

plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAChecker.java

Lines changed: 13 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -22,18 +22,18 @@
2222

2323
import org.apache.log4j.Logger;
2424

25-
import com.cloud.utils.script.OutputInterpreter;
26-
import com.cloud.utils.script.Script;
25+
import com.cloud.agent.api.to.HostTO;
2726

2827
public class KVMHAChecker extends KVMHABase implements Callable<Boolean> {
2928
private static final Logger s_logger = Logger.getLogger(KVMHAChecker.class);
30-
private List<NfsStoragePool> nfsStoragePools;
31-
private String hostIp;
32-
private long heartBeatCheckerTimeout = 360000; // 6 minutes
29+
private List<HAStoragePool> storagePools;
30+
private HostTO host;
31+
private boolean reportFailureIfOneStorageIsDown;
3332

34-
public KVMHAChecker(List<NfsStoragePool> pools, String host) {
35-
this.nfsStoragePools = pools;
36-
this.hostIp = host;
33+
public KVMHAChecker(List<HAStoragePool> pools, HostTO host, boolean reportFailureIfOneStorageIsDown) {
34+
this.storagePools = pools;
35+
this.host = host;
36+
this.reportFailureIfOneStorageIsDown = reportFailureIfOneStorageIsDown;
3737
}
3838

3939
/*
@@ -44,30 +44,14 @@ public KVMHAChecker(List<NfsStoragePool> pools, String host) {
4444
public Boolean checkingHeartBeat() {
4545
boolean validResult = false;
4646

47-
String hostAndPools = String.format("host IP [%s] in pools [%s]", hostIp, nfsStoragePools.stream().map(pool -> pool._poolIp).collect(Collectors.joining(", ")));
47+
String hostAndPools = String.format("host IP [%s] in pools [%s]", host.getPrivateNetwork().getIp(), storagePools.stream().map(pool -> pool.getPoolUUID()).collect(Collectors.joining(", ")));
4848

4949
s_logger.debug(String.format("Checking heart beat with KVMHAChecker for %s", hostAndPools));
5050

51-
for (NfsStoragePool pool : nfsStoragePools) {
52-
Script cmd = new Script(s_heartBeatPath, heartBeatCheckerTimeout, s_logger);
53-
cmd.add("-i", pool._poolIp);
54-
cmd.add("-p", pool._poolMountSourcePath);
55-
cmd.add("-m", pool._mountDestPath);
56-
cmd.add("-h", hostIp);
57-
cmd.add("-r");
58-
cmd.add("-t", String.valueOf(_heartBeatUpdateFreq / 1000));
59-
OutputInterpreter.OneLineParser parser = new OutputInterpreter.OneLineParser();
60-
String result = cmd.execute(parser);
61-
String parsedLine = parser.getLine();
62-
63-
s_logger.debug(String.format("Checking heart beat with KVMHAChecker [{command=\"%s\", result: \"%s\", log: \"%s\", pool: \"%s\"}].", cmd.toString(), result, parsedLine,
64-
pool._poolIp));
65-
66-
if (result == null && parsedLine.contains("DEAD")) {
67-
s_logger.warn(String.format("Checking heart beat with KVMHAChecker command [%s] returned [%s]. [%s]. It may cause a shutdown of host IP [%s].", cmd.toString(),
68-
result, parsedLine, hostIp));
69-
} else {
70-
validResult = true;
51+
for (HAStoragePool pool : storagePools) {
52+
validResult = pool.getPool().checkingHeartBeat(pool, host);
53+
if (reportFailureIfOneStorageIsDown && !validResult) {
54+
break;
7155
}
7256
}
7357

0 commit comments

Comments
 (0)