From 91b8ed8709e46b15d4f23d22c18eb61a50470dfe Mon Sep 17 00:00:00 2001 From: Suresh Kumar Anaparti Date: Mon, 8 Jun 2026 17:21:03 +0530 Subject: [PATCH 1/5] KVM HA: Fix CheckOnHostAnswer success flag when there is no heartbeat --- .../kvm/resource/wrapper/LibvirtCheckOnHostCommandWrapper.java | 2 +- .../wrapper/xenbase/CitrixCheckOnHostCommandWrapper.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtCheckOnHostCommandWrapper.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtCheckOnHostCommandWrapper.java index f901fd97ca76..c069416cca54 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtCheckOnHostCommandWrapper.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtCheckOnHostCommandWrapper.java @@ -55,7 +55,7 @@ public Answer execute(final CheckOnHostCommand command, final LibvirtComputingRe if (hasHeartBeat) { return new CheckOnHostAnswer(command, true, "Heart is beating"); } else { - return new CheckOnHostAnswer(command, "Heart is not beating"); + return new CheckOnHostAnswer(command, false, "Heart is not beating"); } } catch (final InterruptedException e) { return new CheckOnHostAnswer(command, "CheckOnHostCommand: can't get status of host: InterruptedException"); diff --git a/plugins/hypervisors/xenserver/src/main/java/com/cloud/hypervisor/xenserver/resource/wrapper/xenbase/CitrixCheckOnHostCommandWrapper.java b/plugins/hypervisors/xenserver/src/main/java/com/cloud/hypervisor/xenserver/resource/wrapper/xenbase/CitrixCheckOnHostCommandWrapper.java index 5535b7e84c87..9a675c33e74a 100644 --- a/plugins/hypervisors/xenserver/src/main/java/com/cloud/hypervisor/xenserver/resource/wrapper/xenbase/CitrixCheckOnHostCommandWrapper.java +++ b/plugins/hypervisors/xenserver/src/main/java/com/cloud/hypervisor/xenserver/resource/wrapper/xenbase/CitrixCheckOnHostCommandWrapper.java @@ -31,6 +31,6 @@ public final class CitrixCheckOnHostCommandWrapper extends CommandWrapper Date: Mon, 8 Jun 2026 17:32:23 +0530 Subject: [PATCH 2/5] add log --- .../org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java | 1 + 1 file changed, 1 insertion(+) diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java index af7441c4fd29..96fd42d4d544 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java @@ -109,6 +109,7 @@ public Status getHostAgentStatus(Host host) { } Status hostStatusFromNeighbour = checkHostStatusWithNeighbourHosts(host); + logger.debug("{} status reported from itself: {} and neighbor: {}", host.toString(), hostStatusFromItself, hostStatusFromNeighbour); Status hostStatus = hostStatusFromItself; if (hostStatusFromNeighbour == Status.Up && (hostStatusFromItself == Status.Disconnected || hostStatusFromItself == Status.Down)) { hostStatus = Status.Disconnected; From 82be657ac1c8c18f51d8a557f5e8c1777b252c2c Mon Sep 17 00:00:00 2001 From: Suresh Kumar Anaparti Date: Mon, 8 Jun 2026 17:44:08 +0530 Subject: [PATCH 3/5] typo --- .../com/cloud/hypervisor/vmware/resource/VmwareResource.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/hypervisors/vmware/src/main/java/com/cloud/hypervisor/vmware/resource/VmwareResource.java b/plugins/hypervisors/vmware/src/main/java/com/cloud/hypervisor/vmware/resource/VmwareResource.java index 831e222200ab..7b9f258920c5 100644 --- a/plugins/hypervisors/vmware/src/main/java/com/cloud/hypervisor/vmware/resource/VmwareResource.java +++ b/plugins/hypervisors/vmware/src/main/java/com/cloud/hypervisor/vmware/resource/VmwareResource.java @@ -5807,7 +5807,7 @@ protected Answer execute(PingTestCommand cmd) { } protected Answer execute(CheckOnHostCommand cmd) { - return new CheckOnHostAnswer(cmd, null, "Not Implmeneted"); + return new CheckOnHostAnswer(cmd, null, "Not Implemented"); } protected Answer execute(ModifySshKeysCommand cmd) { From b2caf7e17ece3670d45e602aaf608b1821a582d0 Mon Sep 17 00:00:00 2001 From: Suresh Kumar Anaparti Date: Mon, 8 Jun 2026 18:38:28 +0530 Subject: [PATCH 4/5] unit test update --- .../kvm/resource/LibvirtComputingResourceTest.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResourceTest.java b/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResourceTest.java index 662b09a30448..aaebd8c7aebc 100644 --- a/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResourceTest.java +++ b/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResourceTest.java @@ -61,6 +61,7 @@ import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; +import com.cloud.agent.api.CheckOnHostAnswer; import com.google.gson.JsonObject; import com.google.gson.JsonParser; @@ -3130,7 +3131,9 @@ public void testCheckOnHostCommand() { assertNotNull(wrapper); final Answer answer = wrapper.execute(command, libvirtComputingResourceMock); - assertFalse(answer.getResult()); + assertTrue(answer.getResult()); + assertTrue(answer instanceof CheckOnHostAnswer); + assertFalse(((CheckOnHostAnswer)answer).isAlive()); verify(libvirtComputingResourceMock, times(1)).getMonitor(); } From 1613b4175feb012526d5f91f5ecec1a10275735e Mon Sep 17 00:00:00 2001 From: Andrija Panic <45762285+andrijapanicsb@users.noreply.github.com> Date: Mon, 8 Jun 2026 18:19:03 +0200 Subject: [PATCH 5/5] KVM HA: fence by confirming host power state, not the power-off result KVMHAProvider.fence() declared a host fenced only when the out-of-band power-off command reported success. Against an already-off chassis the BMC rejects the power-off (e.g. Redfish returns HTTP 409), so fence() failed and the host stayed stuck in the Fencing HA state, which maps to Disconnected (not Down). VM-HA therefore never restarted the VMs until the dead host was powered back on. Fencing now succeeds based on the actual chassis power state: - if the host is already powered off (OOBM STATUS == Off), treat it as fenced; - otherwise issue a best-effort power-off and confirm via OOBM STATUS; - only a confirmed Off state counts as success; if the state cannot be confirmed (e.g. unreachable BMC) the fence fails and is retried, to avoid split-brain. Also map Redfish PowerOperation.OFF to ForceOff (hard power-off) instead of GracefulShutdown, consistent with the ipmitool driver and appropriate for fencing an unresponsive host (SOFT remains the graceful ACPI shutdown). Fixes #13376 --- .../cloudstack/kvm/ha/KVMHAProvider.java | 58 +++++++++++--- .../cloudstack/kvm/ha/KVMHostHATest.java | 76 +++++++++++++++++++ .../driver/redfish/RedfishWrapper.java | 6 +- 3 files changed, 129 insertions(+), 11 deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java index f0b5cfc337de..352c1b14724e 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java @@ -33,6 +33,7 @@ import org.apache.cloudstack.ha.provider.HARecoveryException; import org.apache.cloudstack.ha.provider.host.HAAbstractHostProvider; import org.apache.cloudstack.outofbandmanagement.OutOfBandManagement.PowerOperation; +import org.apache.cloudstack.outofbandmanagement.OutOfBandManagement.PowerState; import org.apache.cloudstack.outofbandmanagement.OutOfBandManagementService; import org.joda.time.DateTime; @@ -85,18 +86,57 @@ public boolean recover(Host r) throws HARecoveryException { @Override public boolean fence(Host r) throws HAFenceException { + if (!outOfBandManagementService.isOutOfBandManagementEnabled(r)) { + logger.warn("Out-of-band management is not enabled/configured for host {}; cannot fence it.", r); + return false; + } + // Fencing must guarantee the host is powered off, and the only reliable signal for that is + // the actual chassis power state - not the return code of the power-off command. An already-off + // (or just-turned-off) host can make the power-off command report an error/conflict even though + // the host is down; conversely, an unreachable BMC must NOT be treated as a successful fence, to + // avoid split-brain. Therefore: (1) if already off, consider it fenced; (2) otherwise issue a + // best-effort power-off; (3) declare the fence successful only if the power state is confirmed Off. try { - if (outOfBandManagementService.isOutOfBandManagementEnabled(r)){ - final OutOfBandManagementResponse resp = outOfBandManagementService.executePowerOperation(r, PowerOperation.OFF, null); - return resp.getSuccess(); - } else { - logger.warn("OOBM fence operation failed for this host {}", r); - return false; + if (isHostPoweredOff(r)) { + logger.info("Host {} is already powered off; considering it fenced.", r); + return true; } - } catch (Exception e){ - logger.warn("OOBM service is not configured or enabled for this host {} error is {}", r, e.getMessage()); - throw new HAFenceException(String.format("OBM service is not configured or enabled for this host %s", r.getName()), e); + + try { + outOfBandManagementService.executePowerOperation(r, PowerOperation.OFF, null); + } catch (Exception e) { + // The power-off may legitimately fail (e.g. the chassis is already off or the command + // conflicts with the current power state). Do not fail here - verify the actual power + // state below instead of trusting the command result. + logger.warn("Out-of-band power-off command for host {} did not complete successfully ({}); verifying the actual power state.", r, e.getMessage()); + } + + if (isHostPoweredOff(r)) { + logger.info("Confirmed host {} is powered off; fencing successful.", r); + return true; + } + + logger.warn("Could not confirm host {} is powered off after the fence power-off; fencing will be retried.", r); + return false; + } catch (Exception e) { + logger.warn("Out-of-band fence operation failed for host {}: {}", r, e.getMessage()); + throw new HAFenceException(String.format("Out-of-band fence operation failed for host %s", r.getName()), e); + } + } + + /** + * Returns {@code true} only when the host's chassis power is positively confirmed to be Off via + * out-of-band management. Any failure to determine the power state (e.g. an unreachable BMC) returns + * {@code false}, so that a host whose state cannot be confirmed is never treated as fenced. + */ + protected boolean isHostPoweredOff(Host r) { + try { + final OutOfBandManagementResponse statusResponse = outOfBandManagementService.executePowerOperation(r, PowerOperation.STATUS, null); + return statusResponse != null && PowerState.Off.equals(statusResponse.getPowerState()); + } catch (Exception e) { + logger.warn("Unable to determine the out-of-band power state of host {}: {}", r, e.getMessage()); + return false; } } diff --git a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KVMHostHATest.java b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KVMHostHATest.java index a94fb01475a3..8b7cfc854c08 100644 --- a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KVMHostHATest.java +++ b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KVMHostHATest.java @@ -20,10 +20,20 @@ import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.ArgumentMatchers.isNull; import static org.mockito.Mockito.lenient; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; +import org.apache.cloudstack.api.response.OutOfBandManagementResponse; import org.apache.cloudstack.ha.provider.HACheckerException; +import org.apache.cloudstack.ha.provider.HAFenceException; +import org.apache.cloudstack.outofbandmanagement.OutOfBandManagement.PowerOperation; +import org.apache.cloudstack.outofbandmanagement.OutOfBandManagement.PowerState; +import org.apache.cloudstack.outofbandmanagement.OutOfBandManagementService; import org.joda.time.DateTime; import org.junit.Before; import org.junit.Test; @@ -34,6 +44,7 @@ import com.cloud.exception.StorageUnavailableException; import com.cloud.host.Host; import com.cloud.hypervisor.Hypervisor.HypervisorType; +import com.cloud.utils.exception.CloudRuntimeException; @RunWith(MockitoJUnitRunner.class) public class KVMHostHATest { @@ -42,12 +53,21 @@ public class KVMHostHATest { private Host host; @Mock private KVMHostActivityChecker kvmHostActivityChecker; + @Mock + private OutOfBandManagementService outOfBandManagementService; private KVMHAProvider kvmHAProvider; @Before public void setup() { kvmHAProvider = new KVMHAProvider(); kvmHAProvider.hostActivityChecker = kvmHostActivityChecker; + kvmHAProvider.outOfBandManagementService = outOfBandManagementService; + } + + private OutOfBandManagementResponse powerStatusResponse(PowerState state) { + OutOfBandManagementResponse response = mock(OutOfBandManagementResponse.class); + lenient().when(response.getPowerState()).thenReturn(state); + return response; } @Test @@ -80,4 +100,60 @@ public void testHostActivityForDownHost() throws HACheckerException, StorageUnav assertFalse(kvmHAProvider.hasActivity(host, dt)); } + @Test + public void testFenceWhenOutOfBandManagementNotEnabled() throws HAFenceException { + when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(false); + assertFalse(kvmHAProvider.fence(host)); + verify(outOfBandManagementService, never()).executePowerOperation(eq(host), eq(PowerOperation.OFF), isNull()); + } + + @Test + public void testFenceWhenHostAlreadyPoweredOff() throws HAFenceException { + OutOfBandManagementResponse offStatus = powerStatusResponse(PowerState.Off); + when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(true); + when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.STATUS), isNull())).thenReturn(offStatus); + assertTrue(kvmHAProvider.fence(host)); + // The host is already off, so no power-off command should be issued. + verify(outOfBandManagementService, never()).executePowerOperation(eq(host), eq(PowerOperation.OFF), isNull()); + } + + @Test + public void testFenceConfirmedOffAfterPowerOff() throws HAFenceException { + OutOfBandManagementResponse onStatus = powerStatusResponse(PowerState.On); + OutOfBandManagementResponse offStatus = powerStatusResponse(PowerState.Off); + when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(true); + // First STATUS (pre-check) returns On, second STATUS (post power-off) returns Off. + when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.STATUS), isNull())) + .thenReturn(onStatus, offStatus); + assertTrue(kvmHAProvider.fence(host)); + verify(outOfBandManagementService).executePowerOperation(eq(host), eq(PowerOperation.OFF), isNull()); + } + + @Test + public void testFencePowerOffCommandFailsButHostConfirmedOff() throws HAFenceException { + // Regression: the power-off command fails (e.g. the chassis is already off and the BMC returns + // HTTP 409), but the host is actually off. Fencing must succeed because the confirmed power state - + // not the power-off command's result - is authoritative. + OutOfBandManagementResponse onStatus = powerStatusResponse(PowerState.On); + OutOfBandManagementResponse offStatus = powerStatusResponse(PowerState.Off); + when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(true); + when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.STATUS), isNull())) + .thenReturn(onStatus, offStatus); + when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.OFF), isNull())) + .thenThrow(new CloudRuntimeException("power-off failed: HTTP 409")); + assertTrue(kvmHAProvider.fence(host)); + } + + @Test + public void testFenceFailsWhenPowerStateCannotBeConfirmedOff() throws HAFenceException { + // The host cannot be confirmed off (e.g. an unreachable BMC). Fencing must NOT succeed, to avoid + // restarting VMs while the host may still be running (split-brain). + when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(true); + when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.STATUS), isNull())) + .thenThrow(new CloudRuntimeException("BMC unreachable")); + when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.OFF), isNull())) + .thenThrow(new CloudRuntimeException("BMC unreachable")); + assertFalse(kvmHAProvider.fence(host)); + } + } diff --git a/plugins/outofbandmanagement-drivers/redfish/src/main/java/org/apache/cloudstack/outofbandmanagement/driver/redfish/RedfishWrapper.java b/plugins/outofbandmanagement-drivers/redfish/src/main/java/org/apache/cloudstack/outofbandmanagement/driver/redfish/RedfishWrapper.java index 09cee3b21aec..17199de1c71e 100644 --- a/plugins/outofbandmanagement-drivers/redfish/src/main/java/org/apache/cloudstack/outofbandmanagement/driver/redfish/RedfishWrapper.java +++ b/plugins/outofbandmanagement-drivers/redfish/src/main/java/org/apache/cloudstack/outofbandmanagement/driver/redfish/RedfishWrapper.java @@ -31,7 +31,9 @@ public RedfishClient.RedfishResetCmd parsePowerCommand(OutOfBandManagement.Power case ON: return RedfishClient.RedfishResetCmd.On; case OFF: - return RedfishClient.RedfishResetCmd.GracefulShutdown; + // OFF is a hard power-off (consistent with the ipmitool driver and required for HA + // fencing of an unresponsive host); SOFT performs the graceful ACPI shutdown. + return RedfishClient.RedfishResetCmd.ForceOff; case CYCLE: return RedfishClient.RedfishResetCmd.PowerCycle; case RESET: @@ -39,7 +41,7 @@ public RedfishClient.RedfishResetCmd parsePowerCommand(OutOfBandManagement.Power case SOFT: return RedfishClient.RedfishResetCmd.GracefulShutdown; case STATUS: - throw new IllegalStateException(String.format("%s is not a valid Redfish Reset command [%s]", operation)); + throw new IllegalStateException(String.format("%s is not a valid Redfish Reset command", operation)); default: throw new IllegalStateException(String.format("Redfish does not support operation [%s]", operation)); }