diff --git a/compute/src/main/java/org/zstack/compute/vm/VmNicManagerImpl.java b/compute/src/main/java/org/zstack/compute/vm/VmNicManagerImpl.java index 25c0b005d71..31b3e35d32a 100644 --- a/compute/src/main/java/org/zstack/compute/vm/VmNicManagerImpl.java +++ b/compute/src/main/java/org/zstack/compute/vm/VmNicManagerImpl.java @@ -58,6 +58,10 @@ public void afterAddIpAddress(String vmNicUUid, String usedIpUuid) { SQL.New(UsedIpVO.class).eq(UsedIpVO_.uuid, usedIpUuid).set(UsedIpVO_.vmNicUuid, vmNicUUid).update(); VmNicVO nic = Q.New(VmNicVO.class).eq(VmNicVO_.uuid, vmNicUUid).find(); + if (nic == null) { + logger.debug(String.format("VmNic[uuid:%s] not found, skip afterAddIpAddress", vmNicUUid)); + return; + } UsedIpVO temp = null; /* if there is ipv4 addresses, we put the first attached ipv4 address to VmNic.ip @@ -88,6 +92,10 @@ public void afterAddIpAddress(String vmNicUUid, String usedIpUuid) { @Override public void afterDelIpAddress(String vmNicUUid, String usedIpUuid) { VmNicVO nic = Q.New(VmNicVO.class).eq(VmNicVO_.uuid, vmNicUUid).find(); + if (nic == null) { + logger.debug(String.format("VmNic[uuid:%s] not found, skip afterDelIpAddress", vmNicUUid)); + return; + } if (nic.getUsedIpUuid() != null && !nic.getUsedIpUuid().equals(usedIpUuid)) { return; } diff --git a/conf/i18n/globalErrorCodeMapping/global-error-en_US.json b/conf/i18n/globalErrorCodeMapping/global-error-en_US.json index 32eb4c8f056..715e823d95e 100644 --- a/conf/i18n/globalErrorCodeMapping/global-error-en_US.json +++ b/conf/i18n/globalErrorCodeMapping/global-error-en_US.json @@ -3374,7 +3374,7 @@ "ORG_ZSTACK_NETWORK_HUAWEI_IMASTER_10019": "delete token of SDN controller [IP:%s] failed because %s", "ORG_ZSTACK_STORAGE_PRIMARY_BLOCK_10004": "Cannot execute volume mapping to host flow due to invalid volume ID.%s", "ORG_ZSTACK_NETWORK_SERVICE_PORTFORWARDING_10007": "port forwarding rule [uuid:%s] has not been attached to any virtual machine network interface, cannot detach", - "ORG_ZSTACK_MEVOCO_10088": "cannot take a snapshot for volumes[%s] when volume[uuid: %s] is not attached", + "ORG_ZSTACK_MEVOCO_10088": "cannot create snapshot for volume[uuid:%s] because it is not attached to any VM instance. Please attach the volume to a VM first. Affected volumes: %s", "ORG_ZSTACK_STORAGE_PRIMARY_BLOCK_10005": "Cannot execute map LUN to host flow due to invalid LUN type: %s", "ORG_ZSTACK_NETWORK_SERVICE_PORTFORWARDING_10008": "port forwarding rule [uuid:%s] has been associated with vm nic [uuid:%s], cannot be reassigned again", "ORG_ZSTACK_MEVOCO_10087": "A Running VM[uuid:%s] has no associated Host UUID.", diff --git a/conf/i18n/globalErrorCodeMapping/global-error-zh_CN.json b/conf/i18n/globalErrorCodeMapping/global-error-zh_CN.json index 84609838ddc..01960e8eb45 100644 --- a/conf/i18n/globalErrorCodeMapping/global-error-zh_CN.json +++ b/conf/i18n/globalErrorCodeMapping/global-error-zh_CN.json @@ -3374,7 +3374,7 @@ "ORG_ZSTACK_NETWORK_HUAWEI_IMASTER_10019": "删除 SDN 控制器 [IP:%s] 的令牌失败,因为 %s", "ORG_ZSTACK_STORAGE_PRIMARY_BLOCK_10004": "无法执行映射LUN到主机流程,无效的LUN ID", "ORG_ZSTACK_NETWORK_SERVICE_PORTFORWARDING_10007": "端口转发规则 rule[uuid:%s] 没有绑定到任何 VM 的网卡上,无法解除绑定", - "ORG_ZSTACK_MEVOCO_10088": "无法为挂载状态以外的卷[%s]创建快照", + "ORG_ZSTACK_MEVOCO_10088": "无法为云盘[uuid:%s]创建快照,因为该云盘未挂载到任何云主机。请先将云盘挂载到云主机后再创建快照。相关云盘: %s", "ORG_ZSTACK_STORAGE_PRIMARY_BLOCK_10005": "无法执行映射LUN到主机流程,无效的LUN类型", "ORG_ZSTACK_NETWORK_SERVICE_PORTFORWARDING_10008": "端口转发规则[uuid:%s]已绑定到VM网卡[uuid:%s],无法再次绑定", "ORG_ZSTACK_MEVOCO_10087": "如何一个运行中的VM[uuid:%s]没有宿主机uuid?", diff --git a/core/src/main/java/org/zstack/core/cloudbus/ResourceDestinationMakerImpl.java b/core/src/main/java/org/zstack/core/cloudbus/ResourceDestinationMakerImpl.java index 08a776f1db2..9ead578395d 100755 --- a/core/src/main/java/org/zstack/core/cloudbus/ResourceDestinationMakerImpl.java +++ b/core/src/main/java/org/zstack/core/cloudbus/ResourceDestinationMakerImpl.java @@ -27,27 +27,27 @@ public class ResourceDestinationMakerImpl implements ManagementNodeChangeListene private DatabaseFacade dbf; @Override - public void nodeJoin(ManagementNodeInventory inv) { + public synchronized void nodeJoin(ManagementNodeInventory inv) { nodeHash.add(inv.getUuid()); nodes.put(inv.getUuid(), new NodeInfo(inv)); } @Override - public void nodeLeft(ManagementNodeInventory inv) { + public synchronized void nodeLeft(ManagementNodeInventory inv) { String nodeId = inv.getUuid(); nodeHash.remove(nodeId); nodes.remove(nodeId); } @Override - public void iAmDead(ManagementNodeInventory inv) { + public synchronized void iAmDead(ManagementNodeInventory inv) { String nodeId = inv.getUuid(); nodeHash.remove(nodeId); nodes.remove(nodeId); } @Override - public void iJoin(ManagementNodeInventory inv) { + public synchronized void iJoin(ManagementNodeInventory inv) { List lst = Q.New(ManagementNodeVO.class).list(); lst.forEach((ManagementNodeVO node) -> { nodeHash.add(node.getUuid()); @@ -56,7 +56,7 @@ public void iJoin(ManagementNodeInventory inv) { } @Override - public String makeDestination(String resourceUuid) { + public synchronized String makeDestination(String resourceUuid) { String nodeUuid = nodeHash.get(resourceUuid); if (nodeUuid == null) { throw new CloudRuntimeException("Cannot find any available management node to send message"); @@ -66,18 +66,18 @@ public String makeDestination(String resourceUuid) { } @Override - public boolean isManagedByUs(String resourceUuid) { + public synchronized boolean isManagedByUs(String resourceUuid) { String nodeUuid = makeDestination(resourceUuid); return nodeUuid.equals(Platform.getManagementServerId()); } @Override - public Collection getManagementNodesInHashRing() { - return nodeHash.getNodes(); + public synchronized Collection getManagementNodesInHashRing() { + return new ArrayList<>(nodeHash.getNodes()); } @Override - public NodeInfo getNodeInfo(String nodeUuid) { + public synchronized NodeInfo getNodeInfo(String nodeUuid) { NodeInfo info = nodes.get(nodeUuid); if (info == null) { ManagementNodeVO vo = dbf.findByUuid(nodeUuid, ManagementNodeVO.class); @@ -93,17 +93,17 @@ public NodeInfo getNodeInfo(String nodeUuid) { } @Override - public Collection getAllNodeInfo() { - return nodes.values(); + public synchronized Collection getAllNodeInfo() { + return new ArrayList<>(nodes.values()); } @Override - public int getManagementNodeCount() { - return nodes.values().size(); + public synchronized int getManagementNodeCount() { + return nodes.size(); } - public boolean isNodeInCircle(String nodeId) { + public synchronized boolean isNodeInCircle(String nodeId) { return nodeHash.hasNode(nodeId); } } diff --git a/core/src/main/java/org/zstack/core/thread/DispatchQueueImpl.java b/core/src/main/java/org/zstack/core/thread/DispatchQueueImpl.java index e298cdd787f..961b192ac7c 100755 --- a/core/src/main/java/org/zstack/core/thread/DispatchQueueImpl.java +++ b/core/src/main/java/org/zstack/core/thread/DispatchQueueImpl.java @@ -302,7 +302,7 @@ private class SyncTaskFuture extends AbstractFuture { public SyncTaskFuture(SyncTask task) { super(task); - this.parentContext = Context.current(); + this.parentContext = isTelemetryEnabled() ? Context.current() : null; } private SyncTask getTask() { diff --git a/header/src/main/java/org/zstack/header/storage/addon/primary/ExternalPrimaryStorageInventory.java b/header/src/main/java/org/zstack/header/storage/addon/primary/ExternalPrimaryStorageInventory.java index 7808c227623..a15ed211307 100644 --- a/header/src/main/java/org/zstack/header/storage/addon/primary/ExternalPrimaryStorageInventory.java +++ b/header/src/main/java/org/zstack/header/storage/addon/primary/ExternalPrimaryStorageInventory.java @@ -4,8 +4,10 @@ import org.zstack.header.storage.primary.PrimaryStorageInventory; import org.zstack.utils.gson.JSONObjectUtil; +import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.stream.Collectors; @Inventory(mappingVOClass = ExternalPrimaryStorageVO.class) @@ -59,6 +61,7 @@ public ExternalPrimaryStorageInventory(ExternalPrimaryStorageVO lvo) { super(lvo); identity = lvo.getIdentity(); config = JSONObjectUtil.toObject(lvo.getConfig(), LinkedHashMap.class); + desensitizeConfig(config); addonInfo = JSONObjectUtil.toObject(lvo.getAddonInfo(), LinkedHashMap.class); outputProtocols = lvo.getOutputProtocols().stream().map(PrimaryStorageOutputProtocolRefVO::getOutputProtocol).collect(Collectors.toList()); defaultProtocol = lvo.getDefaultProtocol(); @@ -68,6 +71,35 @@ public static ExternalPrimaryStorageInventory valueOf(ExternalPrimaryStorageVO l return new ExternalPrimaryStorageInventory(lvo); } + private static void desensitizeConfig(Map config) { + if (config == null) return; + desensitizeUrlList(config, "mdsUrls"); + desensitizeUrlList(config, "mdsInfos"); + } + + private static void desensitizeUrlList(Map config, String key) { + Object urls = config.get(key); + if (urls instanceof List) { + List desensitized = new ArrayList<>(); + for (Object url : (List) urls) { + desensitized.add(desensitizeUrl(String.valueOf(url))); + } + config.put(key, desensitized); + } + } + + private static String desensitizeUrl(String url) { + int atIndex = url.lastIndexOf('@'); + if (atIndex > 0) { + int schemeIndex = url.indexOf("://"); + if (schemeIndex >= 0 && schemeIndex < atIndex) { + return url.substring(0, schemeIndex + 3) + "***" + url.substring(atIndex); + } + return "***" + url.substring(atIndex); + } + return url; + } + public String getIdentity() { return identity; } diff --git a/header/src/main/java/org/zstack/header/vm/VmInstanceSpec.java b/header/src/main/java/org/zstack/header/vm/VmInstanceSpec.java index 7007c592aea..99ee2173b98 100755 --- a/header/src/main/java/org/zstack/header/vm/VmInstanceSpec.java +++ b/header/src/main/java/org/zstack/header/vm/VmInstanceSpec.java @@ -847,7 +847,9 @@ public void setBootMode(String bootMode) { public long getRootDiskAllocateSize() { if (rootDiskOffering == null) { - return this.getImageSpec().getInventory().getSize(); + long virtualSize = this.getImageSpec().getInventory().getSize(); + long actualSize = this.getImageSpec().getInventory().getActualSize(); + return Math.max(virtualSize, actualSize); } return rootDiskOffering.getDiskSize(); } diff --git a/header/src/main/java/org/zstack/header/vm/VmInstanceState.java b/header/src/main/java/org/zstack/header/vm/VmInstanceState.java index 8a755b52fda..49303e23252 100755 --- a/header/src/main/java/org/zstack/header/vm/VmInstanceState.java +++ b/header/src/main/java/org/zstack/header/vm/VmInstanceState.java @@ -168,6 +168,7 @@ public enum VmInstanceState { new Transaction(VmInstanceStateEvent.destroyed, VmInstanceState.Destroyed), new Transaction(VmInstanceStateEvent.destroying, VmInstanceState.Destroying), new Transaction(VmInstanceStateEvent.running, VmInstanceState.Running), + new Transaction(VmInstanceStateEvent.stopped, VmInstanceState.Stopped), new Transaction(VmInstanceStateEvent.expunging, VmInstanceState.Expunging) ); Destroyed.transactions( diff --git a/network/src/main/java/org/zstack/network/l3/L3BasicNetwork.java b/network/src/main/java/org/zstack/network/l3/L3BasicNetwork.java index 5536a5fc487..b1b0b92d497 100755 --- a/network/src/main/java/org/zstack/network/l3/L3BasicNetwork.java +++ b/network/src/main/java/org/zstack/network/l3/L3BasicNetwork.java @@ -1075,6 +1075,13 @@ private void handle(APIGetFreeIpMsg msg) { } limit -= freeIpInventorys.size(); } + + Set reservedIpRanges = self.getReservedIpRanges(); + if (reservedIpRanges != null && !reservedIpRanges.isEmpty()) { + freeIpInventorys.removeIf(freeIp -> reservedIpRanges.stream().anyMatch( + r -> NetworkUtils.isInRange(freeIp.getIp(), r.getStartIp(), r.getEndIp()))); + } + reply.setInventories(freeIpInventorys); bus.reply(msg, reply); diff --git a/plugin/ceph/src/main/java/org/zstack/storage/ceph/primary/CephPrimaryStorageBase.java b/plugin/ceph/src/main/java/org/zstack/storage/ceph/primary/CephPrimaryStorageBase.java index d80b40a1d6a..8b387306683 100755 --- a/plugin/ceph/src/main/java/org/zstack/storage/ceph/primary/CephPrimaryStorageBase.java +++ b/plugin/ceph/src/main/java/org/zstack/storage/ceph/primary/CephPrimaryStorageBase.java @@ -5446,7 +5446,7 @@ private void deleteSnapshotOnPrimaryStorage(final DeleteSnapshotOnPrimaryStorage httpCall(DELETE_SNAPSHOT_PATH, cmd, DeleteSnapshotRsp.class, new ReturnValueCompletion(msg) { @Override public void success(DeleteSnapshotRsp returnValue) { - osdHelper.releaseAvailableCapacity(msg.getSnapshot().getPrimaryStorageInstallPath(), msg.getSnapshot().getSize()); + osdHelper.releaseAvailableCapWithRatio(msg.getSnapshot().getPrimaryStorageInstallPath(), msg.getSnapshot().getSize()); bus.reply(msg, reply); completion.done(); } diff --git a/plugin/loadBalancer/src/main/java/org/zstack/network/service/lb/LoadBalancerApiInterceptor.java b/plugin/loadBalancer/src/main/java/org/zstack/network/service/lb/LoadBalancerApiInterceptor.java index 97b88c919c2..0d9946d5320 100755 --- a/plugin/loadBalancer/src/main/java/org/zstack/network/service/lb/LoadBalancerApiInterceptor.java +++ b/plugin/loadBalancer/src/main/java/org/zstack/network/service/lb/LoadBalancerApiInterceptor.java @@ -39,6 +39,7 @@ import org.zstack.network.service.vip.VipVO_; import org.zstack.tag.PatternedSystemTag; import org.zstack.tag.TagManager; +import org.zstack.core.upgrade.UpgradeGlobalConfig; import org.zstack.utils.*; import org.zstack.utils.function.ForEachFunction; import org.zstack.utils.logging.CLogger; @@ -152,10 +153,22 @@ public APIMessage intercept(APIMessage msg) throws ApiMessageInterceptionExcepti validate((APIGetCandidateVmNicsForLoadBalancerServerGroupMsg)msg); } else if (msg instanceof APIChangeLoadBalancerBackendServerMsg) { validate((APIChangeLoadBalancerBackendServerMsg)msg); + } else if (msg instanceof APIDeleteLoadBalancerMsg) { + validate((APIDeleteLoadBalancerMsg) msg); } return msg; } + private void validate(APIDeleteLoadBalancerMsg msg) { + if (UpgradeGlobalConfig.GRAYSCALE_UPGRADE.value(Boolean.class)) { + LoadBalancerVO lb = dbf.findByUuid(msg.getUuid(), LoadBalancerVO.class); + if (lb != null && lb.getType() == LoadBalancerType.SLB) { + throw new ApiMessageInterceptionException(argerr( + "cannot delete the standalone load balancer[uuid:%s] during grayscale upgrade", msg.getUuid())); + } + } + } + private void validate(APIDeleteAccessControlListMsg msg) { /*List refs = Q.New(LoadBalancerListenerACLRefVO.class).select(LoadBalancerListenerACLRefVO_.listenerUuid) .eq(LoadBalancerListenerACLRefVO_.aclUuid, msg.getUuid()).isNull(LoadBalancerListenerACLRefVO_.serverGroupUuid).listValues(); diff --git a/plugin/sdnController/src/main/java/org/zstack/sdnController/SdnControllerApiInterceptor.java b/plugin/sdnController/src/main/java/org/zstack/sdnController/SdnControllerApiInterceptor.java index 97840150a1e..fa0f9c1932d 100644 --- a/plugin/sdnController/src/main/java/org/zstack/sdnController/SdnControllerApiInterceptor.java +++ b/plugin/sdnController/src/main/java/org/zstack/sdnController/SdnControllerApiInterceptor.java @@ -87,6 +87,8 @@ public APIMessage intercept(APIMessage msg) throws ApiMessageInterceptionExcepti validate((APIPullSdnControllerTenantMsg) msg); } else if (msg instanceof APIChangeSdnControllerMsg) { validate((APIChangeSdnControllerMsg) msg); + } else if (msg instanceof APIRemoveSdnControllerMsg) { + validate((APIRemoveSdnControllerMsg) msg); } setServiceId(msg); @@ -311,6 +313,17 @@ private boolean isOverlappedVlanRange(int start, int end, Integer startVlan, Int (start <= startVlan && end >= endVlan); } + private void validate(APIRemoveSdnControllerMsg msg) { + long poolCount = Q.New(HardwareL2VxlanNetworkPoolVO.class) + .eq(HardwareL2VxlanNetworkPoolVO_.sdnControllerUuid, msg.getUuid()) + .count(); + if (poolCount > 0) { + throw new ApiMessageInterceptionException(argerr(ORG_ZSTACK_SDNCONTROLLER_10031, + "could not remove sdn controller[uuid:%s] because it still has %d L2 vxlan network pool(s) attached. Please detach them first", + msg.getUuid(), poolCount)); + } + } + private void validate(APIChangeSdnControllerMsg msg) { if (msg.getVlanRanges() != null && !msg.getVlanRanges().isEmpty()) { validateVlanRanges(msg.getVlanRanges()); diff --git a/plugin/zbs/src/main/java/org/zstack/storage/zbs/ZbsStorageController.java b/plugin/zbs/src/main/java/org/zstack/storage/zbs/ZbsStorageController.java index db06239acb3..276ab367ba1 100644 --- a/plugin/zbs/src/main/java/org/zstack/storage/zbs/ZbsStorageController.java +++ b/plugin/zbs/src/main/java/org/zstack/storage/zbs/ZbsStorageController.java @@ -179,7 +179,10 @@ public List getActiveClients(String installPath, String prot if (VolumeProtocol.CBD.toString().equals(protocol)) { GetVolumeClientsCmd cmd = new GetVolumeClientsCmd(); cmd.setPath(installPath); - GetVolumeClientsRsp rsp = syncHttpCall(GET_VOLUME_CLIENTS_PATH, cmd, GetVolumeClientsRsp.class); + GetVolumeClientsRsp rsp = new HttpCaller<>(GET_VOLUME_CLIENTS_PATH, cmd, GetVolumeClientsRsp.class, + null, TimeUnit.SECONDS, 30, true) + .setTryNext(true) + .syncCall(); List clients = new ArrayList<>(); if (!rsp.isSuccess()) { @@ -1411,6 +1414,11 @@ public class HttpCaller { private boolean tryNext = false; + HttpCaller setTryNext(boolean tryNext) { + this.tryNext = tryNext; + return this; + } + public HttpCaller(String path, AgentCommand cmd, Class retClass, ReturnValueCompletion callback) { this(path, cmd, retClass, callback, null, 0, false); } diff --git a/portal/src/main/java/org/zstack/portal/managementnode/ManagementNodeManagerImpl.java b/portal/src/main/java/org/zstack/portal/managementnode/ManagementNodeManagerImpl.java index a945ab77274..4ece718ff52 100755 --- a/portal/src/main/java/org/zstack/portal/managementnode/ManagementNodeManagerImpl.java +++ b/portal/src/main/java/org/zstack/portal/managementnode/ManagementNodeManagerImpl.java @@ -74,6 +74,7 @@ import java.sql.SQLException; import java.sql.Timestamp; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -107,6 +108,15 @@ public class ManagementNodeManagerImpl extends AbstractService implements Manage // A dictionary (nodeId -> ManagementNodeInventory) of joined management Node final private Map joinedManagementNodes = new ConcurrentHashMap<>(); + // Lock to serialize lifecycle events from heartbeat reconciliation and canonical event callbacks, + // preventing race conditions where a nodeJoin event is immediately followed by a stale nodeLeft + // from the heartbeat thread, or vice versa. See ZSTAC-77711. + private final Object lifecycleLock = new Object(); + + // Track nodes found in hash ring but missing from DB. Only call nodeLeft after a node + // is missing for two consecutive heartbeat cycles, to avoid removing nodes that just joined. + private final Set suspectedMissingFromDb = new HashSet<>(); + private static int NODE_STARTING = 0; private static int NODE_RUNNING = 1; private static int NODE_FAILED = -1; @@ -368,12 +378,16 @@ protected void run(Map tokens, Object data) { ManagementNodeLifeCycleData d = (ManagementNodeLifeCycleData) data; - if (LifeCycle.NodeJoin.toString().equals(d.getLifeCycle())) { - nodeLifeCycle.nodeJoin(d.getInventory()); - } else if (LifeCycle.NodeLeft.toString().equals(d.getLifeCycle())) { - nodeLifeCycle.nodeLeft(d.getInventory()); - } else { - throw new CloudRuntimeException(String.format("unknown lifecycle[%s]", d.getLifeCycle())); + synchronized (lifecycleLock) { + if (LifeCycle.NodeJoin.toString().equals(d.getLifeCycle())) { + // Clear from suspected set since the node is confirmed alive + suspectedMissingFromDb.remove(d.getInventory().getUuid()); + nodeLifeCycle.nodeJoin(d.getInventory()); + } else if (LifeCycle.NodeLeft.toString().equals(d.getLifeCycle())) { + nodeLifeCycle.nodeLeft(d.getInventory()); + } else { + throw new CloudRuntimeException(String.format("unknown lifecycle[%s]", d.getLifeCycle())); + } } } }; @@ -860,34 +874,55 @@ private void checkAllNodesHealth() { Set nodeUuidsInDb = nodesInDb.stream().map(ManagementNodeVO::getUuid).collect(Collectors.toSet()); - // When a node is dying, we may not receive the the dead notification because the message bus may be also dead - // at that moment. By checking if the node UUID is still in our hash ring, we know what nodes should be kicked out - destinationMaker.getManagementNodesInHashRing().forEach(nodeUuid -> { - if (!nodeUuidsInDb.contains(nodeUuid)) { - logger.warn(String.format("found that a management node[uuid:%s] had no heartbeat in database but still in our hash ring," + - "notify that it's dead", nodeUuid)); - ManagementNodeInventory inv = new ManagementNodeInventory(); - inv.setUuid(nodeUuid); - inv.setHostName(destinationMaker.getNodeInfo(nodeUuid).getNodeIP()); - - nodeLifeCycle.nodeLeft(inv); - } - }); - - // check if any node missing in our hash ring - nodesInDb.forEach(n -> { - if (n.getUuid().equals(node().getUuid()) || suspects.contains(n)) { - return; - } - - new Runnable() { - @Override - @AsyncThread - public void run() { - nodeLifeCycle.nodeJoin(ManagementNodeInventory.valueOf(n)); + // Reconcile hash ring with DB under lifecycleLock to prevent race with + // canonical event callbacks (nodeJoin/nodeLeft). See ZSTAC-77711. + synchronized (lifecycleLock) { + // When a node is dying, we may not receive the dead notification because the message bus may be also dead + // at that moment. By checking if the node UUID is still in our hash ring, we know what nodes should be kicked out. + // Use two-round confirmation: first round marks as suspected, second round actually removes. + Set currentSuspected = new HashSet<>(); + destinationMaker.getManagementNodesInHashRing().forEach(nodeUuid -> { + if (!nodeUuidsInDb.contains(nodeUuid)) { + if (suspectedMissingFromDb.contains(nodeUuid)) { + // Second consecutive detection — confirmed missing, remove from hash ring + logger.warn(String.format("management node[uuid:%s] confirmed missing from database for two consecutive" + + " heartbeat cycles, removing from hash ring", nodeUuid)); + ManagementNodeInventory inv = new ManagementNodeInventory(); + inv.setUuid(nodeUuid); + try { + inv.setHostName(destinationMaker.getNodeInfo(nodeUuid).getNodeIP()); + } catch (Exception e) { + logger.warn(String.format("cannot get node info for node[uuid:%s], use empty hostname", nodeUuid)); + } + + nodeLifeCycle.nodeLeft(inv); + } else { + // First detection — mark as suspected, defer removal to next cycle + logger.warn(String.format("management node[uuid:%s] not found in database but still in hash ring," + + " marking as suspected (will remove on next heartbeat if still missing)", nodeUuid)); + currentSuspected.add(nodeUuid); + } } - }.run(); - }); + }); + // Update suspected set: only keep nodes that are newly suspected this round + suspectedMissingFromDb.clear(); + suspectedMissingFromDb.addAll(currentSuspected); + + // check if any node missing in our hash ring + nodesInDb.forEach(n -> { + if (n.getUuid().equals(node().getUuid()) || suspects.contains(n)) { + return; + } + + new Runnable() { + @Override + @AsyncThread + public void run() { + nodeLifeCycle.nodeJoin(ManagementNodeInventory.valueOf(n)); + } + }.run(); + }); + } } @Override diff --git a/test/src/test/groovy/org/zstack/test/integration/network/sdnController/HardwareVxlanCase.groovy b/test/src/test/groovy/org/zstack/test/integration/network/sdnController/HardwareVxlanCase.groovy index d5f0c01eb14..1cebbf68020 100644 --- a/test/src/test/groovy/org/zstack/test/integration/network/sdnController/HardwareVxlanCase.groovy +++ b/test/src/test/groovy/org/zstack/test/integration/network/sdnController/HardwareVxlanCase.groovy @@ -41,6 +41,11 @@ class HardwareVxlanCase extends SubCase { @Override void clean() { + // ZSTAC-80186: delete pools created in createEnv() before env.delete() + // removes the SDN controller (which now validates no attached pools) + queryL2VxlanNetworkPool {}.each { pool -> + deleteL2Network { uuid = pool.uuid } + } env.delete() } diff --git a/utils/src/main/java/org/zstack/utils/clouderrorcode/CloudOperationsErrorCode.java b/utils/src/main/java/org/zstack/utils/clouderrorcode/CloudOperationsErrorCode.java index a0f09d4f1e9..c9624e59ea9 100644 --- a/utils/src/main/java/org/zstack/utils/clouderrorcode/CloudOperationsErrorCode.java +++ b/utils/src/main/java/org/zstack/utils/clouderrorcode/CloudOperationsErrorCode.java @@ -6274,6 +6274,8 @@ public class CloudOperationsErrorCode { public static final String ORG_ZSTACK_STORAGE_BACKUP_10133 = "ORG_ZSTACK_STORAGE_BACKUP_10133"; + public static final String ORG_ZSTACK_STORAGE_BACKUP_CANCEL_TIMEOUT = "ORG_ZSTACK_STORAGE_BACKUP_CANCEL_TIMEOUT"; + public static final String ORG_ZSTACK_COMPUTE_10000 = "ORG_ZSTACK_COMPUTE_10000"; public static final String ORG_ZSTACK_COMPUTE_10001 = "ORG_ZSTACK_COMPUTE_10001"; @@ -15264,6 +15266,8 @@ public class CloudOperationsErrorCode { public static final String ORG_ZSTACK_SDNCONTROLLER_10030 = "ORG_ZSTACK_SDNCONTROLLER_10030"; + public static final String ORG_ZSTACK_SDNCONTROLLER_10031 = "ORG_ZSTACK_SDNCONTROLLER_10031"; + public static final String ORG_ZSTACK_TEST_INTEGRATION_PREMIUM_ZSV_SNAPSHOT_10000 = "ORG_ZSTACK_TEST_INTEGRATION_PREMIUM_ZSV_SNAPSHOT_10000"; public static final String ORG_ZSTACK_TEST_INTEGRATION_PREMIUM_ZSV_SNAPSHOT_10001 = "ORG_ZSTACK_TEST_INTEGRATION_PREMIUM_ZSV_SNAPSHOT_10001";