Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
package org.zstack.compute.vm;

import org.springframework.beans.factory.annotation.Autowired;
import org.zstack.core.cloudbus.CloudBusCallBack;
import org.zstack.core.componentloader.PluginRegistry;
import org.zstack.core.db.Q;
import org.zstack.core.gc.GC;
import org.zstack.core.gc.GCCompletion;
import org.zstack.core.gc.TimeBasedGarbageCollector;
import org.zstack.header.host.HostVO;
import org.zstack.header.message.MessageReply;
import org.zstack.header.storage.primary.CleanupVmInstanceMetadataOnPrimaryStorageMsg;
import org.zstack.header.storage.primary.PrimaryStorageConstant;
import org.zstack.header.storage.primary.PrimaryStorageVO;
import org.zstack.header.storage.primary.PrimaryStorageVO_;
import org.zstack.header.vm.metadata.VmMetadataPathBuildExtensionPoint;
import org.zstack.utils.Utils;
import org.zstack.utils.logging.CLogger;

public class CleanupVmInstanceMetadataOnPrimaryStorageGC extends TimeBasedGarbageCollector {
private static final CLogger logger = Utils.getLogger(CleanupVmInstanceMetadataOnPrimaryStorageGC.class);

// Used by triggerNow to look up the VmMetadataPathBuildExtensionPoint
// registered for the primary storage type.
@Autowired
private PluginRegistry pluginRgty;

// NOTE(review): the @GC annotation presumably marks the fields that are kept
// with the GC job between runs - confirm against the GC framework.

// UUID of the primary storage the cleanup message is routed to. triggerNow
// cancels the job when this primary storage no longer exists.
@GC
public String primaryStorageUuid;
// UUID of the VM whose metadata is cleaned up; also used to build the GC name
// (see getGCName) and set as the VM instance UUID on the cleanup message.
@GC
public String vmUuid;
// Root volume UUID, passed through unchanged to the cleanup message.
@GC
public String rootVolumeUuid;
// Metadata path, passed through unchanged to the cleanup message.
@GC
public String metadataPath;
// Host UUID recorded when the job was submitted. triggerNow only forwards it
// when the storage type's extension reports requireHostForCleanup();
// otherwise the cleanup message carries a null host UUID.
@GC
public String hostUuid;

/**
 * Builds the name of the metadata-cleanup GC job for a VM.
 *
 * @param vmUuid UUID of the VM the job belongs to
 * @return the fixed prefix {@code gc-cleanup-vm-metadata-} followed by {@code vmUuid}
 */
public static String getGCName(String vmUuid) {
    final String prefix = "gc-cleanup-vm-metadata-";
    return prefix + vmUuid;
}

@Override
protected void triggerNow(GCCompletion completion) {
if (!dbf.isExist(primaryStorageUuid, PrimaryStorageVO.class)) {
logger.debug(String.format("[MetadataCleanupGC] primary storage[uuid:%s] no longer exists, " +
"cancel gc for vm[uuid:%s]", primaryStorageUuid, vmUuid));
completion.cancel();
return;
}

String psType = Q.New(PrimaryStorageVO.class).select(PrimaryStorageVO_.type).eq(PrimaryStorageVO_.uuid, primaryStorageUuid).findValue();
if (psType == null) {
logger.debug(String.format("[MetadataCleanupGC] primary storage[uuid:%s] type not found, " +
"cancel gc for vm[uuid:%s]", primaryStorageUuid, vmUuid));
completion.cancel();
return;
}

VmMetadataPathBuildExtensionPoint ext = pluginRgty.getExtensionFromMap(psType, VmMetadataPathBuildExtensionPoint.class);
boolean requireHost = ext != null && ext.requireHostForCleanup();

Comment on lines +58 to +60
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

缺少 metadata 扩展时应直接取消 GC。

这里把 ext == null 当成“无需 host”继续往下走,会把不支持元数据清理的主存储也送进 cleanup 流程。最终通常只会收到 operation not supported,GC 会进入无意义的周期性失败重试。

可参考的修正方式
         VmMetadataPathBuildExtensionPoint ext = pluginRgty.getExtensionFromMap(psType, VmMetadataPathBuildExtensionPoint.class);
-        boolean requireHost = ext != null && ext.requireHostForCleanup();
+        if (ext == null) {
+            logger.debug(String.format("[MetadataCleanupGC] ps[uuid:%s, type:%s] does not support vm metadata cleanup, cancel gc for vm[uuid:%s]",
+                    primaryStorageUuid, psType, vmUuid));
+            completion.cancel();
+            return;
+        }
+        boolean requireHost = ext.requireHostForCleanup();
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In
`@compute/src/main/java/org/zstack/compute/vm/CleanupVmInstanceMetadataOnPrimaryStorageGC.java`
around lines 58 - 60, 问题:当 VmMetadataPathBuildExtensionPoint 扩展不存在时代码把 ext==null
视为“不需要 host”并继续将该主存储加入清理流程,导致对不支持元数据清理的主存储反复失败。修复:在
CleanupVmInstanceMetadataOnPrimaryStorageGC 中定位到使用
pluginRgty.getExtensionFromMap(psType, VmMetadataPathBuildExtensionPoint.class)
的位置(变量 ext),当 ext == null 时直接取消本次 GC(例如立即返回或跳过该主存储),不要将其当作 requireHost=false
处理;如果保留布尔变量 requireHost(调用 ext.requireHostForCleanup()),先做 ext != null 的判断,再根据
ext 决定是否继续,并在跳过时加入一条可追踪的日志以便排查。

// Determine effective hostUuid based on whether the PS type requires a host for cleanup.
String effectiveHostUuid = hostUuid;
if (!requireHost) {
effectiveHostUuid = null;
} else {
if (effectiveHostUuid == null) {
logger.debug(String.format("[MetadataCleanupGC] hostUuid is null and ps[uuid:%s, type:%s] " +
"requires host for cleanup, cancel gc for vm[uuid:%s]",
primaryStorageUuid, psType, vmUuid));
completion.cancel();
return;
}
if (!dbf.isExist(effectiveHostUuid, HostVO.class)) {
logger.debug(String.format("[MetadataCleanupGC] host[uuid:%s] no longer exists " +
"and ps[uuid:%s, type:%s] requires host for cleanup, " +
"metadata is unreachable, cancel gc for vm[uuid:%s]",
effectiveHostUuid, primaryStorageUuid, psType, vmUuid));
completion.cancel();
return;
}
}

CleanupVmInstanceMetadataOnPrimaryStorageMsg msg = new CleanupVmInstanceMetadataOnPrimaryStorageMsg();
msg.setPrimaryStorageUuid(primaryStorageUuid);
msg.setVmInstanceUuid(vmUuid);
msg.setRootVolumeUuid(rootVolumeUuid);
msg.setMetadataPath(metadataPath);
msg.setHostUuid(effectiveHostUuid);

bus.makeTargetServiceIdByResourceUuid(msg, PrimaryStorageConstant.SERVICE_ID, primaryStorageUuid);
bus.send(msg, new CloudBusCallBack(completion) {
@Override
public void run(MessageReply reply) {
if (reply.isSuccess()) {
logger.info(String.format("[MetadataCleanupGC] successfully cleaned up metadata " +
"for vm[uuid:%s] on ps[uuid:%s]", vmUuid, primaryStorageUuid));
completion.success();
} else {
logger.warn(String.format("[MetadataCleanupGC] failed to clean up metadata " +
"for vm[uuid:%s] on ps[uuid:%s]: %s", vmUuid, primaryStorageUuid, reply.getError()));
completion.fail(reply.getError());
}
}
});
}
}
143 changes: 143 additions & 0 deletions compute/src/main/java/org/zstack/compute/vm/VmExpungeMetadataFlow.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
package org.zstack.compute.vm;

import org.springframework.beans.factory.annotation.Autowire;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Configurable;
import org.zstack.core.cloudbus.CloudBus;
import org.zstack.core.cloudbus.CloudBusCallBack;
import org.zstack.core.componentloader.PluginRegistry;
import org.zstack.core.db.Q;
import org.zstack.header.core.workflow.FlowTrigger;
import org.zstack.header.core.workflow.NoRollbackFlow;
import org.zstack.header.message.MessageReply;
import org.zstack.header.storage.primary.CleanupVmInstanceMetadataOnPrimaryStorageMsg;
import org.zstack.header.storage.primary.PrimaryStorageConstant;
import org.zstack.header.storage.primary.PrimaryStorageVO;
import org.zstack.header.storage.primary.PrimaryStorageVO_;
import org.zstack.header.vm.VmInstanceConstant;
import org.zstack.header.vm.VmInstanceSpec;
import org.zstack.header.vm.metadata.VmMetadataPathBuildExtensionPoint;
import org.zstack.header.volume.VolumeInventory;
import org.zstack.utils.Utils;
import org.zstack.utils.logging.CLogger;

import java.util.Map;
import java.util.concurrent.TimeUnit;

@Configurable(preConstruction = true, autowire = Autowire.BY_TYPE)
public class VmExpungeMetadataFlow extends NoRollbackFlow {
private static final CLogger logger = Utils.getLogger(VmExpungeMetadataFlow.class);

@Autowired
private CloudBus bus;
@Autowired
private PluginRegistry pluginRgty;

/**
 * Best-effort deletion of a VM's metadata on the primary storage of its root
 * volume while the VM is being expunged.
 *
 * <p>The flow always continues with {@code trigger.next()}: every precondition
 * that is not met (feature disabled, missing spec, unknown primary storage,
 * no metadata extension, path build failure, missing host) only skips the
 * cleanup. When the cleanup message itself fails, a GC job is submitted to
 * retry it later.
 *
 * @param trigger flow trigger, advanced exactly once on every path
 * @param data    flow data; the {@link VmInstanceSpec} is read from it
 */
@Override
public void run(FlowTrigger trigger, Map data) {
    boolean metadataEnabled = VmGlobalConfig.VM_METADATA_ENABLED.value(Boolean.class);
    if (!metadataEnabled) {
        trigger.next();
        return;
    }

    final VmInstanceSpec vmSpec = (VmInstanceSpec) data.get(VmInstanceConstant.Params.VmInstanceSpec.toString());
    boolean specUsable = vmSpec != null && vmSpec.getVmInventory() != null;
    if (!specUsable) {
        logger.warn("[MetadataExpunge] missing VmInstanceSpec or VmInventory, skip metadata cleanup");
        trigger.next();
        return;
    }

    final String vmUuid = vmSpec.getVmInventory().getUuid();

    // The metadata lives on the primary storage of the root volume.
    final VolumeInventory rootVolume = vmSpec.getVmInventory().getRootVolume();
    final String psUuid = rootVolume == null ? null : rootVolume.getPrimaryStorageUuid();
    if (psUuid == null) {
        logger.debug(String.format("[MetadataExpunge] vm[uuid:%s] root volume has no primaryStorageUuid, " +
                "skipping metadata cleanup", vmUuid));
        trigger.next();
        return;
    }

    String psType = Q.New(PrimaryStorageVO.class)
            .select(PrimaryStorageVO_.type)
            .eq(PrimaryStorageVO_.uuid, psUuid)
            .findValue();
    if (psType == null) {
        logger.warn(String.format("[MetadataExpunge] primary storage[uuid:%s] not found for vm[uuid:%s], " +
                "skip metadata cleanup", psUuid, vmUuid));
        trigger.next();
        return;
    }

    VmMetadataPathBuildExtensionPoint pathBuilder =
            pluginRgty.getExtensionFromMap(psType, VmMetadataPathBuildExtensionPoint.class);
    if (pathBuilder == null) {
        logger.warn(String.format("[MetadataExpunge] no VmMetadataPathBuildExtensionPoint found for ps[uuid:%s, type:%s], " +
                "skip metadata cleanup", psUuid, psType));
        trigger.next();
        return;
    }

    final String metadataPath;
    try {
        metadataPath = pathBuilder.buildVmMetadataPath(psUuid, vmUuid);
    } catch (Exception e) {
        // Cleanup is best effort: a path that cannot be built only skips it.
        logger.warn(String.format("[MetadataExpunge] failed to build metadata path for vm[uuid:%s] on ps[uuid:%s], " +
                "skip metadata cleanup: %s", vmUuid, psUuid, e.getMessage()));
        trigger.next();
        return;
    }

    // A host is only resolved for storage types that need one; the current
    // host is preferred and the last host is the fallback.
    final String hostUuid;
    if (!pathBuilder.requireHostForCleanup()) {
        hostUuid = null;
    } else {
        String currentHostUuid = vmSpec.getVmInventory().getHostUuid();
        hostUuid = currentHostUuid != null ? currentHostUuid : vmSpec.getVmInventory().getLastHostUuid();
        if (hostUuid == null) {
            logger.warn(String.format("[MetadataExpunge] vm[uuid:%s] hostUuid is null, " +
                    "ps[uuid:%s, type:%s] requires host for cleanup, skip without submitting GC",
                    vmUuid, psUuid, psType));
            trigger.next();
            return;
        }
    }

    final String rootVolumeUuid = rootVolume.getUuid();

    CleanupVmInstanceMetadataOnPrimaryStorageMsg cleanupMsg = new CleanupVmInstanceMetadataOnPrimaryStorageMsg();
    cleanupMsg.setPrimaryStorageUuid(psUuid);
    cleanupMsg.setVmInstanceUuid(vmUuid);
    cleanupMsg.setRootVolumeUuid(rootVolumeUuid);
    cleanupMsg.setMetadataPath(metadataPath);
    cleanupMsg.setHostUuid(hostUuid);
    bus.makeTargetServiceIdByResourceUuid(cleanupMsg, PrimaryStorageConstant.SERVICE_ID, psUuid);

    bus.send(cleanupMsg, new CloudBusCallBack(trigger) {
        @Override
        public void run(MessageReply reply) {
            if (!reply.isSuccess()) {
                // Hand the failed cleanup over to a GC job; the flow itself still proceeds.
                logger.warn(String.format("[MetadataExpunge] failed to delete metadata for vm[uuid:%s] on ps[uuid:%s]: %s, " +
                        "submitting GC job for retry", vmUuid, psUuid, reply.getError()));
                submitGC(psUuid, vmUuid, rootVolumeUuid, metadataPath, hostUuid);
            } else {
                logger.info(String.format("[MetadataExpunge] successfully deleted metadata for vm[uuid:%s] on ps[uuid:%s]",
                        vmUuid, psUuid));
            }
            trigger.next();
        }
    });
}

/**
 * Submits a {@link CleanupVmInstanceMetadataOnPrimaryStorageGC} job that
 * retries the metadata cleanup for the given VM.
 *
 * <p>The retry interval is read from
 * {@code VmGlobalConfig.VM_METADATA_CLEANUP_GC_INTERVAL} as hours and handed
 * to the job in seconds.
 *
 * @param psUuid         UUID of the primary storage holding the metadata
 * @param vmUuid         UUID of the VM being expunged
 * @param rootVolumeUuid UUID of the VM's root volume
 * @param metadataPath   metadata path to clean up
 * @param hostUuid       host UUID for the cleanup, may be {@code null}
 */
private void submitGC(String psUuid, String vmUuid, String rootVolumeUuid, String metadataPath, String hostUuid) {
    CleanupVmInstanceMetadataOnPrimaryStorageGC cleanupJob = new CleanupVmInstanceMetadataOnPrimaryStorageGC();
    cleanupJob.NAME = CleanupVmInstanceMetadataOnPrimaryStorageGC.getGCName(vmUuid);
    cleanupJob.vmUuid = vmUuid;
    cleanupJob.primaryStorageUuid = psUuid;
    cleanupJob.hostUuid = hostUuid;
    cleanupJob.rootVolumeUuid = rootVolumeUuid;
    cleanupJob.metadataPath = metadataPath;

    // The global config value is expressed in hours; the job is submitted with seconds.
    long intervalHours = VmGlobalConfig.VM_METADATA_CLEANUP_GC_INTERVAL.value(Long.class);
    long intervalSeconds = TimeUnit.HOURS.toSeconds(intervalHours);
    cleanupJob.deduplicateSubmit(intervalSeconds, TimeUnit.SECONDS);

    String submittedLog = String.format("[MetadataExpunge] submitted GC job [%s] for vm[uuid:%s] on ps[uuid:%s]",
            cleanupJob.NAME, vmUuid, psUuid);
    logger.info(submittedLog);
}
}
57 changes: 57 additions & 0 deletions compute/src/main/java/org/zstack/compute/vm/VmGlobalConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -133,4 +133,61 @@ public class VmGlobalConfig {
@GlobalConfigValidation(validValues = {"None", "AuthenticAMD"})
@BindResourceConfig(value = {VmInstanceVO.class})
public static GlobalConfig VM_CPUID_VENDOR = new GlobalConfig(CATEGORY, "vm.cpuid.vendor");

@GlobalConfigDef(defaultValue = "true", type = Boolean.class, description = "whether reset TPM state after VM clone")
@GlobalConfigValidation(validValues = {"true", "false"})
@BindResourceConfig(value = {VmInstanceVO.class, ClusterVO.class})
public static GlobalConfig RESET_TPM_AFTER_VM_CLONE = new GlobalConfig(CATEGORY, "reset.tpm.after.vm.clone");

@GlobalConfigDef(defaultValue = "false", type = Boolean.class, description = "allowed TPM VM start without KMS")
@GlobalConfigValidation(validValues = {"true", "false"})
public static GlobalConfig ALLOWED_TPM_VM_WITHOUT_KMS = new GlobalConfig(CATEGORY, "allowed.tpm.vm.without.kms");

@GlobalConfigValidation(validValues = {"true", "false"})
public static GlobalConfig VM_METADATA_ENABLED = new GlobalConfig(CATEGORY, "vm.metadata.enabled");

@GlobalConfigValidation()
public static GlobalConfig VM_METADATA_LAST_REFRESH_VERSION = new GlobalConfig(CATEGORY, "vm.metadata.lastRefreshVersion");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 100)
public static GlobalConfig VM_METADATA_FLUSH_CONCURRENCY = new GlobalConfig(CATEGORY, "vm.metadata.flush.concurrency");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 300)
public static GlobalConfig VM_METADATA_FLUSH_POLL_INTERVAL = new GlobalConfig(CATEGORY, "vm.metadata.flush.pollInterval");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 1000)
public static GlobalConfig VM_METADATA_FLUSH_BATCH_SIZE = new GlobalConfig(CATEGORY, "vm.metadata.flush.batchSize");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 168)
public static GlobalConfig VM_METADATA_CLEANUP_GC_INTERVAL = new GlobalConfig(CATEGORY, "vm.metadata.cleanup.gc.interval");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 10)
public static GlobalConfig VM_METADATA_FLUSH_MAX_RETRY = new GlobalConfig(CATEGORY, "vm.metadata.flush.maxRetry");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 120)
public static GlobalConfig VM_METADATA_FLUSH_ZOMBIE_CLAIM_THRESHOLD = new GlobalConfig(CATEGORY, "vm.metadata.flush.zombieClaimThreshold");

@GlobalConfigValidation(numberGreaterThan = 21599, numberLessThan = 172801)
public static GlobalConfig VM_METADATA_MAINTENANCE_CONTENT_DRIFT_INTERVAL = new GlobalConfig(CATEGORY, "vm.metadata.maintenance.contentDriftInterval");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 86400)
public static GlobalConfig VM_METADATA_MAINTENANCE_STALE_RECOVERY_INTERVAL = new GlobalConfig(CATEGORY, "vm.metadata.maintenance.staleRecoveryInterval");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 1000)
public static GlobalConfig VM_METADATA_MAINTENANCE_STALE_RECOVERY_MAX_CYCLES = new GlobalConfig(CATEGORY, "vm.metadata.maintenance.staleRecoveryMaxCycles");

@GlobalConfigValidation(numberGreaterThan = 0)
public static GlobalConfig VM_METADATA_PAYLOAD_REJECT_THRESHOLD = new GlobalConfig(CATEGORY, "vm.metadata.payload.rejectThreshold");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 86400)
public static GlobalConfig VM_METADATA_MAINTENANCE_ORPHAN_CHECK_INTERVAL = new GlobalConfig(CATEGORY, "vm.metadata.maintenance.orphanCheckInterval");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 20)
public static GlobalConfig VM_METADATA_MAINTENANCE_STALE_RECOVERY_BATCH_SIZE = new GlobalConfig(CATEGORY, "vm.metadata.maintenance.staleRecoveryBatchSize");

@GlobalConfigValidation(numberGreaterThan = 9, numberLessThan = 201)
public static GlobalConfig VM_METADATA_MAINTENANCE_CONTENT_DRIFT_BATCH_SIZE = new GlobalConfig(CATEGORY, "vm.metadata.maintenance.contentDriftBatchSize");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 31)
public static GlobalConfig VM_METADATA_MAINTENANCE_CONTENT_DRIFT_BATCH_SLEEP_SEC = new GlobalConfig(CATEGORY, "vm.metadata.maintenance.contentDriftBatchSleepSec");
}
Loading