Skip to content
20 changes: 20 additions & 0 deletions plugin/kvm/src/main/java/org/zstack/kvm/KVMAgentCommands.java
Original file line number Diff line number Diff line change
Expand Up @@ -3587,6 +3587,26 @@ public static class MigrateVmCmd extends AgentCommand implements HasThreadContex
private boolean reload;
@GrayVersion(value = "5.0.0")
private long bandwidth;
@GrayVersion(value = "4.8.38")
private boolean useTls;
@GrayVersion(value = "4.8.38")
private String srcHostManagementIp;

public String getSrcHostManagementIp() {
return srcHostManagementIp;
}

public void setSrcHostManagementIp(String srcHostManagementIp) {
this.srcHostManagementIp = srcHostManagementIp;
}

public boolean isUseTls() {
return useTls;
}

public void setUseTls(boolean useTls) {
this.useTls = useTls;
}

public Integer getDownTime() {
return downTime;
Expand Down
1 change: 1 addition & 0 deletions plugin/kvm/src/main/java/org/zstack/kvm/KVMConstant.java
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ public interface KVMConstant {
String KVM_DELETE_CONSOLE_FIREWALL_PATH = "/vm/console/deletefirewall";
String KVM_UPDATE_HOST_OS_PATH = "/host/updateos";
String KVM_HOST_UPDATE_DEPENDENCY_PATH = "/host/updatedependency";

String HOST_SHUTDOWN = "/host/shutdown";
String HOST_REBOOT = "/host/reboot";
String HOST_UPDATE_SPICE_CHANNEL_CONFIG_PATH = "/host/updateSpiceChannelConfig";
Expand Down
4 changes: 4 additions & 0 deletions plugin/kvm/src/main/java/org/zstack/kvm/KVMGlobalConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,10 @@ public class KVMGlobalConfig {
@BindResourceConfig({HostVO.class, ClusterVO.class})
public static GlobalConfig RECONNECT_HOST_RESTART_LIBVIRTD_SERVICE = new GlobalConfig(CATEGORY, "reconnect.host.restart.libvirtd.service");

@GlobalConfigValidation(validValues = {"true", "false"})
@GlobalConfigDef(defaultValue = "true", type = Boolean.class, description = "enable TLS encryption for libvirt remote connections (migration)")
public static GlobalConfig LIBVIRT_TLS_ENABLED = new GlobalConfig(CATEGORY, "libvirt.tls.enabled");

@GlobalConfigValidation(numberGreaterThan = 0)
public static GlobalConfig KVMAGENT_PHYSICAL_MEMORY_USAGE_ALARM_THRESHOLD = new GlobalConfig(CATEGORY, "kvmagent.physicalmemory.usage.alarm.threshold");

Expand Down
106 changes: 106 additions & 0 deletions plugin/kvm/src/main/java/org/zstack/kvm/KVMHost.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import org.zstack.core.db.SQLBatch;
import org.zstack.core.db.SimpleQuery;
import org.zstack.core.db.SimpleQuery.Op;
import org.zstack.core.jsonlabel.JsonLabel;
import org.zstack.core.thread.*;
import org.zstack.core.timeout.ApiTimeoutManager;
import org.zstack.core.upgrade.UpgradeChecker;
Expand Down Expand Up @@ -124,6 +125,20 @@ public class KVMHost extends HostBase implements Host {
protected static OperationChecker allowedOperations = new OperationChecker(true);
protected static OperationChecker skipOperations = new OperationChecker(true);

public static Set<String> parseSanIps(String sanOutput) {
Set<String> sanIps = new HashSet<>();
if (sanOutput == null || sanOutput.isEmpty()) {
return sanIps;
}
for (String line : sanOutput.split(",|\n")) {
String trimmed = line.trim();
if (trimmed.startsWith("IP Address:")) {
sanIps.add(trimmed.substring("IP Address:".length()).trim());
}
}
return sanIps;
}

@Autowired
@Qualifier("KVMHostFactory")
protected KVMHostFactory factory;
Expand Down Expand Up @@ -3063,6 +3078,7 @@ public void run(final FlowTrigger trigger, Map data) {
cmd.setDestHostIp(dstHostMigrateIp);
cmd.setSrcHostIp(srcHostMigrateIp);
cmd.setDestHostManagementIp(dstHostMnIp);
cmd.setSrcHostManagementIp(srcHostMnIp);
cmd.setMigrateFromDestination(migrateFromDestination);
cmd.setStorageMigrationPolicy(storageMigrationPolicy == null ? null : storageMigrationPolicy.toString());
cmd.setVmUuid(vmUuid);
Expand All @@ -3074,6 +3090,8 @@ public void run(final FlowTrigger trigger, Map data) {
cmd.setDownTime(s.downTime);
cmd.setBandwidth(s.bandwidth);
cmd.setNics(nicTos);
cmd.setUseTls(KVMGlobalConfig.LIBVIRT_TLS_ENABLED.value(Boolean.class)
&& rcf.getResourceConfigValue(KVMGlobalConfig.RECONNECT_HOST_RESTART_LIBVIRTD_SERVICE, self.getUuid(), Boolean.class));
Comment on lines +3093 to +3094

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

同时校验源端和目的端的 TLS 就绪条件。

Line 3094 这里只读取了 self.getUuid() 上的 RECONNECT_HOST_RESTART_LIBVIRTD_SERVICE,但同一个方法里实际发起迁移的宿主机会在 srcHostUuid / dstHostUuid 之间切换,而且证书部署与 libvirtd 重启策略也是按 host 维度在 connectHook() 里生效的。这样一来,只要两端配置不一致,就可能在对端尚未具备 TLS 迁移条件时仍把 useTls 置为 true,直接把跨主机迁移打到 TLS 路径上并失败。这里至少要同时检查 srcHostUuiddstHostUuid,或者复用一个统一的 “host TLS ready” 判定。

💡 建议修改
-                        cmd.setUseTls(KVMGlobalConfig.LIBVIRT_TLS_ENABLED.value(Boolean.class)
-                                && rcf.getResourceConfigValue(KVMGlobalConfig.RECONNECT_HOST_RESTART_LIBVIRTD_SERVICE, self.getUuid(), Boolean.class));
+                        boolean srcTlsReady = rcf.getResourceConfigValue(
+                                KVMGlobalConfig.RECONNECT_HOST_RESTART_LIBVIRTD_SERVICE,
+                                srcHostUuid, Boolean.class);
+                        boolean dstTlsReady = rcf.getResourceConfigValue(
+                                KVMGlobalConfig.RECONNECT_HOST_RESTART_LIBVIRTD_SERVICE,
+                                dstHostUuid, Boolean.class);
+                        cmd.setUseTls(KVMGlobalConfig.LIBVIRT_TLS_ENABLED.value(Boolean.class)
+                                && srcTlsReady
+                                && dstTlsReady);
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@plugin/kvm/src/main/java/org/zstack/kvm/KVMHost.java` around lines 3093 -
3094, The call that sets cmd.setUseTls currently only checks
RECONNECT_HOST_RESTART_LIBVIRTD_SERVICE for self.getUuid(), which can enable TLS
even if the other peer isn't ready; update the logic so useTls is true only when
both endpoints are TLS-ready (e.g., check KVMGlobalConfig.LIBVIRT_TLS_ENABLED
plus the per-host RECONNECT_HOST_RESTART_LIBVIRTD_SERVICE for both srcHostUuid
and dstHostUuid), or factor out a hostTlsReady(hostUuid) helper (used by
connectHook()) and call it for both srcHostUuid and dstHostUuid before setting
cmd.setUseTls.


if (s.diskMigrationMap != null) {
Map<String, VolumeTO> diskMigrationMap = new HashMap<>();
Expand Down Expand Up @@ -5527,6 +5545,72 @@ public void run(FlowTrigger trigger, Map data) {
}
});

flow(new NoRollbackFlow() {
String __name__ = "check-tls-certs-if-needed";

@Override
public boolean skip(Map data) {
// ZSTAC-84446: run detection whenever TLS is enabled so check
// and first-deploy share the same IP source.
return CoreGlobalProperty.UNIT_TEST_ON
|| !KVMGlobalConfig.LIBVIRT_TLS_ENABLED.value(Boolean.class);
}

@Override
public void run(FlowTrigger trigger, Map data) {
// ZSTAC-84446: detection is best-effort. SSH failures must NOT
// break reconnect; on error we skip and let the deploy step
// fall back to mgmtIp + EXTRA_IPS.
try {
String managementIp = getSelf().getManagementIp();

SshShell sshShell = new SshShell();
sshShell.setHostname(managementIp);
sshShell.setUsername(getSelf().getUsername());
sshShell.setPassword(getSelf().getPassword());
sshShell.setPort(getSelf().getPort());

// Same logic as zstack-utility host_plugin.fact() so MN's
// expectation matches what the host itself reports.
String certIpList = KVMHostUtils.collectHostIps(
sshShell, self.getUuid(), managementIp);
List<String> allIps = new ArrayList<>(Arrays.asList(certIpList.split(",")));
// Save detected IPs so apply-ansible-playbook can union with
// EXTRA_IPS without running a second SSH.
data.put("TLS_DETECTED_IPS", certIpList);

SshResult sanResult = sshShell.runCommand(
"openssl x509 -in /etc/pki/libvirt/servercert.pem -noout -ext subjectAltName 2>/dev/null");

boolean needDeploy = false;
if (sanResult.isSshFailure() || sanResult.getReturnCode() != 0
|| sanResult.getStdout() == null || sanResult.getStdout().trim().isEmpty()) {
logger.info(String.format("TLS cert not found or unreadable on host[uuid:%s], need deploy", self.getUuid()));
needDeploy = true;
} else {
Set<String> sanIps = parseSanIps(sanResult.getStdout());
for (String ip : allIps) {
if (!sanIps.contains(ip)) {
logger.info(String.format("TLS cert SAN missing IP %s on host[uuid:%s], need deploy", ip, self.getUuid()));
needDeploy = true;
break;
}
}
}

if (needDeploy) {
data.put("NEED_DEPLOY_TLS_CERT", true);
}
} catch (Exception e) {
logger.warn(String.format(
"TLS cert detection failed on host[uuid:%s], continue connect flow: %s",
self.getUuid(), e.getMessage()), e);
}

trigger.next();
}
});

flow(new NoRollbackFlow() {
String __name__ = "apply-ansible-playbook";

Expand Down Expand Up @@ -5665,6 +5749,27 @@ public void run(final FlowTrigger trigger, Map data) {
deployArguments.setSkipPackages(info.getSkipPackages());
deployArguments.setUpdatePackages(String.valueOf(CoreGlobalProperty.UPDATE_PKG_WHEN_CONNECT));

String managementIp = getSelf().getManagementIp();
String detectedIps = (String) data.get("TLS_DETECTED_IPS");
String tlsCertIps = KVMHostUtils.unionTlsCertIps(
self.getUuid(), managementIp, detectedIps);
deployArguments.setTlsCertIps(tlsCertIps);

// ZSTAC-84446: force ansible re-run only when policy allows;
// see KVMHostUtils#shouldForceTlsRedeploy.
Boolean needDeployTlsCert = (Boolean) data.get("NEED_DEPLOY_TLS_CERT");
boolean allowRestart = rcf.getResourceConfigValue(
KVMGlobalConfig.RECONNECT_HOST_RESTART_LIBVIRTD_SERVICE,
self.getUuid(), Boolean.class);
if (KVMHostUtils.shouldForceTlsRedeploy(
Boolean.TRUE.equals(needDeployTlsCert), allowRestart, info.isNewAdded())) {
runner.setForceRun(true);
deployArguments.setRestartLibvirtd("true");
} else if (Boolean.TRUE.equals(needDeployTlsCert)) {
logger.info(String.format("TLS cert needs deploy on host[uuid:%s], skip " +
"force-run to keep kvmagent PID stable", self.getUuid()));
}

if (deployArguments.isForceRun()) {
runner.setForceRun(true);
}
Expand Down Expand Up @@ -5850,6 +5955,7 @@ public void fail(ErrorCode errorCode) {

flow(createCollectHostFactsFlow(info));


if (info.isNewAdded()) {
flow(new NoRollbackFlow() {
String __name__ = "check-qemu-libvirt-version";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ public class KVMHostDeployArguments extends SyncTimeRequestedDeployArguments {
private String restartLibvirtd;
@SerializedName("extra_packages")
private String extraPackages;
@SerializedName("tls_cert_ips")
private String tlsCertIps;

private transient boolean forceRun = false;

Expand Down Expand Up @@ -128,6 +130,14 @@ public void setExtraPackages(String extraPackages) {
this.extraPackages = extraPackages;
}

public String getTlsCertIps() {
return tlsCertIps;
}

public void setTlsCertIps(String tlsCertIps) {
this.tlsCertIps = tlsCertIps;
}

public String getEnableSpiceTls() {
return enableSpiceTls;
}
Expand Down
56 changes: 56 additions & 0 deletions plugin/kvm/src/main/java/org/zstack/kvm/KVMHostFactory.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.zstack.kvm;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.util.UriComponentsBuilder;
Expand All @@ -10,6 +11,8 @@
import org.zstack.header.errorcode.ErrorCode;
import org.zstack.core.config.*;
import org.zstack.core.config.schema.GuestOsCharacter;
import org.zstack.core.jsonlabel.JsonLabel;
import org.zstack.core.jsonlabel.JsonLabelInventory;
import org.zstack.header.tag.SystemTagInventory;
import org.zstack.header.tag.SystemTagLifeCycleListener;
import org.zstack.header.tag.SystemTagValidator;
Expand Down Expand Up @@ -52,6 +55,7 @@
import org.zstack.kvm.KVMAgentCommands.TransmitVmOperationToMnCmd;
import org.zstack.utils.CollectionUtils;
import org.zstack.utils.IpRangeSet;
import org.zstack.utils.ShellUtils;
import org.zstack.utils.SizeUtils;
import org.zstack.utils.Utils;
import org.zstack.utils.data.SizeUnit;
Expand Down Expand Up @@ -90,6 +94,10 @@ public class KVMHostFactory extends AbstractService implements HypervisorFactory
ManagementNodeReadyExtensionPoint, MaxDataVolumeNumberExtensionPoint, HypervisorMessageFactory {
private static final CLogger logger = Utils.getLogger(KVMHostFactory.class);

private static final String LIBVIRT_TLS_CA_KEY = "libvirtTLSCA";
private static final String LIBVIRT_TLS_PRIVATE_KEY = "libvirtTLSPrivateKey";
private static final String CA_DIR = "/var/lib/zstack/pki/CA";

public static final HypervisorType hypervisorType = new HypervisorType(KVMConstant.KVM_HYPERVISOR_TYPE);
public static final VolumeFormat QCOW2_FORMAT = new VolumeFormat(VolumeConstant.VOLUME_FORMAT_QCOW2, hypervisorType);
public static final VolumeFormat RAW_FORMAT = new VolumeFormat(VolumeConstant.VOLUME_FORMAT_RAW, hypervisorType);
Expand Down Expand Up @@ -366,8 +374,56 @@ private void processKvmagentPhysicalMemUsageAbnormal(KVMAgentCommands.HostProces
bus.send(restartKvmAgentMsg);
}

private void initLibvirtTlsCA() {
if (CoreGlobalProperty.UNIT_TEST_ON) {
return;
}

try {
ShellUtils.run(String.format("mkdir -p %s", CA_DIR));
ShellUtils.run("chown -R zstack:zstack /var/lib/zstack/pki");

File caFile = new File(CA_DIR + "/cacert.pem");
File keyFile = new File(CA_DIR + "/cakey.pem");

// Local CA missing — generate with openssl
// NOTE: ShellUtils.run() prepends sudo only to the first command in &&-chains,
// so each command must be a separate call.
if (!caFile.exists() || !keyFile.exists()) {
ShellUtils.run(String.format(
"openssl genrsa -out %s/cakey.pem 4096", CA_DIR));
ShellUtils.run(String.format(
"openssl req -new -x509 -days 3650 -key %s/cakey.pem " +
"-out %s/cacert.pem -subj '/O=ZStack/CN=ZStack Libvirt CA'",
CA_DIR, CA_DIR));
ShellUtils.run(String.format("chown zstack:zstack %s/cakey.pem %s/cacert.pem",
CA_DIR, CA_DIR));
ShellUtils.run(String.format("chmod 600 %s/cakey.pem", CA_DIR));
ShellUtils.run(String.format("chmod 644 %s/cacert.pem", CA_DIR));
}

String ca = FileUtils.readFileToString(caFile).trim();
String key = FileUtils.readFileToString(keyFile).trim();

// createIfAbsent: DB has no record → write; DB has record → return DB value
JsonLabelInventory caInv = new JsonLabel().createIfAbsent(LIBVIRT_TLS_CA_KEY, ca);
JsonLabelInventory keyInv = new JsonLabel().createIfAbsent(LIBVIRT_TLS_PRIVATE_KEY, key);
Comment on lines +409 to +410

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | 🏗️ Heavy lift

避免把 CA 证书和私钥拆成两个独立的 createIfAbsent() 写入。

Line 409-410 在 HA 并发启动下有竞态:两个 MN 可以先各自生成不同的本地 CA,然后分别抢先写入 libvirtTLSCAlibvirtTLSPrivateKey,最终把不匹配的证书/私钥组合持久化下来,后续主机证书签发会直接失效。这里需要把证书和私钥作为一个原子单元持久化,或者在初始化阶段加全局锁/事务,避免跨 key 交叉写入。

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@plugin/kvm/src/main/java/org/zstack/kvm/KVMHostFactory.java` around lines 409
- 410, The current code in KVMHostFactory uses two separate
JsonLabel.createIfAbsent calls for LIBVIRT_TLS_CA_KEY and
LIBVIRT_TLS_PRIVATE_KEY (variables caInv and keyInv), which can race under
concurrent HA starts; fix by persisting the CA and private key atomically
instead of two independent writes: either serialize both values into a single
JSON object and call one createIfAbsent for a single label (store CA+key
together), or wrap the pair of createIfAbsent operations in a
global/cluster-wide lock or a DB transaction so they cannot interleave; update
references to JsonLabel.createIfAbsent, LIBVIRT_TLS_CA_KEY,
LIBVIRT_TLS_PRIVATE_KEY and the caInv/keyInv creation site in KVMHostFactory
accordingly.


// Use DB as source of truth — overwrite local files (HA: MN2 uses MN1's CA from DB)
FileUtils.writeStringToFile(caFile, caInv.getLabelValue());
FileUtils.writeStringToFile(keyFile, keyInv.getLabelValue());
ShellUtils.run(String.format("chmod 600 %s/cakey.pem", CA_DIR));
ShellUtils.run(String.format("chmod 644 %s/cacert.pem", CA_DIR));

logger.info("Libvirt TLS CA initialized and persisted to database");
} catch (Exception e) {
logger.warn("Failed to initialize libvirt TLS CA", e);
Comment on lines +419 to +420

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

TLS CA 初始化失败时不要只记日志后继续启动。

当前 kvm.libvirt.tls.enabled 默认开启,这里只打 warning 会让管理节点在没有可用 CA 的情况下继续启动,直到后续 host reconnect / migrate 才以更隐蔽的方式失败。至少在 TLS 启用时应 fail fast,或者显式关闭该能力并给出可见告警。

💡 建议修改
         } catch (Exception e) {
-            logger.warn("Failed to initialize libvirt TLS CA", e);
+            if (KVMGlobalConfig.LIBVIRT_TLS_ENABLED.value(Boolean.class)) {
+                throw new CloudRuntimeException("Failed to initialize libvirt TLS CA", e);
+            }
+            logger.warn("Failed to initialize libvirt TLS CA while TLS is disabled", e);
         }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
} catch (Exception e) {
logger.warn("Failed to initialize libvirt TLS CA", e);
} catch (Exception e) {
if (KVMGlobalConfig.LIBVIRT_TLS_ENABLED.value(Boolean.class)) {
throw new CloudRuntimeException("Failed to initialize libvirt TLS CA", e);
}
logger.warn("Failed to initialize libvirt TLS CA while TLS is disabled", e);
}
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@plugin/kvm/src/main/java/org/zstack/kvm/KVMHostFactory.java` around lines 419
- 420, The catch in KVMHostFactory that currently swallows exceptions when
initializing the libvirt TLS CA (logger.warn("Failed to initialize libvirt TLS
CA", e)) must not allow the system to keep starting with TLS enabled; update the
handler in the method where libvirt TLS CA is initialized (KVMHostFactory
initialization block) to either (a) fail fast by rethrowing a RuntimeException
(or call System.exit/Service startup abort) when kvm.libvirt.tls.enabled is
true, or (b) explicitly disable the TLS feature by setting the internal
tlsEnabled flag to false and emit a clear ERROR-level log and a visible
alert/metric; ensure you reference the kvm.libvirt.tls.enabled config, the tls
initialization code path in KVMHostFactory, and any internal field (e.g.,
tlsEnabled / libvirtTlsCaInitialized) so the change prevents silent startup with
TLS claimed enabled.

}
}

@Override
public boolean start() {
initLibvirtTlsCA();
deployAnsibleModule();
populateExtensions();
configKVMDeviceType();
Expand Down
Loading