From 621783496b18ab2d8f0f7332049f0ba5324313e0 Mon Sep 17 00:00:00 2001 From: JackieTien97 Date: Tue, 2 Jun 2026 16:18:33 +0800 Subject: [PATCH 01/17] Add metadata-lease self-fencing foundation for CN->DN broadcast HA Table-model DDL and ~30 other ConfigNode->DataNode metadata broadcasts fail when any single DataNode is unreachable, because the ConfigNode requires every registered DataNode to acknowledge a cache invalidation before committing (to stop a partitioned DataNode from serving stale CN-pushed caches and generating dirty data). This adds the test-covered foundation for a metadata-lease/fencing mechanism that lets such operations tolerate DataNode failures without sacrificing correctness. DataNode side: MetadataLeaseManager tracks the lease via the ConfigNode heartbeat (monotonic clock); isFenced() when no heartbeat within metadata_lease_fence_ms (default 20s); fires recovery listeners when a heartbeat arrives after a fence. DataNodeTableCache fail-closed (retryable error) in getTableInWrite/getTable while fenced, and invalidateAll() registered as a recovery listener so a recovered DataNode re-fetches fresh schema. getDataNodeHeartBeat records the heartbeat; a metadata_lease_heartbeat_age_ms gauge is exposed. ConfigNode side: DataNodeContactTracker records, per DataNode, the time the ConfigNode last received a successful heartbeat response, stamped on the ConfigNode clock only on success and never advanced by onError (recorded in DataNodeHeartbeatHandler.onComplete) - the sound signal for deciding whether an unreachable DataNode has self-fenced. MetadataBroadcastVerdict is the pure decision logic (capability checked first; FENCED_SAFE only via hbAge>=T_proceed or retired-from-routing; no additive fast-path). No ConfigNode procedure control flow is changed yet (the verdict is not wired into procedures); the DataNode fail-closed is active only while a DataNode is actually fenced. Config: metadata_lease_fence_ms in CommonConfig. 20 new unit tests. 
--- .../heartbeat/DataNodeHeartbeatHandler.java | 6 + .../manager/lease/DataNodeContactTracker.java | 100 ++++++++++++++ .../lease/MetadataBroadcastVerdict.java | 113 ++++++++++++++++ .../lease/DataNodeContactTrackerTest.java | 73 ++++++++++ .../lease/MetadataBroadcastVerdictTest.java | 116 ++++++++++++++++ .../impl/DataNodeInternalRPCServiceImpl.java | 5 + .../lease/MetadataLeaseManager.java | 127 ++++++++++++++++++ .../table/DataNodeTableCache.java | 44 +++++- .../metrics/DataNodeMetricsHelper.java | 3 + .../service/metrics/MetadataLeaseMetrics.java | 50 +++++++ .../lease/MetadataLeaseManagerTest.java | 110 +++++++++++++++ .../table/DataNodeTableCacheLeaseTest.java | 63 +++++++++ .../conf/iotdb-system.properties.template | 10 ++ .../iotdb/commons/conf/CommonConfig.java | 15 +++ .../iotdb/commons/conf/CommonDescriptor.java | 5 + 15 files changed, 839 insertions(+), 1 deletion(-) create mode 100644 iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTracker.java create mode 100644 iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/MetadataBroadcastVerdict.java create mode 100644 iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTrackerTest.java create mode 100644 iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/MetadataBroadcastVerdictTest.java create mode 100644 iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseManager.java create mode 100644 iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/MetadataLeaseMetrics.java create mode 100644 iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseManagerTest.java create mode 100644 iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCacheLeaseTest.java diff --git 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/heartbeat/DataNodeHeartbeatHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/heartbeat/DataNodeHeartbeatHandler.java index e7a31b1dc73eb..65f4940769865 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/heartbeat/DataNodeHeartbeatHandler.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/heartbeat/DataNodeHeartbeatHandler.java @@ -26,6 +26,7 @@ import org.apache.iotdb.commons.cluster.RegionStatus; import org.apache.iotdb.confignode.conf.ConfigNodeConfig; import org.apache.iotdb.confignode.conf.ConfigNodeDescriptor; +import org.apache.iotdb.confignode.manager.lease.DataNodeContactTracker; import org.apache.iotdb.confignode.manager.load.LoadManager; import org.apache.iotdb.confignode.manager.load.cache.consensus.ConsensusGroupHeartbeatSample; import org.apache.iotdb.confignode.manager.load.cache.node.NodeHeartbeatSample; @@ -82,6 +83,11 @@ public DataNodeHeartbeatHandler( @Override public void onComplete(TDataNodeHeartbeatResp heartbeatResp) { + // A successful response confirms ConfigNode->DataNode contact; stamp it on the ConfigNode clock + // for the metadata-lease verdict. Kept separate from the load-cache samples (which record the + // echoed send-time) and deliberately not touched in onError, so failures never advance it. 
+ DataNodeContactTracker.getInstance().recordSuccessfulResponse(nodeId); + // Update NodeCache loadManager .getLoadCache() diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTracker.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTracker.java new file mode 100644 index 0000000000000..08393053a79d6 --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTracker.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.manager.lease; + +import java.util.Collection; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.function.LongSupplier; + +/** + * Tracks, per DataNode, the time the ConfigNode last received a successful heartbeat + * response from it, stamped with the ConfigNode's own monotonic clock at receipt. + * + *

<p>This is the sound signal for deciding whether an unreachable DataNode has self-fenced (used by + * the metadata-lease verdict). It must be kept separate from the load-cache {@code + * NodeHeartbeatSample}s, which (a) record the heartbeat send time echoed back by the + * DataNode — not response receipt — and (b) are advanced to the current time by failure ({@code + * onError}) samples. Either property would break the verdict: send-time can make the ConfigNode + * believe a DataNode is fenced while it just renewed from a delayed heartbeat, and failure-advanced + * time would keep the age from ever growing. + * + * <p>
By construction there is no method that advances the time on failure: only {@link + * #recordSuccessfulResponse(int)} updates it. A never-contacted DataNode reads as age 0 (treated as + * just-contacted) so the verdict never wrongly declares an unknown DataNode fenced. + */ +public class DataNodeContactTracker { + + private final LongSupplier nanoClock; + + private final Map lastSuccessfulResponseNanos = new ConcurrentHashMap<>(); + + private DataNodeContactTracker() { + this(System::nanoTime); + } + + DataNodeContactTracker(final LongSupplier nanoClock) { + this.nanoClock = nanoClock; + } + + /** Record that a successful heartbeat response from the DataNode was just received. */ + public void recordSuccessfulResponse(final int dataNodeId) { + lastSuccessfulResponseNanos.put(dataNodeId, nanoClock.getAsLong()); + } + + /** + * Milliseconds since the ConfigNode last received a successful heartbeat response from the + * DataNode. Returns 0 (treated as just-contacted) if never recorded — conservative, so an unknown + * DataNode is never declared fenced. + */ + public long getMillisSinceLastSuccessfulResponse(final int dataNodeId) { + final Long lastNanos = lastSuccessfulResponseNanos.get(dataNodeId); + if (lastNanos == null) { + return 0L; + } + final long elapsedNanos = nanoClock.getAsLong() - lastNanos; + return elapsedNanos > 0 ? elapsedNanos / 1_000_000L : 0L; + } + + /** + * On acquiring leadership, treat all currently-registered DataNodes as just-contacted, so a new + * leader does not declare a DataNode fenced based on absent/stale history. 
+ */ + public void onLeadershipAcquired(final Collection registeredDataNodeIds) { + final long now = nanoClock.getAsLong(); + for (final Integer dataNodeId : registeredDataNodeIds) { + lastSuccessfulResponseNanos.put(dataNodeId, now); + } + } + + public void removeDataNode(final int dataNodeId) { + lastSuccessfulResponseNanos.remove(dataNodeId); + } + + public static DataNodeContactTracker getInstance() { + return DataNodeContactTrackerHolder.INSTANCE; + } + + private static final class DataNodeContactTrackerHolder { + private static final DataNodeContactTracker INSTANCE = new DataNodeContactTracker(); + + private DataNodeContactTrackerHolder() {} + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/MetadataBroadcastVerdict.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/MetadataBroadcastVerdict.java new file mode 100644 index 0000000000000..07259b7bd6ef7 --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/MetadataBroadcastVerdict.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.confignode.manager.lease; + +import java.util.Collection; + +/** + * Pure decision logic for the ConfigNode's metadata-broadcast verdict: after broadcasting a + * cache-invalidation/update to all DataNodes, decide whether it is safe to commit the metadata + * change given which DataNodes acknowledged and the state of those that did not. + * + *

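<p>Rules (design v5): + * + * <ul> + * <li>A DataNode that acknowledged the broadcast is {@code ACKED}. + * <li>An unacknowledged DataNode that is retired from routing, or that explicitly acknowledged a + * fence/shutdown, is {@code SAFE_GONE}, regardless of capability or heartbeat age. + * <li>Capability is checked before any timing test: an unacknowledged DataNode that does not + * support self-fencing is {@code UNSAFE}, no matter how long it has been silent. + * <li>A fencing-capable, unacknowledged DataNode is {@code FENCED_SAFE} once its heartbeat age is + * at least {@code tProceedMs}; until then it is {@code UNSAFE}. + * </ul> + * + * <p>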
The overall verdict is {@code PROCEED} when no DataNode is {@code UNSAFE}; otherwise {@code + * WAIT} until the wait budget is exhausted, then {@code FAIL}. There is no "additive fast-path": + * every Tier-A operation follows the same rule (so a Running-but-unacked DataNode is never + * skipped). + */ +public final class MetadataBroadcastVerdict { + + public enum Verdict { + PROCEED, + WAIT, + FAIL + } + + public enum Disposition { + ACKED, + SAFE_GONE, + FENCED_SAFE, + UNSAFE + } + + private MetadataBroadcastVerdict() {} + + /** Per-DataNode inputs for one broadcast round. */ + public static final class DataNodeState { + private final boolean acked; + private final boolean retiredOrFenceAcked; + private final boolean supportsFencing; + private final long hbAgeMs; + + public DataNodeState( + final boolean acked, + final boolean retiredOrFenceAcked, + final boolean supportsFencing, + final long hbAgeMs) { + this.acked = acked; + this.retiredOrFenceAcked = retiredOrFenceAcked; + this.supportsFencing = supportsFencing; + this.hbAgeMs = hbAgeMs; + } + } + + public static Disposition classify(final DataNodeState state, final long tProceedMs) { + if (state.acked) { + return Disposition.ACKED; + } + if (state.retiredOrFenceAcked) { + return Disposition.SAFE_GONE; + } + if (!state.supportsFencing) { + // Capability is checked before any timing test: a DataNode that cannot self-fence can never + // be assumed fenced, no matter how long it has been silent. + return Disposition.UNSAFE; + } + if (state.hbAgeMs >= tProceedMs) { + return Disposition.FENCED_SAFE; + } + return Disposition.UNSAFE; + } + + public static Verdict decide( + final Collection states, + final long tProceedMs, + final boolean waitBudgetExhausted) { + for (final DataNodeState state : states) { + if (classify(state, tProceedMs) == Disposition.UNSAFE) { + return waitBudgetExhausted ? 
Verdict.FAIL : Verdict.WAIT; + } + } + return Verdict.PROCEED; + } +} diff --git a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTrackerTest.java b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTrackerTest.java new file mode 100644 index 0000000000000..80b4498d13bf2 --- /dev/null +++ b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTrackerTest.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.confignode.manager.lease; + +import org.junit.Test; + +import java.util.Arrays; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; + +import static org.junit.Assert.assertEquals; + +public class DataNodeContactTrackerTest { + + private static final int DN = 3; + + @Test + public void reportsMillisSinceLastSuccessfulResponse() { + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final DataNodeContactTracker tracker = new DataNodeContactTracker(nowNanos::get); + tracker.recordSuccessfulResponse(DN); + nowNanos.addAndGet(TimeUnit.MILLISECONDS.toNanos(1234)); + assertEquals(1234L, tracker.getMillisSinceLastSuccessfulResponse(DN)); + } + + @Test + public void ageKeepsGrowingWithoutSuccessfulResponse() { + // Failures must NOT refresh the contact time. This is enforced structurally: only + // recordSuccessfulResponse updates it, so with no further success the age keeps growing. + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final DataNodeContactTracker tracker = new DataNodeContactTracker(nowNanos::get); + tracker.recordSuccessfulResponse(DN); + nowNanos.addAndGet(TimeUnit.SECONDS.toNanos(30)); + assertEquals(30_000L, tracker.getMillisSinceLastSuccessfulResponse(DN)); + } + + @Test + public void leadershipAcquisitionResetsContactToNow() { + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final DataNodeContactTracker tracker = new DataNodeContactTracker(nowNanos::get); + tracker.recordSuccessfulResponse(DN); + nowNanos.addAndGet(TimeUnit.SECONDS.toNanos(30)); // would otherwise look stale + tracker.onLeadershipAcquired(Arrays.asList(DN, 4)); + assertEquals(0L, tracker.getMillisSinceLastSuccessfulResponse(DN)); + assertEquals(0L, tracker.getMillisSinceLastSuccessfulResponse(4)); + } + + @Test + public void neverContactedReadsAsZeroSoVerdictTreatsAsRecent() { + // Conservative: an unknown DataNode must NOT look fenced 
(else the verdict would wrongly + // proceed past it), so its age reads as 0 until a real success/expiry is observed. + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final DataNodeContactTracker tracker = new DataNodeContactTracker(nowNanos::get); + assertEquals(0L, tracker.getMillisSinceLastSuccessfulResponse(999)); + } +} diff --git a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/MetadataBroadcastVerdictTest.java b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/MetadataBroadcastVerdictTest.java new file mode 100644 index 0000000000000..32248d56c9886 --- /dev/null +++ b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/MetadataBroadcastVerdictTest.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.confignode.manager.lease; + +import org.apache.iotdb.confignode.manager.lease.MetadataBroadcastVerdict.DataNodeState; +import org.apache.iotdb.confignode.manager.lease.MetadataBroadcastVerdict.Disposition; +import org.apache.iotdb.confignode.manager.lease.MetadataBroadcastVerdict.Verdict; + +import org.junit.Test; + +import java.util.Arrays; +import java.util.Collections; + +import static org.junit.Assert.assertEquals; + +public class MetadataBroadcastVerdictTest { + + private static final long T_PROCEED_MS = 25_000L; + + // acked + private static DataNodeState acked() { + return new DataNodeState(true, false, true, 0L); + } + + // capable, unacked, out of contact >= T_proceed -> provably fenced + private static DataNodeState fencedSafe() { + return new DataNodeState(false, false, true, T_PROCEED_MS + 1); + } + + // capable, unacked, heartbeat still fresh -> still possibly serving + private static DataNodeState freshUnacked() { + return new DataNodeState(false, false, true, 1_000L); + } + + // ---- classify ---- + + @Test + public void incapableDataNodeIsNeverFencedSafe() { + // Review point 4: capability is checked before any timing test; an old DN that cannot + // self-fence must be UNSAFE even if it has been silent far longer than T_proceed. + final DataNodeState oldDnLongSilent = new DataNodeState(false, false, false, T_PROCEED_MS * 10); + assertEquals( + Disposition.UNSAFE, MetadataBroadcastVerdict.classify(oldDnLongSilent, T_PROCEED_MS)); + } + + @Test + public void retiredFromRoutingIsSafeGone() { + // Review point 5: only "removed from routing / explicit fence-shutdown ack" is safe-gone, + // regardless of capability or how recently it was seen. 
+ final DataNodeState retired = new DataNodeState(false, true, false, 0L); + assertEquals(Disposition.SAFE_GONE, MetadataBroadcastVerdict.classify(retired, T_PROCEED_MS)); + } + + @Test + public void removingButStillRoutableIsUnsafe() { + // Review point 5: a node that is merely Removing (still routable, not retired) with a fresh + // heartbeat must NOT be treated as safe. + assertEquals( + Disposition.UNSAFE, MetadataBroadcastVerdict.classify(freshUnacked(), T_PROCEED_MS)); + } + + @Test + public void capableAndLongSilentIsFencedSafe() { + assertEquals( + Disposition.FENCED_SAFE, MetadataBroadcastVerdict.classify(fencedSafe(), T_PROCEED_MS)); + } + + // ---- decide ---- + + @Test + public void allAckedProceeds() { + assertEquals( + Verdict.PROCEED, + MetadataBroadcastVerdict.decide(Arrays.asList(acked(), acked()), T_PROCEED_MS, false)); + } + + @Test + public void unackedButAllFencedSafeProceeds() { + assertEquals( + Verdict.PROCEED, + MetadataBroadcastVerdict.decide(Arrays.asList(acked(), fencedSafe()), T_PROCEED_MS, false)); + } + + @Test + public void freshUnackedWaitsWhileBudgetRemains() { + assertEquals( + Verdict.WAIT, + MetadataBroadcastVerdict.decide( + Collections.singletonList(freshUnacked()), T_PROCEED_MS, false)); + } + + @Test + public void freshUnackedFailsWhenWaitBudgetExhausted() { + assertEquals( + Verdict.FAIL, + MetadataBroadcastVerdict.decide( + Collections.singletonList(freshUnacked()), T_PROCEED_MS, true)); + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index e5c996405649d..1d84d22b2b690 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -189,6 +189,7 @@ 
import org.apache.iotdb.db.queryengine.plan.statement.crud.InsertRowStatement; import org.apache.iotdb.db.queryengine.plan.statement.crud.QueryStatement; import org.apache.iotdb.db.schemaengine.SchemaEngine; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; import org.apache.iotdb.db.schemaengine.schemaregion.ISchemaRegion; import org.apache.iotdb.db.schemaengine.schemaregion.read.resp.info.ITimeSeriesSchemaInfo; import org.apache.iotdb.db.schemaengine.schemaregion.read.resp.reader.ISchemaReader; @@ -2226,6 +2227,10 @@ private PathPatternTree filterPathPatternTree(PathPatternTree patternTree, Strin public TDataNodeHeartbeatResp getDataNodeHeartBeat(TDataNodeHeartbeatReq req) throws TException { TDataNodeHeartbeatResp resp = new TDataNodeHeartbeatResp(); + // Renew the metadata lease: receiving a ConfigNode heartbeat means this DataNode is still in + // contact with the cluster and may keep trusting its ConfigNode-pushed metadata caches. + MetadataLeaseManager.getInstance().recordConfigNodeHeartbeat(); + // Judging leader if necessary if (req.isNeedJudgeLeader()) { // Always get logical clock before judging leader diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseManager.java new file mode 100644 index 0000000000000..c5a1feabf8805 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseManager.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.schemaengine.lease; + +import org.apache.iotdb.commons.conf.CommonDescriptor; +import org.apache.iotdb.commons.utils.TestOnly; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.function.LongSupplier; + +/** + * Tracks the DataNode's "metadata lease" with the ConfigNode. The ConfigNode periodically sends + * heartbeats to the DataNode; while these arrive the DataNode may trust its ConfigNode-pushed + * metadata caches (table/tree schema, device attributes, templates, TTL, permissions, ...). If no + * heartbeat is received within {@code metadata_lease_fence_ms} ({@code T_fence}), the lease has + * expired and the DataNode must self-fence: stop trusting those caches so a partitioned DataNode + * cannot serve stale schema and generate dirty data. + * + *

<p>This class only tracks the lease state; wiring fail-closed behavior into the read/write/auth + * paths and resync-on-recovery is done by the respective subsystems. + * + * <p>
A monotonic clock ({@link System#nanoTime()}) is used so the lease is immune to wall-clock + * adjustments. The clock and fence threshold are injectable for testing. + */ +public class MetadataLeaseManager { + + private static final Logger LOGGER = LoggerFactory.getLogger(MetadataLeaseManager.class); + + private final List leaseRecoveryListeners = new CopyOnWriteArrayList<>(); + + private final LongSupplier nanoClock; + private final LongSupplier fenceThresholdMsSupplier; + + private volatile long lastConfigNodeHeartbeatNanos; + + private MetadataLeaseManager() { + this( + System::nanoTime, + () -> CommonDescriptor.getInstance().getConfig().getMetadataLeaseFenceMs()); + } + + MetadataLeaseManager(final LongSupplier nanoClock, final LongSupplier fenceThresholdMsSupplier) { + this.nanoClock = nanoClock; + this.fenceThresholdMsSupplier = fenceThresholdMsSupplier; + // Startup registration performs a full resync, so treat construction time as a fresh contact. + this.lastConfigNodeHeartbeatNanos = nanoClock.getAsLong(); + } + + /** + * Register a listener to run when the lease recovers, i.e. a ConfigNode heartbeat arrives after + * the lease had expired. Push-maintained caches (e.g. {@code DataNodeTableCache}) register here + * to invalidate themselves on recovery, since they may have missed ConfigNode pushes while + * fenced; subsequent lookups then re-fetch fresh state instead of trusting stale entries. + */ + public void addLeaseRecoveryListener(final Runnable listener) { + leaseRecoveryListeners.add(listener); + } + + /** + * Renew the lease: record that a ConfigNode heartbeat has just been received. If the lease had + * expired (the DataNode was fenced), this heartbeat is a recovery, so the registered recovery + * listeners run to drop possibly-stale ConfigNode-pushed caches before they are trusted again. 
+ */ + public void recordConfigNodeHeartbeat() { + final boolean wasFenced = isFenced(); + this.lastConfigNodeHeartbeatNanos = nanoClock.getAsLong(); + if (wasFenced) { + for (final Runnable listener : leaseRecoveryListeners) { + try { + listener.run(); + } catch (final Exception e) { + // A misbehaving listener must not break heartbeat processing / lease renewal. + LOGGER.warn("Metadata lease recovery listener failed", e); + } + } + } + } + + /** Milliseconds elapsed since the last ConfigNode heartbeat was received (never negative). */ + public long getMillisSinceLastConfigNodeHeartbeat() { + final long elapsedNanos = nanoClock.getAsLong() - lastConfigNodeHeartbeatNanos; + return elapsedNanos > 0 ? elapsedNanos / 1_000_000L : 0L; + } + + /** Whether the metadata lease has expired (no ConfigNode heartbeat within {@code T_fence}). */ + public boolean isFenced() { + return getMillisSinceLastConfigNodeHeartbeat() > fenceThresholdMsSupplier.getAsLong(); + } + + /** Force the lease to appear expired, for tests that exercise fail-closed behavior. 
*/ + @TestOnly + public void expireLeaseForTest() { + this.lastConfigNodeHeartbeatNanos = + nanoClock.getAsLong() - (fenceThresholdMsSupplier.getAsLong() + 1_000L) * 1_000_000L; + } + + public static MetadataLeaseManager getInstance() { + return MetadataLeaseManagerHolder.INSTANCE; + } + + private static final class MetadataLeaseManagerHolder { + private static final MetadataLeaseManager INSTANCE = new MetadataLeaseManager(); + + private MetadataLeaseManagerHolder() {} + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCache.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCache.java index f545a9bda237d..f109aa5ef84d6 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCache.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCache.java @@ -20,6 +20,7 @@ package org.apache.iotdb.db.schemaengine.table; import org.apache.iotdb.calc.plan.relational.metadata.CommonMetadataUtils; +import org.apache.iotdb.commons.exception.IoTDBRuntimeException; import org.apache.iotdb.commons.schema.table.NonCommittableTsTable; import org.apache.iotdb.commons.schema.table.TsTable; import org.apache.iotdb.commons.schema.table.TsTableInternalRPCUtil; @@ -29,6 +30,7 @@ import org.apache.iotdb.db.conf.IoTDBDescriptor; import org.apache.iotdb.db.i18n.DataNodeSchemaMessages; import org.apache.iotdb.db.queryengine.plan.execution.config.executor.ClusterConfigTaskExecutor; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; import org.apache.iotdb.rpc.TSStatusCode; import org.apache.tsfile.utils.Pair; @@ -73,7 +75,10 @@ public class DataNodeTableCache implements ITableCache { IoTDBDescriptor.getInstance().getConfig().getDataNodeTableCacheSemaphorePermitNum()); private DataNodeTableCache() { - // Do nothing + // On lease recovery (a ConfigNode heartbeat after this DataNode was fenced), this 
cache may + // have + // missed ConfigNode pushes, so drop everything and let subsequent lookups re-fetch fresh state. + MetadataLeaseManager.getInstance().addLeaseRecoveryListener(this::invalidateAll); } private static final class DataNodeTableCacheHolder { @@ -263,6 +268,22 @@ public void invalid(String database) { } } + /** + * Drop the entire cache. Used on metadata-lease recovery: after the DataNode was fenced it may + * have missed ConfigNode pushes, so the cached schema is no longer trustworthy and must be + * re-fetched lazily on the next lookup. + */ + public void invalidateAll() { + readWriteLock.writeLock().lock(); + try { + databaseTableMap.clear(); + preUpdateTableMap.clear(); + instanceVersion.incrementAndGet(); + } finally { + readWriteLock.writeLock().unlock(); + } + } + @GuardedBy("TableDeviceSchemaCache#writeLock") @Override public void invalid(String database, final String tableName) { @@ -313,7 +334,27 @@ public long getInstanceVersion() { return instanceVersion.get(); } + /** + * Fail closed when the metadata lease has expired: a fenced DataNode may hold a stale + * table-schema cache (it could have missed a ConfigNode invalidation while partitioned), so + * refuse to serve it rather than risk validating writes/queries against stale schema and + * producing dirty data. The error is retryable; the operation succeeds again once the lease + * recovers and the cache resyncs. 
+ */ + private void failIfMetadataLeaseFenced() { + final MetadataLeaseManager lease = MetadataLeaseManager.getInstance(); + if (lease.isFenced()) { + throw new IoTDBRuntimeException( + String.format( + "DataNode metadata lease expired (%d ms since last ConfigNode heartbeat); refusing to " + + "serve table schema from a possibly-stale cache, please retry.", + lease.getMillisSinceLastConfigNodeHeartbeat()), + TSStatusCode.INTERNAL_REQUEST_RETRY_ERROR.getStatusCode()); + } + } + public TsTable getTableInWrite(final String database, final String tableName) { + failIfMetadataLeaseFenced(); final TsTable result = getTableInCache(database, tableName); return Objects.nonNull(result) ? result : getTable(database, tableName, false); } @@ -327,6 +368,7 @@ public TsTable getTable(final String database, final String tableName) { * #preUpdateTableMap}, due to the failure of "commit" or rollback of "pre-update". */ public TsTable getTable(String database, final String tableName, final boolean force) { + failIfMetadataLeaseFenced(); database = PathUtils.unQualifyDatabaseName(database); final Map> preUpdateTables = mayGetTableInPreUpdateMap(database, tableName); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java index 2183d6ac8812b..e2204e8cf0b57 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java @@ -108,6 +108,9 @@ public static void bind() { // bind memory related metrics metricService.addMetricSet(GlobalMemoryMetrics.getInstance()); + + // bind metadata lease (ConfigNode heartbeat freshness) metrics + metricService.addMetricSet(new MetadataLeaseMetrics()); } private static void initSystemMetrics(MetricService metricService) { diff --git 
a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/MetadataLeaseMetrics.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/MetadataLeaseMetrics.java new file mode 100644 index 0000000000000..99b33befc9acf --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/MetadataLeaseMetrics.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.service.metrics; + +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; +import org.apache.iotdb.metrics.AbstractMetricService; +import org.apache.iotdb.metrics.metricsets.IMetricSet; +import org.apache.iotdb.metrics.utils.MetricLevel; +import org.apache.iotdb.metrics.utils.MetricType; + +/** + * Exposes the DataNode's metadata-lease state for observability: how long it has been since the + * last ConfigNode heartbeat was received. A value approaching {@code metadata_lease_fence_ms} + * indicates the DataNode is about to (or has) self-fenced its ConfigNode-pushed metadata caches. 
+ */ +public class MetadataLeaseMetrics implements IMetricSet { + + private static final String METADATA_LEASE_HEARTBEAT_AGE_MS = "metadata_lease_heartbeat_age_ms"; + + @Override + public void bindTo(final AbstractMetricService metricService) { + metricService.createAutoGauge( + METADATA_LEASE_HEARTBEAT_AGE_MS, + MetricLevel.IMPORTANT, + MetadataLeaseManager.getInstance(), + MetadataLeaseManager::getMillisSinceLastConfigNodeHeartbeat); + } + + @Override + public void unbindFrom(final AbstractMetricService metricService) { + metricService.remove(MetricType.AUTO_GAUGE, METADATA_LEASE_HEARTBEAT_AGE_MS); + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseManagerTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseManagerTest.java new file mode 100644 index 0000000000000..70d0f0ece1a51 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseManagerTest.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.schemaengine.lease; + +import org.junit.Test; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +public class MetadataLeaseManagerTest { + + private static final long T_FENCE_MS = 20_000L; + + private MetadataLeaseManager newManager(final AtomicLong nowNanos) { + return new MetadataLeaseManager(nowNanos::get, () -> T_FENCE_MS); + } + + @Test + public void notFencedWithinThresholdAfterHeartbeat() { + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final MetadataLeaseManager manager = newManager(nowNanos); + manager.recordConfigNodeHeartbeat(); + nowNanos.addAndGet(TimeUnit.MILLISECONDS.toNanos(T_FENCE_MS - 1)); + assertFalse(manager.isFenced()); + } + + @Test + public void fencedAfterThresholdElapsedWithoutHeartbeat() { + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final MetadataLeaseManager manager = newManager(nowNanos); + manager.recordConfigNodeHeartbeat(); + nowNanos.addAndGet(TimeUnit.MILLISECONDS.toNanos(T_FENCE_MS + 1)); + assertTrue(manager.isFenced()); + } + + @Test + public void recoversFromFencedAfterNewHeartbeat() { + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final MetadataLeaseManager manager = newManager(nowNanos); + manager.recordConfigNodeHeartbeat(); + nowNanos.addAndGet(TimeUnit.MILLISECONDS.toNanos(T_FENCE_MS + 1)); + assertTrue(manager.isFenced()); + + manager.recordConfigNodeHeartbeat(); + assertFalse(manager.isFenced()); + } + + @Test + public void reportsMillisSinceLastHeartbeat() { + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final MetadataLeaseManager manager = newManager(nowNanos); + manager.recordConfigNodeHeartbeat(); + 
nowNanos.addAndGet(TimeUnit.MILLISECONDS.toNanos(1234)); + assertEquals(1234L, manager.getMillisSinceLastConfigNodeHeartbeat()); + } + + @Test + public void runsRecoveryListenerWhenHeartbeatArrivesAfterFence() { + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final MetadataLeaseManager manager = newManager(nowNanos); + final AtomicInteger recoveries = new AtomicInteger(); + manager.addLeaseRecoveryListener(recoveries::incrementAndGet); + + manager.recordConfigNodeHeartbeat(); + assertEquals(0, recoveries.get()); + + nowNanos.addAndGet(TimeUnit.MILLISECONDS.toNanos(T_FENCE_MS + 1)); + assertTrue(manager.isFenced()); + + manager.recordConfigNodeHeartbeat(); + assertEquals(1, recoveries.get()); + assertFalse(manager.isFenced()); + } + + @Test + public void doesNotRunRecoveryListenerWhenLeaseNeverExpired() { + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final MetadataLeaseManager manager = newManager(nowNanos); + final AtomicInteger recoveries = new AtomicInteger(); + manager.addLeaseRecoveryListener(recoveries::incrementAndGet); + + for (int i = 0; i < 3; i++) { + nowNanos.addAndGet(TimeUnit.MILLISECONDS.toNanos(1_000)); + manager.recordConfigNodeHeartbeat(); + } + assertEquals(0, recoveries.get()); + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCacheLeaseTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCacheLeaseTest.java new file mode 100644 index 0000000000000..709679e86b362 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCacheLeaseTest.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.schemaengine.table; + +import org.apache.iotdb.commons.exception.IoTDBRuntimeException; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; +import org.apache.iotdb.rpc.TSStatusCode; + +import org.junit.After; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +public class DataNodeTableCacheLeaseTest { + + @After + public void renewLease() { + // Renew so a forced-fenced lease does not leak into other tests sharing this JVM fork. + MetadataLeaseManager.getInstance().recordConfigNodeHeartbeat(); + } + + @Test + public void getTableInWriteFailsClosedWhenLeaseFenced() { + MetadataLeaseManager.getInstance().expireLeaseForTest(); + try { + DataNodeTableCache.getInstance().getTableInWrite("root.db", "t"); + fail("expected fail-closed retry exception while the metadata lease is fenced"); + } catch (final IoTDBRuntimeException e) { + // A fenced DataNode must refuse to validate writes against a possibly-stale cache, and the + // error must be the retryable one (not, say, table-not-exists from the stale cache). 
+ assertEquals(TSStatusCode.INTERNAL_REQUEST_RETRY_ERROR.getStatusCode(), e.getErrorCode()); + } + } + + @Test + public void getTableFailsClosedWhenLeaseFenced() { + MetadataLeaseManager.getInstance().expireLeaseForTest(); + try { + DataNodeTableCache.getInstance().getTable("root.db", "t"); + fail("expected fail-closed retry exception while the metadata lease is fenced"); + } catch (final IoTDBRuntimeException e) { + assertEquals(TSStatusCode.INTERNAL_REQUEST_RETRY_ERROR.getStatusCode(), e.getErrorCode()); + } + } +} diff --git a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template index 4d7b5bcea87eb..52c44f973e291 100644 --- a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template +++ b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template @@ -744,6 +744,16 @@ failure_detector_phi_threshold=30 # Datatype: long failure_detector_phi_acceptable_pause_in_ms=10000 +# A DataNode self-fences its ConfigNode-pushed metadata caches (table/tree schema, templates, TTL, +# permissions, ...) if it has not received a ConfigNode heartbeat within this duration, so a +# partitioned DataNode stops trusting stale caches. Kept aligned with +# failure_detector_fixed_threshold_in_ms so a DataNode fences itself around the same time the +# cluster would consider it down. The ConfigNode also uses this to decide how long it must wait +# before treating an unreachable DataNode as safely fenced. 
+# effectiveMode: restart +# Datatype: long +metadata_lease_fence_ms=20000 + # Whether to enable topology probing between DataNodes # effectiveMode: hot_reload # Datatype: Boolean diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonConfig.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonConfig.java index 6a8956e423b48..aa81003b1b2e6 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonConfig.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonConfig.java @@ -439,6 +439,13 @@ public class CommonConfig { private volatile long remoteWriteMaxRetryDurationInMs = 60000; + // The DataNode self-fences its ConfigNode-pushed metadata caches (table/tree schema, template, + // TTL, permission, ...) if it has not received a ConfigNode heartbeat within this duration. Kept + // aligned with the failure detector threshold so a partitioned DataNode stops trusting stale + // caches around the same time the cluster would consider it dead. Also used by the ConfigNode to + // derive how long it must wait before treating an unreachable DataNode as safely fenced. 
+ private volatile long metadataLeaseFenceMs = 20_000; + private final RateLimiter querySamplingRateLimiter = RateLimiter.create(160); // if querySamplingRateLimiter < 0, means that there is no rate limit, we need to full sample all // the queries @@ -2679,6 +2686,14 @@ public void setRemoteWriteMaxRetryDurationInMs(long remoteWriteMaxRetryDurationI this.remoteWriteMaxRetryDurationInMs = remoteWriteMaxRetryDurationInMs; } + public long getMetadataLeaseFenceMs() { + return metadataLeaseFenceMs; + } + + public void setMetadataLeaseFenceMs(long metadataLeaseFenceMs) { + this.metadataLeaseFenceMs = metadataLeaseFenceMs; + } + public int getArenaNum() { return arenaNum; } diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java index 5cd954a09f7b8..014ad3e879785 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java @@ -306,6 +306,11 @@ public void loadCommonProps(TrimProperties properties) throws IOException { properties.getProperty( "path_log_max_size", String.valueOf(config.getPathLogMaxSize())))); + config.setMetadataLeaseFenceMs( + Long.parseLong( + properties.getProperty( + "metadata_lease_fence_ms", String.valueOf(config.getMetadataLeaseFenceMs())))); + loadRetryProperties(properties); loadBinaryAllocatorProps(properties); } From bdf1c5cdaafea57dd0488c2a845f08a992e57753 Mon Sep 17 00:00:00 2001 From: JackieTien97 Date: Tue, 2 Jun 2026 16:37:17 +0800 Subject: [PATCH 02/17] Wire DataNode metadata-lease-fencing capability into the heartbeat Add an optional supportsMetadataLeaseFencing flag to TDataNodeHeartbeatResp. The DataNode advertises it (true); the ConfigNode records it per-DataNode in DataNodeContactTracker. 
The verdict checks capability before any liveness/timing test, so a not-yet-upgraded DataNode that omits the flag is recorded as not-capable and never treated as fenced (strict, rolling-upgrade safe). DataNodeContactTracker gains recordCapability/supportsFencing (default false). 3 new unit tests. --- .../heartbeat/DataNodeHeartbeatHandler.java | 7 +++++- .../manager/lease/DataNodeContactTracker.java | 19 ++++++++++++++ .../lease/DataNodeContactTrackerTest.java | 25 +++++++++++++++++++ .../impl/DataNodeInternalRPCServiceImpl.java | 4 +++ .../src/main/thrift/datanode.thrift | 3 +++ 5 files changed, 57 insertions(+), 1 deletion(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/heartbeat/DataNodeHeartbeatHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/heartbeat/DataNodeHeartbeatHandler.java index 65f4940769865..c08aee0d1bdc1 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/heartbeat/DataNodeHeartbeatHandler.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/heartbeat/DataNodeHeartbeatHandler.java @@ -86,7 +86,12 @@ public void onComplete(TDataNodeHeartbeatResp heartbeatResp) { // A successful response confirms ConfigNode->DataNode contact; stamp it on the ConfigNode clock // for the metadata-lease verdict. Kept separate from the load-cache samples (which record the // echoed send-time) and deliberately not touched in onError, so failures never advance it. 
- DataNodeContactTracker.getInstance().recordSuccessfulResponse(nodeId); + final DataNodeContactTracker contactTracker = DataNodeContactTracker.getInstance(); + contactTracker.recordSuccessfulResponse(nodeId); + contactTracker.recordCapability( + nodeId, + heartbeatResp.isSetSupportsMetadataLeaseFencing() + && heartbeatResp.isSupportsMetadataLeaseFencing()); // Update NodeCache loadManager diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTracker.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTracker.java index 08393053a79d6..3bcba9e11bdd1 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTracker.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTracker.java @@ -46,6 +46,11 @@ public class DataNodeContactTracker { private final Map<Integer, Long> lastSuccessfulResponseNanos = new ConcurrentHashMap<>(); + // Whether each DataNode reports that it supports metadata-lease self-fencing. Defaults to false + // for not-yet-reported / not-yet-upgraded DataNodes, so the verdict treats them conservatively + // (never FENCED-SAFE) until they prove capability. + private final Map<Integer, Boolean> supportsFencing = new ConcurrentHashMap<>(); + private DataNodeContactTracker() { this(System::nanoTime); } @@ -84,8 +89,22 @@ public void onLeadershipAcquired(final Collection<Integer> registeredDataNodeIds } } + /** Record whether a DataNode reports support for metadata-lease self-fencing. */ + public void recordCapability(final int dataNodeId, final boolean dnSupportsFencing) { + supportsFencing.put(dataNodeId, dnSupportsFencing); + } + + /** + * Whether the DataNode is known to support self-fencing. Defaults to false (conservative): an + * unknown/old DataNode is never treated as fenced by the verdict.
+ */ + public boolean supportsFencing(final int dataNodeId) { + return supportsFencing.getOrDefault(dataNodeId, false); + } + public void removeDataNode(final int dataNodeId) { lastSuccessfulResponseNanos.remove(dataNodeId); + supportsFencing.remove(dataNodeId); } public static DataNodeContactTracker getInstance() { diff --git a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTrackerTest.java b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTrackerTest.java index 80b4498d13bf2..207b257ac9f2c 100644 --- a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTrackerTest.java +++ b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTrackerTest.java @@ -26,6 +26,8 @@ import java.util.concurrent.atomic.AtomicLong; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; public class DataNodeContactTrackerTest { @@ -70,4 +72,27 @@ public void neverContactedReadsAsZeroSoVerdictTreatsAsRecent() { final DataNodeContactTracker tracker = new DataNodeContactTracker(nowNanos::get); assertEquals(0L, tracker.getMillisSinceLastSuccessfulResponse(999)); } + + @Test + public void unknownDataNodeIsNotFencingCapable() { + final DataNodeContactTracker tracker = new DataNodeContactTracker(new AtomicLong()::get); + assertFalse(tracker.supportsFencing(DN)); + } + + @Test + public void recordsAndUpdatesFencingCapability() { + final DataNodeContactTracker tracker = new DataNodeContactTracker(new AtomicLong()::get); + tracker.recordCapability(DN, true); + assertTrue(tracker.supportsFencing(DN)); + tracker.recordCapability(DN, false); + assertFalse(tracker.supportsFencing(DN)); + } + + @Test + public void removeDataNodeClearsCapability() { + final DataNodeContactTracker tracker = new DataNodeContactTracker(new AtomicLong()::get); + 
tracker.recordCapability(DN, true); + tracker.removeDataNode(DN); + assertFalse(tracker.supportsFencing(DN)); + } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index 1d84d22b2b690..aec41b3fd7388 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -2278,6 +2278,10 @@ public TDataNodeHeartbeatResp getDataNodeHeartBeat(TDataNodeHeartbeatReq req) th AuthorityChecker.getAuthorityFetcher().refreshToken(); resp.setHeartbeatTimestamp(req.getHeartbeatTimestamp()); resp.setStatus(commonConfig.getNodeStatus().getStatus()); + // Advertise that this DataNode supports metadata-lease self-fencing, so the ConfigNode may + // treat + // it as safely fenced when unreachable (older DataNodes that omit this are handled strictly). + resp.setSupportsMetadataLeaseFencing(true); if (commonConfig.getStatusReason() != null) { resp.setStatusReason(commonConfig.getStatusReason()); } diff --git a/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift b/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift index 92a7602b34dee..74e9054592247 100644 --- a/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift +++ b/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift @@ -316,6 +316,9 @@ struct TDataNodeHeartbeatResp { 15: optional list pipeRemainingEventCountList 16: optional list pipeRemainingTimeList 17: optional map dataRegionRawDataSize + // Whether this DataNode supports metadata-lease self-fencing. Used by the ConfigNode during + // rolling upgrade: an unreachable DataNode that does not report this cannot be assumed fenced. 
+ 18: optional bool supportsMetadataLeaseFencing } struct TPipeHeartbeatReq { From 9901f4082efbd6fc296e3a1c5d5b7e02ed024205 Mon Sep 17 00:00:00 2001 From: JackieTien97 Date: Tue, 2 Jun 2026 20:11:16 +0800 Subject: [PATCH 03/17] Fail closed on permission cache while metadata lease is fenced The DataNode permission cache (ClusterAuthorityFetcher) is invalidated by a ConfigNode broadcast after GRANT/REVOKE. A DataNode partitioned from the ConfigNode can miss that broadcast and keep authorizing a privilege that was already revoked. The pre-existing refreshToken() timeout did not close this window: it only marks the cache stale when a heartbeat finally arrives after a long gap, so during an ongoing partition (no heartbeat at all) refreshToken() is never called and the stale cache keeps being served until recovery. checkCacheAvailable() now also drops the cache when MetadataLeaseManager reports the lease fenced. isFenced() is evaluated on the DataNode's own clock and needs no heartbeat to fire, so an ongoing partition forces a re-fetch from the ConfigNode, which fails closed while partitioned (deny, not allow). 
--- .../db/auth/ClusterAuthorityFetcher.java | 12 ++- .../ClusterAuthorityFetcherLeaseTest.java | 79 +++++++++++++++++++ 2 files changed, 89 insertions(+), 2 deletions(-) create mode 100644 iotdb-core/datanode/src/test/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcherLeaseTest.java diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcher.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcher.java index 9ee232c921b83..2236dffffbfa7 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcher.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcher.java @@ -57,6 +57,7 @@ import org.apache.iotdb.db.queryengine.plan.relational.type.AuthorRType; import org.apache.iotdb.db.queryengine.plan.statement.StatementType; import org.apache.iotdb.db.queryengine.plan.statement.sys.AuthorStatement; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; import org.apache.iotdb.rpc.RpcUtils; import org.apache.iotdb.rpc.TSStatusCode; @@ -531,8 +532,15 @@ public void refreshToken() { heartBeatTimeStamp = currentTime; } - private void checkCacheAvailable() { - if (cacheOutDate) { + // Package-private for testing (ClusterAuthorityFetcherLeaseTest). + void checkCacheAvailable() { + // cacheOutDate is set by refreshToken() only when a heartbeat finally arrives after a long gap, + // so it cannot catch an *ongoing* ConfigNode partition (no heartbeat arrives, refreshToken() is + // never called). isFenced() is evaluated on this DataNode's own clock and fires without any + // heartbeat: while fenced we drop the permission cache and force a re-fetch from the + // ConfigNode, + // which fails closed while partitioned, so a missed REVOKE cannot keep authorizing a privilege. 
+ if (cacheOutDate || MetadataLeaseManager.getInstance().isFenced()) { iAuthorCache.invalidAllCache(); } cacheOutDate = false; diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcherLeaseTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcherLeaseTest.java new file mode 100644 index 0000000000000..fec9a63c4a2fe --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcherLeaseTest.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.auth; + +import org.apache.iotdb.commons.auth.entity.User; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Test; + +/** + * When the DataNode metadata lease has expired (no ConfigNode heartbeat within {@code T_fence}), + * the permission cache must not be trusted: a partitioned DataNode could have missed a REVOKE + * broadcast and would otherwise keep authorizing a privilege that was already revoked. 
{@link + * ClusterAuthorityFetcher#checkCacheAvailable()} therefore drops the cache while fenced, forcing a + * re-fetch from the ConfigNode (which fails closed while the DataNode is partitioned). + * + * <p>
This closes a window the pre-existing {@code refreshToken()} timeout did not: that timeout + * only marks the cache stale when a heartbeat finally arrives after a long gap, so during an + * ongoing partition (no heartbeat at all) the stale cache kept being served. {@code + * isFenced()} is evaluated on the DataNode's own clock and needs no heartbeat to fire. + */ +public class ClusterAuthorityFetcherLeaseTest { + + @After + public void tearDown() { + // Restore the process-wide lease singleton so other tests in this JVM are unaffected. + MetadataLeaseManager.getInstance().recordConfigNodeHeartbeat(); + } + + @Test + public void fencedLeaseDropsPermissionCache() { + final ClusterAuthorityFetcher fetcher = new ClusterAuthorityFetcher(new BasicAuthorityCache()); + final User user = new User("user_fenced", "password"); + fetcher.getAuthorCache().putUserCache(user.getName(), user); + Assert.assertNotNull(fetcher.getAuthorCache().getUserCache(user.getName())); + + MetadataLeaseManager.getInstance().expireLeaseForTest(); + fetcher.checkCacheAvailable(); + + Assert.assertNull( + "a fenced DataNode must drop its permission cache so a missed REVOKE cannot keep authorizing", + fetcher.getAuthorCache().getUserCache(user.getName())); + } + + @Test + public void activeLeaseKeepsPermissionCache() { + final ClusterAuthorityFetcher fetcher = new ClusterAuthorityFetcher(new BasicAuthorityCache()); + final User user = new User("user_active", "password"); + fetcher.getAuthorCache().putUserCache(user.getName(), user); + + // An active lease (a ConfigNode heartbeat was just received) must not needlessly drop the + // cache. 
+ MetadataLeaseManager.getInstance().recordConfigNodeHeartbeat(); + fetcher.checkCacheAvailable(); + + Assert.assertNotNull( + "an active lease must not needlessly drop the permission cache", + fetcher.getAuthorCache().getUserCache(user.getName())); + } +} From f3fb4fe257b2f861c6acdd5119ed5d522ceb8267 Mon Sep 17 00:00:00 2001 From: JackieTien97 Date: Tue, 2 Jun 2026 21:36:39 +0800 Subject: [PATCH 04/17] Force tree-schema cache miss while metadata lease is fenced The tree-model schema cache (TreeDeviceSchemaCacheManager) is read-through: on a miss the caller re-fetches from the quorum-backed SchemaRegion, and a ConfigNode broadcast only invalidates entries after a DELETE TIMESERIES / datatype change. A DataNode partitioned from the ConfigNode can miss that broadcast and keep a stale cached entry, then validate a write or resolve a query against schema that no longer exists. All six cache reads funnel through getDeviceSchema(String[]); route them through getDeviceSchemaOrMissWhenFenced, which returns null (a miss) while the lease is fenced so the caller re-fetches from the authoritative SchemaRegion. This is more available than hard-failing (the op still succeeds whenever the SchemaRegion quorum is reachable, and only fails closed when it is not) and keeps the gate tree-scoped, since getDeviceSchema is also used by table-model fetching. On lease recovery cleanUp() drops entries cached before the partition that were never re-read while fenced. 
--- .../cache/TreeDeviceSchemaCacheManager.java | 34 ++++-- ...TreeDeviceSchemaCacheManagerLeaseTest.java | 102 ++++++++++++++++++ 2 files changed, 130 insertions(+), 6 deletions(-) create mode 100644 iotdb-core/datanode/src/test/java/org/apache/iotdb/db/metadata/cache/TreeDeviceSchemaCacheManagerLeaseTest.java diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/fetcher/cache/TreeDeviceSchemaCacheManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/fetcher/cache/TreeDeviceSchemaCacheManager.java index 93d75aeff2d8c..697a6a931c9d7 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/fetcher/cache/TreeDeviceSchemaCacheManager.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/fetcher/cache/TreeDeviceSchemaCacheManager.java @@ -29,6 +29,7 @@ import org.apache.iotdb.db.queryengine.common.schematree.ClusterSchemaTree; import org.apache.iotdb.db.queryengine.common.schematree.IMeasurementSchemaInfo; import org.apache.iotdb.db.queryengine.plan.analyze.schema.ISchemaComputation; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; import org.apache.iotdb.db.schemaengine.template.ClusterTemplateManager; import org.apache.iotdb.db.schemaengine.template.ITemplateManager; @@ -65,6 +66,10 @@ public class TreeDeviceSchemaCacheManager { private TreeDeviceSchemaCacheManager() { tableDeviceSchemaCache = TableDeviceSchemaCache.getInstance(); + // On lease recovery (a ConfigNode heartbeat after this DataNode was fenced), drop everything: + // entries cached before the partition may have missed a ConfigNode invalidation, and forcing a + // miss while fenced does not cover entries that were never re-read during the partition. 
+ MetadataLeaseManager.getInstance().addLeaseRecoveryListener(this::cleanUp); } public static TreeDeviceSchemaCacheManager getInstance() { @@ -92,6 +97,23 @@ public void releaseWriteLock() { readWriteLock.writeLock().unlock(); } + /** + * Look up a device's cached schema, but report a miss (return {@code null}) while the metadata + * lease is fenced. A fenced DataNode may hold a stale entry (it could have missed a ConfigNode + * cache-invalidation such as a DELETE TIMESERIES or datatype change while partitioned); forcing a + * miss makes the read-through callers re-fetch from the authoritative, quorum-backed SchemaRegion + * rather than validate writes/queries against possibly-stale schema. This is more available than + * hard-failing: the operation still succeeds whenever the SchemaRegion quorum is reachable, and + * only fails (fail-closed) when it is not. On lease recovery {@link #cleanUp()} additionally + * drops entries cached before the partition that were never re-read while fenced. 
+ */ + private IDeviceSchema getDeviceSchemaOrMissWhenFenced(final String[] deviceIdNodes) { + if (MetadataLeaseManager.getInstance().isFenced()) { + return null; + } + return tableDeviceSchemaCache.getDeviceSchema(deviceIdNodes); + } + /** * Get SchemaEntity info without auto create schema * @@ -101,7 +123,7 @@ public void releaseWriteLock() { */ public ClusterSchemaTree get(final PartialPath devicePath, final String[] measurements) { final ClusterSchemaTree tree = new ClusterSchemaTree(); - final IDeviceSchema schema = tableDeviceSchemaCache.getDeviceSchema(devicePath.getNodes()); + final IDeviceSchema schema = getDeviceSchemaOrMissWhenFenced(devicePath.getNodes()); if (!(schema instanceof TreeDeviceNormalSchema)) { return tree; } @@ -130,7 +152,7 @@ public ClusterSchemaTree get(final PartialPath devicePath, final String[] measur */ public ClusterSchemaTree getMatchedTemplateSchema(final PartialPath devicePath) { final ClusterSchemaTree tree = new ClusterSchemaTree(); - final IDeviceSchema schema = tableDeviceSchemaCache.getDeviceSchema(devicePath.getNodes()); + final IDeviceSchema schema = getDeviceSchemaOrMissWhenFenced(devicePath.getNodes()); if (!(schema instanceof TreeDeviceTemplateSchema)) { return tree; } @@ -150,7 +172,7 @@ public ClusterSchemaTree getMatchedTemplateSchema(final PartialPath devicePath) public ClusterSchemaTree getMatchedNormalSchema(final PartialPath fullPath) { final ClusterSchemaTree tree = new ClusterSchemaTree(); final IDeviceSchema schema = - tableDeviceSchemaCache.getDeviceSchema( + getDeviceSchemaOrMissWhenFenced( Arrays.copyOf(fullPath.getNodes(), fullPath.getNodeLength() - 1)); if (!(schema instanceof TreeDeviceNormalSchema)) { return tree; @@ -171,7 +193,7 @@ public List computeWithoutTemplate(final ISchemaComputation schemaCompu final String[] measurements = schemaComputation.getMeasurements(); final IDeviceSchema schema = - tableDeviceSchemaCache.getDeviceSchema(schemaComputation.getDevicePath().getNodes()); + 
getDeviceSchemaOrMissWhenFenced(schemaComputation.getDevicePath().getNodes()); if (!(schema instanceof TreeDeviceNormalSchema)) { return IntStream.range(0, schemaComputation.getMeasurements().length) .boxed() @@ -229,7 +251,7 @@ public Pair, List> computeSourceOfLogicalView( final PartialPath fullPath = logicalViewSchema.getSourcePathIfWritable(); final IDeviceSchema schema = - tableDeviceSchemaCache.getDeviceSchema(fullPath.getDevicePath().getNodes()); + getDeviceSchemaOrMissWhenFenced(fullPath.getDevicePath().getNodes()); if (!(schema instanceof TreeDeviceNormalSchema)) { indexOfMissingMeasurements.add(i); continue; @@ -265,7 +287,7 @@ public List computeWithTemplate(final ISchemaComputation computation) { final List indexOfMissingMeasurements = new ArrayList<>(); final String[] measurements = computation.getMeasurements(); final IDeviceSchema deviceSchema = - tableDeviceSchemaCache.getDeviceSchema(computation.getDevicePath().getNodes()); + getDeviceSchemaOrMissWhenFenced(computation.getDevicePath().getNodes()); if (!(deviceSchema instanceof TreeDeviceTemplateSchema)) { return IntStream.range(0, measurements.length).boxed().collect(Collectors.toList()); diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/metadata/cache/TreeDeviceSchemaCacheManagerLeaseTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/metadata/cache/TreeDeviceSchemaCacheManagerLeaseTest.java new file mode 100644 index 0000000000000..6f91ec4504235 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/metadata/cache/TreeDeviceSchemaCacheManagerLeaseTest.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.metadata.cache; + +import org.apache.iotdb.commons.exception.IllegalPathException; +import org.apache.iotdb.commons.path.MeasurementPath; +import org.apache.iotdb.commons.path.PartialPath; +import org.apache.iotdb.db.queryengine.common.schematree.ClusterSchemaTree; +import org.apache.iotdb.db.queryengine.plan.relational.metadata.fetcher.cache.TreeDeviceSchemaCacheManager; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; + +import org.apache.tsfile.enums.TSDataType; +import org.apache.tsfile.write.schema.MeasurementSchema; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.util.Collections; + +/** + * While the metadata lease is fenced, the tree-model schema cache must not be trusted: this + * DataNode could have missed a ConfigNode cache-invalidation (e.g. a DELETE TIMESERIES / datatype + * change) while partitioned, so a stale cached entry could validate a write or resolve a query + * against schema that no longer exists. Because the cache is read-through, the fix is to report a + * cache miss while fenced so the caller re-fetches from the authoritative, quorum-backed + * SchemaRegion (more available than hard-failing: the op still succeeds whenever that quorum is + * reachable). 
+ */ +public class TreeDeviceSchemaCacheManagerLeaseTest { + + private TreeDeviceSchemaCacheManager manager; + + @Before + public void setUp() throws IllegalPathException { + manager = TreeDeviceSchemaCacheManager.getInstance(); + manager.cleanUp(); + final ClusterSchemaTree tree = new ClusterSchemaTree(); + tree.appendSingleMeasurement( + new PartialPath("root.sg1.d1.s1"), + new MeasurementSchema("s1", TSDataType.INT32), + null, + null, + null, + false); + tree.setDatabases(Collections.singleton("root.sg1")); + manager.put(tree); + } + + @After + public void tearDown() { + manager.cleanUp(); + // Restore the process-wide lease singleton so other tests in this JVM are unaffected. + MetadataLeaseManager.getInstance().recordConfigNodeHeartbeat(); + } + + @Test + public void fencedLeaseForcesTreeSchemaCacheMiss() throws IllegalPathException { + final PartialPath device1 = new PartialPath("root.sg1.d1"); + final String[] measurements = new String[] {"s1"}; + + // Sanity: with an active lease the cached entry is served (a cache hit). + MetadataLeaseManager.getInstance().recordConfigNodeHeartbeat(); + Assert.assertFalse( + "an active lease should serve the cached tree schema", + manager.get(device1, measurements).getAllDevices().isEmpty()); + Assert.assertFalse( + manager + .getMatchedNormalSchema(new MeasurementPath("root.sg1.d1.s1")) + .getAllDevices() + .isEmpty()); + + // Fenced: every tree-schema lookup must report a miss so the caller re-fetches from the + // authoritative SchemaRegion instead of trusting a possibly-stale cached entry. 
+ MetadataLeaseManager.getInstance().expireLeaseForTest(); + Assert.assertTrue( + "a fenced lease must report a tree-schema cache miss (force re-fetch)", + manager.get(device1, measurements).getAllDevices().isEmpty()); + Assert.assertTrue( + manager + .getMatchedNormalSchema(new MeasurementPath("root.sg1.d1.s1")) + .getAllDevices() + .isEmpty()); + } +} From d5bc64cf08b22032b4a4ef838fd22203f18e29a8 Mon Sep 17 00:00:00 2001 From: JackieTien97 Date: Wed, 3 Jun 2026 07:50:16 +0800 Subject: [PATCH 05/17] Use infinite TTL in compaction while metadata lease is fenced Compaction physically deletes data older than its TTL window, reading the TTL from DataNodeTTLCache (pushed by the ConfigNode). A DataNode partitioned from the ConfigNode can miss a TTL update; a too-short stale TTL would make compaction permanently delete data that a missed TTL-increase says to keep - an irreversible loss. MultiTsFileDeviceIterator.nextDevice() now uses an infinite TTL (Long.MAX_VALUE -> timeLowerBound Long.MIN_VALUE -> no TTL-based deletion) when the lease is fenced, scoped to the compaction path only (query/write TTL behavior is unchanged). The check runs before the cache reads, so the table-model path also avoids the now fail-closed DataNodeTableCache. Real TTL deletion resumes once the lease recovers and the cache resyncs. 
--- .../utils/MultiTsFileDeviceIterator.java | 11 +- .../MultiTsFileDeviceIteratorLeaseTest.java | 106 ++++++++++++++++++ 2 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/compaction/utils/MultiTsFileDeviceIteratorLeaseTest.java diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/compaction/execute/utils/MultiTsFileDeviceIterator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/compaction/execute/utils/MultiTsFileDeviceIterator.java index a639fba299cb9..3ce86271968f5 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/compaction/execute/utils/MultiTsFileDeviceIterator.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/compaction/execute/utils/MultiTsFileDeviceIterator.java @@ -28,6 +28,7 @@ import org.apache.iotdb.commons.schema.table.column.TsTableColumnSchema; import org.apache.iotdb.commons.utils.CommonDateTimeUtils; import org.apache.iotdb.db.queryengine.plan.analyze.cache.schema.DataNodeTTLCache; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; import org.apache.iotdb.db.schemaengine.table.DataNodeTableCache; import org.apache.iotdb.db.storageengine.dataregion.compaction.io.CompactionTsFileReader; import org.apache.iotdb.db.storageengine.dataregion.compaction.schedule.constant.CompactionType; @@ -236,7 +237,15 @@ public Pair nextDevice() throws IllegalPathException, IOExce IDeviceID deviceID = currentDevice.left; boolean isAligned = currentDevice.right; ignoreAllNullRows = !isAligned || deviceID.getTableName().startsWith("root."); - if (!ignoreAllNullRows) { + if (MetadataLeaseManager.getInstance().isFenced()) { + // Metadata lease fenced: this DataNode may hold a stale TTL (it could have missed a + // ConfigNode + // TTL update while partitioned). 
A too-short stale TTL would make compaction permanently + // delete data that a missed TTL-increase says to keep, so use an infinite TTL: compaction + // deletes nothing by TTL while fenced, and real TTL deletion resumes once the lease recovers + // and the cache resyncs. (Checked first so the table path also avoids the fenced cache.) + ttlForCurrentDevice = Long.MAX_VALUE; + } else if (!ignoreAllNullRows) { ttlForCurrentDevice = DataNodeTTLCache.getInstance().getTTLForTable(databaseName, deviceID.getTableName()); } else { diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/compaction/utils/MultiTsFileDeviceIteratorLeaseTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/compaction/utils/MultiTsFileDeviceIteratorLeaseTest.java new file mode 100644 index 0000000000000..2868ed04d86db --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/compaction/utils/MultiTsFileDeviceIteratorLeaseTest.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.storageengine.dataregion.compaction.utils; + +import org.apache.iotdb.commons.exception.IllegalPathException; +import org.apache.iotdb.commons.exception.MetadataException; +import org.apache.iotdb.db.exception.StorageEngineException; +import org.apache.iotdb.db.queryengine.plan.analyze.cache.schema.DataNodeTTLCache; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; +import org.apache.iotdb.db.storageengine.dataregion.compaction.AbstractCompactionTest; +import org.apache.iotdb.db.storageengine.dataregion.compaction.execute.utils.MultiTsFileDeviceIterator; +import org.apache.iotdb.db.storageengine.dataregion.read.control.FileReaderManager; +import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; + +import org.apache.tsfile.exception.write.WriteProcessException; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; + +/** + * Compaction physically deletes data older than its TTL window. A DataNode partitioned from the + * ConfigNode may hold a stale TTL (it could have missed a ConfigNode TTL update while partitioned), + * and a too-short stale TTL would make compaction permanently delete data that a missed + * TTL-increase says to keep. While the metadata lease is fenced, compaction must therefore fall + * back to an infinite TTL (delete nothing by TTL); real TTL deletion resumes after the lease + * recovers and the cache resyncs. 
+ */ +public class MultiTsFileDeviceIteratorLeaseTest extends AbstractCompactionTest { + + private final String oldThreadName = Thread.currentThread().getName(); + + @Before + public void setUp() + throws IOException, WriteProcessException, MetadataException, InterruptedException { + super.setUp(); + Thread.currentThread().setName("pool-1-IoTDB-Compaction-Worker-1"); + } + + @After + public void tearDown() throws IOException, StorageEngineException { + DataNodeTTLCache.getInstance().clearAllTTLForTree(); + // Restore the process-wide lease singleton so other tests in this JVM are unaffected. + MetadataLeaseManager.getInstance().recordConfigNodeHeartbeat(); + super.tearDown(); + for (final TsFileResource tsFileResource : seqResources) { + FileReaderManager.getInstance().closeFileAndRemoveReader(tsFileResource.getTsFileID()); + } + Thread.currentThread().setName(oldThreadName); + } + + @Test + public void activeLeaseAppliesRealTtlInCompaction() + throws MetadataException, IOException, WriteProcessException { + registerTimeseriesInMManger(1, 1, false); + createFiles(1, 1, 1, 100, 0, 0, 50, 50, false, true); + DataNodeTTLCache.getInstance().setTTLForTree(COMPACTION_TEST_SG + ".**", 100_000L); + + MetadataLeaseManager.getInstance().recordConfigNodeHeartbeat(); + try (final MultiTsFileDeviceIterator it = new MultiTsFileDeviceIterator(seqResources)) { + Assert.assertTrue(it.hasNextDevice()); + it.nextDevice(); + Assert.assertNotEquals(Long.MAX_VALUE, it.getTTLForCurrentDevice()); + Assert.assertNotEquals(Long.MIN_VALUE, it.getTimeLowerBoundForCurrentDevice()); + } + } + + @Test + public void fencedLeaseUsesInfiniteTtlInCompaction() + throws MetadataException, IOException, WriteProcessException, IllegalPathException { + registerTimeseriesInMManger(1, 1, false); + createFiles(1, 1, 1, 100, 0, 0, 50, 50, false, true); + // A finite TTL is configured, so without the fence fallback compaction would delete by it. 
+ DataNodeTTLCache.getInstance().setTTLForTree(COMPACTION_TEST_SG + ".**", 100_000L); + + MetadataLeaseManager.getInstance().expireLeaseForTest(); + try (final MultiTsFileDeviceIterator it = new MultiTsFileDeviceIterator(seqResources)) { + Assert.assertTrue(it.hasNextDevice()); + it.nextDevice(); + Assert.assertEquals( + "a fenced DataNode must use an infinite TTL in compaction so a stale TTL cannot delete data", + Long.MAX_VALUE, + it.getTTLForCurrentDevice()); + Assert.assertEquals(Long.MIN_VALUE, it.getTimeLowerBoundForCurrentDevice()); + } + } +} From f8a9bee44cddeb31e4944f5fa43c233fbe777acb Mon Sep 17 00:00:00 2001 From: JackieTien97 Date: Wed, 3 Jun 2026 08:05:34 +0800 Subject: [PATCH 06/17] Add ClusterCachePropagator to drive the Tier-A broadcast verdict The ConfigNode side of the metadata-lease HA change. ClusterCachePropagator broadcasts a cache-invalidation to all registered DataNodes and turns the per-DataNode responses into a PROCEED/WAIT/FAIL verdict via the already-built MetadataBroadcastVerdict, instead of the legacy 'any unreachable DataNode fails the operation'. For each registered DataNode it builds a DataNodeState from: acked (SUCCESS response), supportsFencing and hbAge (from DataNodeContactTracker). A DataNode that is provably self-fenced (capable + silent past T_proceed) is safe to proceed past; a non-SUCCESS or recently-contacted unacked DataNode is unsafe. propagate() retries on WAIT until a DataNode acks/crosses T_proceed or the wait budget (T_proceed + buffer) runs out. The caller supplies a CacheBroadcast closure wrapping its specific RPC, so the propagator is agnostic to the request type. Clock and sleep are injectable; the verdict construction and the retry loop are covered by unit tests. 
--- .../manager/lease/ClusterCachePropagator.java | 163 ++++++++++++++++ .../lease/ClusterCachePropagatorTest.java | 176 ++++++++++++++++++ 2 files changed, 339 insertions(+) create mode 100644 iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/ClusterCachePropagator.java create mode 100644 iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/ClusterCachePropagatorTest.java diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/ClusterCachePropagator.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/ClusterCachePropagator.java new file mode 100644 index 0000000000000..74a49ca5c450d --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/ClusterCachePropagator.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.confignode.manager.lease; + +import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; +import org.apache.iotdb.common.rpc.thrift.TSStatus; +import org.apache.iotdb.commons.conf.CommonDescriptor; +import org.apache.iotdb.confignode.manager.IManager; +import org.apache.iotdb.confignode.manager.lease.MetadataBroadcastVerdict.DataNodeState; +import org.apache.iotdb.confignode.manager.lease.MetadataBroadcastVerdict.Verdict; +import org.apache.iotdb.rpc.TSStatusCode; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import java.util.function.IntPredicate; +import java.util.function.IntToLongFunction; +import java.util.function.LongSupplier; +import java.util.function.Supplier; + +/** + * Drives one Tier-A metadata cache-invalidation broadcast to the cluster and turns "which DataNodes + * acknowledged" into a {@link Verdict} via {@link MetadataBroadcastVerdict}. Instead of the legacy + * "any unreachable DataNode fails the operation", a DataNode that is provably self-fenced (out of + * ConfigNode contact for at least {@code T_proceed} and known to support fencing) is treated as + * safe to proceed past, delivering availability without risking dirty data (see the design doc). + * + *

<p>The caller supplies a {@link CacheBroadcast} closure wrapping its specific RPC (table + * pre-release, schema-cache invalidation, permission-cache invalidation, ...); this class is + * agnostic to the request type and only interprets the per-DataNode {@link TSStatus} responses. + * + *

<p>Stateless and cheap to construct per operation. Clock and sleep are injectable for testing. + */ +public class ClusterCachePropagator { + + /** + * {@code T_proceed = T_fence + margin}. The margin (default 5s) covers heartbeat-recording + * granularity and scheduling jitter; see design §2.6. Kept internal (not a user knob). + */ + private static final long DEFAULT_PROCEED_MARGIN_MS = 5_000L; + + /** How often to re-broadcast while waiting for unacked DataNodes to ack or to cross T_proceed. */ + private static final long RETRY_INTERVAL_MS = 1_000L; + + /** + * Extra slack on top of T_proceed before giving up, so a just-died DataNode can cross T_proceed. + */ + private static final long WAIT_BUDGET_BUFFER_MS = 5_000L; + + /** Broadcasts the cache invalidation to {@code targets} and returns the per-nodeId responses. */ + @FunctionalInterface + public interface CacheBroadcast { + Map<Integer, TSStatus> sendTo(Map<Integer, TDataNodeLocation> targets); + } + + /** Injectable sleep so the retry loop can be driven deterministically in tests. 
*/ + @FunctionalInterface + interface Sleeper { + void sleepMs(long ms) throws InterruptedException; + } + + private final Supplier<Map<Integer, TDataNodeLocation>> registeredDataNodes; + private final IntPredicate supportsFencing; + private final IntToLongFunction hbAgeMs; + private final LongSupplier tProceedMs; + private final LongSupplier nanoClock; + private final Sleeper sleeper; + + public ClusterCachePropagator(final IManager configManager) { + this( + () -> configManager.getNodeManager().getRegisteredDataNodeLocations(), + nodeId -> DataNodeContactTracker.getInstance().supportsFencing(nodeId), + nodeId -> DataNodeContactTracker.getInstance().getMillisSinceLastSuccessfulResponse(nodeId), + () -> + CommonDescriptor.getInstance().getConfig().getMetadataLeaseFenceMs() + + DEFAULT_PROCEED_MARGIN_MS, + System::nanoTime, + Thread::sleep); + } + + ClusterCachePropagator( + final Supplier<Map<Integer, TDataNodeLocation>> registeredDataNodes, + final IntPredicate supportsFencing, + final IntToLongFunction hbAgeMs, + final LongSupplier tProceedMs, + final LongSupplier nanoClock, + final Sleeper sleeper) { + this.registeredDataNodes = registeredDataNodes; + this.supportsFencing = supportsFencing; + this.hbAgeMs = hbAgeMs; + this.tProceedMs = tProceedMs; + this.nanoClock = nanoClock; + this.sleeper = sleeper; + } + + /** + * Broadcast once and classify the result. {@code waitBudgetExhausted} turns a would-be {@link + * Verdict#WAIT} into {@link Verdict#FAIL} (the caller's retry budget ran out). 
+ */ + public Verdict propagateOnce(final CacheBroadcast broadcast, final boolean waitBudgetExhausted) { + final Map<Integer, TDataNodeLocation> targets = registeredDataNodes.get(); + final Map<Integer, TSStatus> responses = broadcast.sendTo(targets); + final long tProceed = tProceedMs.getAsLong(); + final int successCode = TSStatusCode.SUCCESS_STATUS.getStatusCode(); + final List<DataNodeState> states = new ArrayList<>(targets.size()); + for (final Integer nodeId : targets.keySet()) { + final TSStatus status = responses.get(nodeId); + final boolean acked = status != null && status.getCode() == successCode; + // retiredOrFenceAcked is left false: there is no explicit fence-ack signal yet, and a + // Removing DataNode may still serve clients, so it must ack or be provably fenced like any + // other (see MetadataBroadcastVerdict's SAFE_GONE rule). + states.add( + new DataNodeState( + acked, false, supportsFencing.test(nodeId), hbAgeMs.applyAsLong(nodeId))); + } + return MetadataBroadcastVerdict.decide(states, tProceed, waitBudgetExhausted); + } + + /** + * Broadcast and retry until the verdict is {@link Verdict#PROCEED} (returns {@code true}) or the + * wait budget is exhausted with at least one DataNode still unsafe ({@link Verdict#FAIL}, returns + * {@code false}). Blocks the calling (procedure) thread for up to {@code T_proceed + buffer}. 
+ */ + public boolean propagate(final CacheBroadcast broadcast) { + final long deadlineNanos = + nanoClock.getAsLong() + + TimeUnit.MILLISECONDS.toNanos(tProceedMs.getAsLong() + WAIT_BUDGET_BUFFER_MS); + while (true) { + final boolean waitBudgetExhausted = nanoClock.getAsLong() >= deadlineNanos; + final Verdict verdict = propagateOnce(broadcast, waitBudgetExhausted); + if (verdict == Verdict.PROCEED) { + return true; + } + if (verdict == Verdict.FAIL) { + return false; + } + try { + sleeper.sleepMs(RETRY_INTERVAL_MS); + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + return false; + } + } + } +} diff --git a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/ClusterCachePropagatorTest.java b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/ClusterCachePropagatorTest.java new file mode 100644 index 0000000000000..18910607a2fa9 --- /dev/null +++ b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/ClusterCachePropagatorTest.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.confignode.manager.lease; + +import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; +import org.apache.iotdb.common.rpc.thrift.TSStatus; +import org.apache.iotdb.confignode.manager.lease.MetadataBroadcastVerdict.Verdict; +import org.apache.iotdb.rpc.TSStatusCode; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.function.IntPredicate; +import java.util.function.IntToLongFunction; + +public class ClusterCachePropagatorTest { + + private static final long T_PROCEED_MS = 25_000L; + + private static TSStatus success() { + return new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode()); + } + + private static TSStatus error() { + return new TSStatus(TSStatusCode.INTERNAL_SERVER_ERROR.getStatusCode()); + } + + private static Map twoDataNodes() { + final Map map = new HashMap<>(); + map.put(1, new TDataNodeLocation().setDataNodeId(1)); + map.put(2, new TDataNodeLocation().setDataNodeId(2)); + return map; + } + + /** Build a propagator whose loop seams are inert (only propagateOnce is exercised). */ + private static ClusterCachePropagator propagator( + final IntPredicate supportsFencing, final IntToLongFunction hbAgeMs) { + return new ClusterCachePropagator( + ClusterCachePropagatorTest::twoDataNodes, + supportsFencing, + hbAgeMs, + () -> T_PROCEED_MS, + () -> 0L, + ms -> {}); + } + + @Test + public void allAckedProceeds() { + final ClusterCachePropagator p = propagator(id -> true, id -> 0L); + final Verdict v = + p.propagateOnce( + targets -> { + final Map r = new HashMap<>(); + r.put(1, success()); + r.put(2, success()); + return r; + }, + false); + Assert.assertEquals(Verdict.PROCEED, v); + } + + @Test + public void unreachableButProvablyFencedProceeds() { + // DN2 did not respond, supports fencing, and has been silent past T_proceed -> provably fenced. 
+ final ClusterCachePropagator p = propagator(id -> true, id -> id == 2 ? T_PROCEED_MS + 1 : 0L); + final Verdict v = p.propagateOnce(targets -> ackOnly(1), false); + Assert.assertEquals(Verdict.PROCEED, v); + } + + @Test + public void unreachableNotYetFencedWaits() { + // DN2 silent but not yet past T_proceed -> cannot assume fenced -> WAIT (budget not exhausted). + final ClusterCachePropagator p = propagator(id -> true, id -> id == 2 ? 10_000L : 0L); + Assert.assertEquals(Verdict.WAIT, p.propagateOnce(targets -> ackOnly(1), false)); + } + + @Test + public void unreachableNotYetFencedFailsWhenBudgetExhausted() { + final ClusterCachePropagator p = propagator(id -> true, id -> id == 2 ? 10_000L : 0L); + Assert.assertEquals(Verdict.FAIL, p.propagateOnce(targets -> ackOnly(1), true)); + } + + @Test + public void incapableDataNodeNeverFencedWaits() { + // DN2 does not support fencing: even silent "forever" it can never be assumed fenced + // (rolling-upgrade safety) -> strict semantics -> WAIT. + final ClusterCachePropagator p = propagator(id -> id != 2, id -> id == 2 ? 999_999L : 0L); + Assert.assertEquals(Verdict.WAIT, p.propagateOnce(targets -> ackOnly(1), false)); + } + + @Test + public void nonSuccessResponseIsNotAck() { + // DN2 responded but with a non-SUCCESS status: it did NOT apply the invalidation, and being + // reachable its hbAge is small, so it is UNSAFE (must retry), not silently accepted. + final ClusterCachePropagator p = propagator(id -> true, id -> id == 2 ? 
1_000L : 0L); + final Verdict v = + p.propagateOnce( + targets -> { + final Map r = new HashMap<>(); + r.put(1, success()); + r.put(2, error()); + return r; + }, + false); + Assert.assertEquals(Verdict.WAIT, v); + } + + @Test + public void loopReturnsTrueWhenItEventuallyProceeds() { + final AtomicInteger calls = new AtomicInteger(); + final AtomicLong nanos = new AtomicLong(); + final ClusterCachePropagator p = + new ClusterCachePropagator( + ClusterCachePropagatorTest::twoDataNodes, + id -> true, + id -> id == 2 ? 10_000L : 0L, // DN2 not fenced, so round 1 must WAIT + () -> T_PROCEED_MS, + nanos::get, + ms -> nanos.addAndGet(ms * 1_000_000L)); + // Round 1: DN2 unreachable -> WAIT. Round 2: DN2 acks -> PROCEED. + final boolean proceeded = + p.propagate(targets -> calls.incrementAndGet() == 1 ? ackOnly(1) : ackBoth()); + Assert.assertTrue(proceeded); + Assert.assertEquals(2, calls.get()); + } + + @Test + public void loopReturnsFalseWhenBudgetExhausted() { + final AtomicLong nanos = new AtomicLong(); + final ClusterCachePropagator p = + new ClusterCachePropagator( + ClusterCachePropagatorTest::twoDataNodes, + id -> true, + id -> id == 2 ? 10_000L : 0L, // DN2 never fenced (alive but not acking) -> WAIT forever + () -> T_PROCEED_MS, + nanos::get, + ms -> nanos.addAndGet(ms * 1_000_000L)); + // DN2 keeps failing to ack; the fake clock advances on each sleep until the wait budget runs + // out, at which point the loop must give up with FAIL. 
+ Assert.assertFalse(p.propagate(targets -> ackOnly(1))); + } + + private static Map ackOnly(final int nodeId) { + final Map r = new HashMap<>(); + r.put(nodeId, success()); + return r; + } + + private static Map ackBoth() { + final Map r = new HashMap<>(); + r.put(1, success()); + r.put(2, success()); + return r; + } +} From 3b6a6eae085ab4ca9148d1e917325bda60083562 Mon Sep 17 00:00:00 2001 From: JackieTien97 Date: Wed, 3 Jun 2026 08:08:44 +0800 Subject: [PATCH 07/17] Wire DataNodeContactTracker leadership/removal lifecycle hooks The metadata-broadcast verdict reads each DataNode's last-successful-response time from DataNodeContactTracker. Two lifecycle events must keep that signal sound: - On (re)acquiring ConfigRegion leadership (notifyLeaderReady), reset every registered DataNode's contact time to now. Otherwise a timestamp left from a previous leadership term - during which another ConfigNode was contacting the DataNodes - could make the verdict wrongly judge a live DataNode as fenced. - On permanent DataNode removal (removeDataNodePersistence), drop its tracker entry so stale contact/capability state is not retained and a future DataNode reusing the id cannot inherit it. 
--- .../consensus/statemachine/ConfigRegionStateMachine.java | 9 +++++++++ .../confignode/procedure/env/RemoveDataNodeHandler.java | 4 ++++ 2 files changed, 13 insertions(+) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/statemachine/ConfigRegionStateMachine.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/statemachine/ConfigRegionStateMachine.java index fe687d17556f7..cf61bbaea0445 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/statemachine/ConfigRegionStateMachine.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/statemachine/ConfigRegionStateMachine.java @@ -37,6 +37,7 @@ import org.apache.iotdb.confignode.i18n.ConfigNodeMessages; import org.apache.iotdb.confignode.manager.ConfigManager; import org.apache.iotdb.confignode.manager.consensus.ConsensusManager; +import org.apache.iotdb.confignode.manager.lease.DataNodeContactTracker; import org.apache.iotdb.confignode.manager.pipe.agent.PipeConfigNodeAgent; import org.apache.iotdb.confignode.persistence.executor.ConfigPlanExecutor; import org.apache.iotdb.confignode.persistence.schema.ConfigNodeSnapshotParser; @@ -291,6 +292,14 @@ public void notifyLeaderReady() { // Always start load services first configManager.getLoadManager().startLoadServices(); + // Reset every DataNode's last-contact time to now on (re)acquiring leadership: a stale + // timestamp + // left from a previous leadership term (while another ConfigNode was contacting the DataNodes) + // would otherwise let the metadata-broadcast verdict wrongly judge a live DataNode as fenced. 
+ DataNodeContactTracker.getInstance() + .onLeadershipAcquired( + configManager.getNodeManager().getRegisteredDataNodeLocations().keySet()); + if (CONF.isEnableTopologyProbing()) { configManager.getLoadManager().startTopologyService(); } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java index 5b505ec001bff..6d9fa18bab89d 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java @@ -40,6 +40,7 @@ import org.apache.iotdb.confignode.i18n.ConfigNodeMessages; import org.apache.iotdb.confignode.i18n.ProcedureMessages; import org.apache.iotdb.confignode.manager.ConfigManager; +import org.apache.iotdb.confignode.manager.lease.DataNodeContactTracker; import org.apache.iotdb.confignode.manager.load.balancer.region.GreedyCopySetRegionGroupAllocator; import org.apache.iotdb.confignode.manager.load.balancer.region.IRegionGroupAllocator; import org.apache.iotdb.confignode.manager.load.cache.node.NodeHeartbeatSample; @@ -455,6 +456,9 @@ public void removeDataNodePersistence(List removedDataNodes) PartitionMetrics.unbindDataNodePartitionMetricsWhenUpdate( MetricService.getInstance(), NodeUrlUtils.convertTEndPointUrl(dataNodeLocation.getClientRpcEndPoint())); + // Drop the removed DataNode's metadata-lease contact/capability state so it is not retained, + // and a future DataNode reusing the id cannot inherit stale fencing history. 
+ DataNodeContactTracker.getInstance().removeDataNode(dataNodeLocation.getDataNodeId()); } } From fbaaae621886d71c6b9d8e6bece6072e7aaf5934 Mon Sep 17 00:00:00 2001 From: JackieTien97 Date: Wed, 3 Jun 2026 08:15:37 +0800 Subject: [PATCH 08/17] Wire CreateTable pre-release through ClusterCachePropagator (template) First Tier-A procedure migrated off 'any unreachable DataNode fails the op'. CreateTableProcedure's PRE_RELEASE step now broadcasts via ClusterCachePropagator: it proceeds once every unacked DataNode is provably self-fenced (which, per Phase 1, fails closed on its stale table cache and resyncs on lease recovery, so it cannot serve dirty schema), and only fails when an unacked DataNode is not provably fenced. SchemaUtils gains preUpdateTableReq() (request builder) and broadcastTableUpdate() (returns the full per-nodeId response map the verdict needs); the legacy preReleaseTable() is now a thin wrapper returning only failures, so its other callers are unchanged. The happy path (all DataNodes ack -> PROCEED) is behaviorally identical; CreateTableProcedureTest still passes. COMMIT_RELEASE stays best-effort (warn-only) as before. This is the template for the remaining Tier-A procedures. 
--- .../procedure/impl/schema/SchemaUtils.java | 40 ++++++++++++++----- .../schema/table/CreateTableProcedure.java | 20 +++++++--- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java index 4b8d0a533afe3..efd7ab4216ff1 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java @@ -240,27 +240,47 @@ protected void onAllReplicasetFailure( } } - public static Map preReleaseTable( - final String database, - final TsTable table, - final ConfigManager configManager, - final String oldName) { + /** Build the PRE_UPDATE_TABLE request used to pre-release a table change to DataNodes. */ + public static TUpdateTableReq preUpdateTableReq( + final String database, final TsTable table, final String oldName) { final TUpdateTableReq req = new TUpdateTableReq(); req.setType(TsTableInternalRPCType.PRE_UPDATE_TABLE.getOperationType()); req.setTableInfo(TsTableInternalRPCUtil.serializeSingleTsTableWithDatabase(database, table)); req.setOldName(oldName); + return req; + } - final Map dataNodeLocationMap = - configManager.getNodeManager().getRegisteredDataNodeLocations(); + /** + * Broadcast a table update to exactly {@code targets} and return the full per-nodeId response map + * (both successes and failures). Used by {@link + * org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator}, which needs to know which + * DataNodes acknowledged in order to decide whether it is safe to proceed past the rest. 
+ */ + public static Map broadcastTableUpdate( + final TUpdateTableReq req, final Map targets) { final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.UPDATE_TABLE, req, dataNodeLocationMap); + new DataNodeAsyncRequestContext<>(CnToDnAsyncRequestType.UPDATE_TABLE, req, targets); CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - return clientHandler.getResponseMap().entrySet().stream() + return clientHandler.getResponseMap(); + } + + private static Map failedOnly(final Map responses) { + return responses.entrySet().stream() .filter(entry -> entry.getValue().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); } + public static Map preReleaseTable( + final String database, + final TsTable table, + final ConfigManager configManager, + final String oldName) { + return failedOnly( + broadcastTableUpdate( + preUpdateTableReq(database, table, oldName), + configManager.getNodeManager().getRegisteredDataNodeLocations())); + } + public static Map commitReleaseTable( final String database, final String tableName, diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/CreateTableProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/CreateTableProcedure.java index 05e0facb3018e..b4350fbcc5040 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/CreateTableProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/CreateTableProcedure.java @@ -29,6 +29,7 @@ import org.apache.iotdb.confignode.consensus.request.write.table.RollbackCreateTablePlan; import org.apache.iotdb.confignode.exception.DatabaseNotExistsException; import org.apache.iotdb.confignode.i18n.ProcedureMessages; +import 
org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.confignode.procedure.exception.ProcedureException; import org.apache.iotdb.confignode.procedure.impl.StateMachineProcedure; @@ -36,6 +37,7 @@ import org.apache.iotdb.confignode.procedure.state.schema.CreateTableState; import org.apache.iotdb.confignode.procedure.store.ProcedureType; import org.apache.iotdb.confignode.rpc.thrift.TDatabaseSchema; +import org.apache.iotdb.mpp.rpc.thrift.TUpdateTableReq; import org.apache.iotdb.rpc.TSStatusCode; import org.apache.tsfile.utils.ReadWriteIOUtils; @@ -151,16 +153,22 @@ protected void preCreateTable(final ConfigNodeProcedureEnv env) { } private void preReleaseTable(final ConfigNodeProcedureEnv env) { - final Map failedResults = - SchemaUtils.preReleaseTable(database, table, env.getConfigManager(), null); - - if (!failedResults.isEmpty()) { - // All dataNodes must clear the related schema cache + // Broadcast the pre-update to all DataNodes. Instead of failing whenever any DataNode is + // unreachable, proceed once every unacked DataNode is provably self-fenced: such a DataNode + // fails closed on its (now-stale) table cache and resyncs on lease recovery, so it cannot serve + // dirty schema. Only fail if an unacked DataNode is not provably fenced (it may still be + // serving clients). 
+ final TUpdateTableReq req = SchemaUtils.preUpdateTableReq(database, table, null); + final boolean proceeded = + new ClusterCachePropagator(env.getConfigManager()) + .propagate(targets -> SchemaUtils.broadcastTableUpdate(req, targets)); + + if (!proceeded) { LOGGER.warn( ProcedureMessages.FAILED_TO_SYNC_TABLE_PRE_CREATE_INFO_TO_DATANODE_FAILURE, database, table.getTableName(), - failedResults); + "an unreachable DataNode is not provably fenced"); setFailure( new ProcedureException(new MetadataException(ProcedureMessages.PRE_CREATE_TABLE_FAILED))); return; From f9accb58f8e165abd83c7b7f2bf1b439cdab8dff Mon Sep 17 00:00:00 2001 From: JackieTien97 Date: Wed, 3 Jun 2026 09:56:00 +0800 Subject: [PATCH 09/17] Add 1C3D IT: table DDL succeeds with one DataNode down End-to-end verification of the metadata-lease/fence HA change. With a short metadata_lease_fence_ms, the test starts 1 ConfigNode + 3 DataNodes, creates a database, stops one DataNode, and asserts CREATE TABLE still succeeds - whereas before the change a table DDL hard-failed whenever any DataNode could not acknowledge the cache-invalidation broadcast. Adds setMetadataLeaseFenceMs to the IT CommonConfig framework (interface + MppCommonConfig / MppSharedCommonConfig / RemoteCommonConfig) so the fence threshold can be shortened, keeping the proceed-past-fenced wait fast. 
--- .../env/cluster/config/MppCommonConfig.java | 6 + .../cluster/config/MppSharedCommonConfig.java | 7 ++ .../env/remote/config/RemoteCommonConfig.java | 5 + .../apache/iotdb/itbase/env/CommonConfig.java | 2 + .../it/schema/IoTDBTableDDLHAIT.java | 117 ++++++++++++++++++ 5 files changed, 137 insertions(+) create mode 100644 integration-test/src/test/java/org/apache/iotdb/relational/it/schema/IoTDBTableDDLHAIT.java diff --git a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppCommonConfig.java b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppCommonConfig.java index 4852b9d116e25..403671ac3c64f 100644 --- a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppCommonConfig.java +++ b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppCommonConfig.java @@ -85,6 +85,12 @@ public CommonConfig setMemtableSizeThreshold(long memtableSizeThreshold) { return this; } + @Override + public CommonConfig setMetadataLeaseFenceMs(long metadataLeaseFenceMs) { + setProperty("metadata_lease_fence_ms", String.valueOf(metadataLeaseFenceMs)); + return this; + } + @Override public CommonConfig setPartitionInterval(long partitionInterval) { setProperty("time_partition_interval", String.valueOf(partitionInterval)); diff --git a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppSharedCommonConfig.java b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppSharedCommonConfig.java index 582c9a049e492..df54eb295be48 100644 --- a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppSharedCommonConfig.java +++ b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppSharedCommonConfig.java @@ -61,6 +61,13 @@ public CommonConfig setMemtableSizeThreshold(long memtableSizeThreshold) { return this; } + @Override + public CommonConfig setMetadataLeaseFenceMs(long metadataLeaseFenceMs) { + 
cnConfig.setMetadataLeaseFenceMs(metadataLeaseFenceMs); + dnConfig.setMetadataLeaseFenceMs(metadataLeaseFenceMs); + return this; + } + @Override public CommonConfig setPartitionInterval(long partitionInterval) { cnConfig.setPartitionInterval(partitionInterval); diff --git a/integration-test/src/main/java/org/apache/iotdb/it/env/remote/config/RemoteCommonConfig.java b/integration-test/src/main/java/org/apache/iotdb/it/env/remote/config/RemoteCommonConfig.java index 48c157e957be8..0df0b61ce002f 100644 --- a/integration-test/src/main/java/org/apache/iotdb/it/env/remote/config/RemoteCommonConfig.java +++ b/integration-test/src/main/java/org/apache/iotdb/it/env/remote/config/RemoteCommonConfig.java @@ -44,6 +44,11 @@ public CommonConfig setMemtableSizeThreshold(long memtableSizeThreshold) { return this; } + @Override + public CommonConfig setMetadataLeaseFenceMs(long metadataLeaseFenceMs) { + return this; + } + @Override public CommonConfig setPartitionInterval(long partitionInterval) { return this; diff --git a/integration-test/src/main/java/org/apache/iotdb/itbase/env/CommonConfig.java b/integration-test/src/main/java/org/apache/iotdb/itbase/env/CommonConfig.java index dc21234e2bad2..41263510f38d7 100644 --- a/integration-test/src/main/java/org/apache/iotdb/itbase/env/CommonConfig.java +++ b/integration-test/src/main/java/org/apache/iotdb/itbase/env/CommonConfig.java @@ -32,6 +32,8 @@ public interface CommonConfig { CommonConfig setMemtableSizeThreshold(long memtableSizeThreshold); + CommonConfig setMetadataLeaseFenceMs(long metadataLeaseFenceMs); + CommonConfig setPartitionInterval(long partitionInterval); CommonConfig setCompressor(String compressor); diff --git a/integration-test/src/test/java/org/apache/iotdb/relational/it/schema/IoTDBTableDDLHAIT.java b/integration-test/src/test/java/org/apache/iotdb/relational/it/schema/IoTDBTableDDLHAIT.java new file mode 100644 index 0000000000000..1df3241b43d72 --- /dev/null +++ 
b/integration-test/src/test/java/org/apache/iotdb/relational/it/schema/IoTDBTableDDLHAIT.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.relational.it.schema; + +import org.apache.iotdb.it.env.EnvFactory; +import org.apache.iotdb.it.env.cluster.node.DataNodeWrapper; +import org.apache.iotdb.it.framework.IoTDBTestRunner; +import org.apache.iotdb.itbase.category.TableClusterIT; +import org.apache.iotdb.itbase.env.BaseEnv; + +import org.awaitility.Awaitility; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.runner.RunWith; + +import java.sql.Connection; +import java.sql.ResultSet; +import java.sql.Statement; +import java.util.concurrent.TimeUnit; + +import static org.junit.Assert.assertTrue; + +/** + * HA behavior of table-model DDL: a table DDL must broadcast a cache-invalidation to every + * DataNode. Before the metadata-lease/fence change it hard-failed whenever any DataNode was + * unreachable (so a single down DataNode broke CREATE TABLE, contradicting multi-replica HA). 
With + * the change the ConfigNode proceeds once the unreachable DataNode is provably self-fenced (it + * fails closed on its stale caches and resyncs on recovery, so it cannot serve dirty schema). + * + * <p>
This test stops one DataNode and asserts CREATE TABLE still succeeds. + */ +@RunWith(IoTDBTestRunner.class) +@Category({TableClusterIT.class}) +public class IoTDBTableDDLHAIT { + + @BeforeClass + public static void setUp() throws Exception { + // Small fence threshold so the ConfigNode can prove the stopped DataNode is self-fenced quickly + // (T_proceed = fence + ~5s internal margin), keeping the test fast. Live DataNodes keep + // heartbeating (~1s), so they do not spuriously fence. + EnvFactory.getEnv().getConfig().getCommonConfig().setMetadataLeaseFenceMs(2000); + EnvFactory.getEnv().initClusterEnvironment(1, 3); + } + + @AfterClass + public static void tearDown() throws Exception { + EnvFactory.getEnv().cleanClusterEnvironment(); + } + + @Test + public void createTableSucceedsWhileOneDataNodeIsDown() throws Exception { + final DataNodeWrapper liveDataNode = EnvFactory.getEnv().getDataNodeWrapper(0); + final DataNodeWrapper victimDataNode = EnvFactory.getEnv().getDataNodeWrapper(2); + + // Pin the connection to a DataNode we will keep alive, so stopping the victim cannot break it. + try (final Connection connection = + EnvFactory.getEnv() + .getConnection(liveDataNode, "root", "root", BaseEnv.TABLE_SQL_DIALECT); + final Statement statement = connection.createStatement()) { + statement.execute("CREATE DATABASE test_ha"); + statement.execute("USE test_ha"); + + // Sanity: with all DataNodes up the DDL broadcast acks everywhere and succeeds immediately. + statement.execute("CREATE TABLE t_all_up (region STRING TAG, temperature FLOAT FIELD)"); + + // Take one DataNode down. Its last successful ConfigNode contact is now frozen; after + // T_proceed the ConfigNode can treat it as self-fenced and stop waiting for its ack. + victimDataNode.stop(); + Assert.assertFalse("victim DataNode should be stopped", victimDataNode.isAlive()); + + // The DDL broadcast can no longer reach the stopped DataNode. 
Previously this hard-failed; + // now it must still succeed (after blocking ~T_proceed while the fence is proven). + statement.execute("CREATE TABLE t_after_down (region STRING TAG, temperature FLOAT FIELD)"); + + // Confirm the new table is really visible on the live DataNode. + Awaitility.await() + .atMost(30, TimeUnit.SECONDS) + .pollInterval(1, TimeUnit.SECONDS) + .until(() -> tableExists(statement, "t_after_down")); + assertTrue( + "CREATE TABLE must succeed with one DataNode down", + tableExists(statement, "t_after_down")); + } + } + + private static boolean tableExists(final Statement statement, final String tableName) + throws Exception { + try (final ResultSet resultSet = statement.executeQuery("SHOW TABLES")) { + while (resultSet.next()) { + if (tableName.equalsIgnoreCase(resultSet.getString(1))) { + return true; + } + } + } + return false; + } +} From 88c97bac14c1a767eb6ce6190d0ded6f3f55c864 Mon Sep 17 00:00:00 2001 From: JackieTien97 Date: Wed, 3 Jun 2026 10:19:32 +0800 Subject: [PATCH 10/17] Wire all alter/drop-table procedures through ClusterCachePropagator AbstractAlterOrDropTableProcedure is the base for all 8 table-mutation procedures (AddTableColumn, DropTableColumn, DropTable, RenameTable, RenameTableColumn, SetTableProperties, AlterTableColumnDataType, DeleteDevices), so wiring it migrates them all at once. Both the forward pre-release and the rollback pre-release now broadcast via ClusterCachePropagator and proceed once every unreachable DataNode is provably self-fenced, instead of hard-failing on the first unreachable DataNode. The rollback path is included so a down DataNode cannot block rollback either. commitRelease stays best-effort (warn-only) as before, since the change is already authoritative once committed. SchemaUtils gains rollbackUpdateTableReq() to mirror preUpdateTableReq(); both legacy preReleaseTable()/rollbackPreRelease() remain thin failure-returning wrappers. All 7 alter/drop procedure serialization tests still pass. 
--- .../procedure/impl/schema/SchemaUtils.java | 28 +++++++-------- .../AbstractAlterOrDropTableProcedure.java | 36 ++++++++++++------- 2 files changed, 37 insertions(+), 27 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java index efd7ab4216ff1..6011981c6ba11 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java @@ -309,11 +309,9 @@ public static Map commitReleaseTable( .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); } - public static Map rollbackPreRelease( - final String database, - final String tableName, - final ConfigManager configManager, - final @Nullable String oldName) { + /** Build the ROLLBACK_UPDATE_TABLE request used to roll back a pre-released table change. 
*/ + public static TUpdateTableReq rollbackUpdateTableReq( + final String database, final String tableName, final String oldName) { final TUpdateTableReq req = new TUpdateTableReq(); req.setType(TsTableInternalRPCType.ROLLBACK_UPDATE_TABLE.getOperationType()); final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); @@ -325,16 +323,18 @@ public static Map rollbackPreRelease( } req.setTableInfo(outputStream.toByteArray()); req.setOldName(oldName); + return req; + } - final Map dataNodeLocationMap = - configManager.getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.UPDATE_TABLE, req, dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - return clientHandler.getResponseMap().entrySet().stream() - .filter(entry -> entry.getValue().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) - .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + public static Map rollbackPreRelease( + final String database, + final String tableName, + final ConfigManager configManager, + final @Nullable String oldName) { + return failedOnly( + broadcastTableUpdate( + rollbackUpdateTableReq(database, tableName, oldName), + configManager.getNodeManager().getRegisteredDataNodeLocations())); } public static TSStatus executeInConsensusLayer( diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/AbstractAlterOrDropTableProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/AbstractAlterOrDropTableProcedure.java index 7cf1ff1c24f83..f1cf52e74caa1 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/AbstractAlterOrDropTableProcedure.java +++ 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/AbstractAlterOrDropTableProcedure.java @@ -27,11 +27,13 @@ import org.apache.iotdb.commons.schema.table.TsTable; import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; import org.apache.iotdb.confignode.i18n.ProcedureMessages; +import org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.confignode.procedure.exception.ProcedureException; import org.apache.iotdb.confignode.procedure.impl.StateMachineProcedure; import org.apache.iotdb.confignode.procedure.impl.schema.DataNodeTSStatusTaskExecutor; import org.apache.iotdb.confignode.procedure.impl.schema.SchemaUtils; +import org.apache.iotdb.mpp.rpc.thrift.TUpdateTableReq; import org.apache.tsfile.utils.ReadWriteIOUtils; import org.slf4j.Logger; @@ -91,17 +93,22 @@ protected void preRelease(final ConfigNodeProcedureEnv env) { } protected void preRelease(final ConfigNodeProcedureEnv env, final @Nullable String oldName) { - final Map failedResults = - SchemaUtils.preReleaseTable(database, table, env.getConfigManager(), oldName); - - if (!failedResults.isEmpty()) { - // All dataNodes must clear the related schema cache + // Proceed once every unreachable DataNode is provably self-fenced instead of hard-failing the + // DDL: a fenced DataNode fails closed on its now-stale table cache and resyncs on lease + // recovery, so it cannot serve dirty schema. Only fail if an unacked DataNode is not provably + // fenced (it may still be serving clients). 
+ final TUpdateTableReq req = SchemaUtils.preUpdateTableReq(database, table, oldName); + final boolean proceeded = + new ClusterCachePropagator(env.getConfigManager()) + .propagate(targets -> SchemaUtils.broadcastTableUpdate(req, targets)); + + if (!proceeded) { LOGGER.warn( ProcedureMessages.FAILED_TO_PRE_RELEASE_FOR_TABLE_TO_DATANODE_FAILURE_RESULTS, getActionMessage(), database, table.getTableName(), - failedResults); + "an unreachable DataNode is not provably fenced"); setFailure( new ProcedureException( new MetadataException( @@ -138,18 +145,21 @@ protected void rollbackPreRelease(final ConfigNodeProcedureEnv env) { protected void rollbackPreRelease( final ConfigNodeProcedureEnv env, final @Nullable String tableName) { - final Map failedResults = - SchemaUtils.rollbackPreRelease( - database, table.getTableName(), env.getConfigManager(), tableName); - - if (!failedResults.isEmpty()) { - // All dataNodes must clear the related schema cache + // A down DataNode must not block rollback either: proceed past provably-fenced DataNodes (which + // resync on recovery) and only fail on an unacked DataNode that is not provably fenced. 
+ final TUpdateTableReq req = + SchemaUtils.rollbackUpdateTableReq(database, table.getTableName(), tableName); + final boolean proceeded = + new ClusterCachePropagator(env.getConfigManager()) + .propagate(targets -> SchemaUtils.broadcastTableUpdate(req, targets)); + + if (!proceeded) { LOGGER.warn( ProcedureMessages.FAILED_TO_ROLLBACK_PRE_RELEASE_FOR_TABLE_INFO_TO_DATANODE, getActionMessage(), database, table.getTableName(), - failedResults); + "an unreachable DataNode is not provably fenced"); setFailure( new ProcedureException( new MetadataException( From d6f0e8ee24b20d4aea63ba9eaaae83250c8b5121 Mon Sep 17 00:00:00 2001 From: JackieTien97 Date: Wed, 3 Jun 2026 10:31:47 +0800 Subject: [PATCH 11/17] Wire tree-model schema-cache invalidation through ClusterCachePropagator DeleteTimeSeriesProcedure.invalidateCache is the shared static helper that broadcasts INVALIDATE_MATCHED_SCHEMA_CACHE; AlterEncodingCompressorProcedure reuses it, so wiring it covers both. It now proceeds once every unreachable DataNode is provably self-fenced instead of hard-failing on the first one. It runs before the physical delete in the state machine, so the 'delete only after PROCEED' ordering holds with no reordering. Because the propagator may re-broadcast while waiting for unacked DataNodes, the broadcast closure builds a fresh request with patternTreeBytes.duplicate() on every attempt, so a consumed buffer can never be re-sent as an empty (silently-successful) invalidation. DeleteTimeSeries and AlterEncodingCompressor serialization tests still pass. 
--- .../schema/DeleteTimeSeriesProcedure.java | 47 +++++++++++-------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteTimeSeriesProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteTimeSeriesProcedure.java index 0b5e45b5ca1f5..8bd396c72daf2 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteTimeSeriesProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteTimeSeriesProcedure.java @@ -32,6 +32,7 @@ import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeDeleteTimeSeriesPlan; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeEnrichedPlan; import org.apache.iotdb.confignode.i18n.ProcedureMessages; +import org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.confignode.procedure.exception.ProcedureException; import org.apache.iotdb.confignode.procedure.impl.StateMachineProcedure; @@ -197,26 +198,34 @@ public static void invalidateCache( final String requestMessage, final Consumer setFailure, final boolean needLock) { - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.INVALIDATE_MATCHED_SCHEMA_CACHE, - new TInvalidateMatchedSchemaCacheReq(patternTreeBytes).setNeedLock(needLock), - dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - final Map statusMap = clientHandler.getResponseMap(); - for (final TSStatus status : statusMap.values()) { + // Proceed once every unreachable DataNode is provably self-fenced (it 
fails closed on its + // schema cache and resyncs on recovery, so it cannot serve the to-be-deleted/altered series), + // instead of hard-failing on the first unreachable DataNode. This runs before the physical + // delete in the state machine, so the "delete only after PROCEED" ordering holds. The + // ConfigCachePropagator may re-broadcast while waiting, so build a fresh request (with a + // duplicated buffer) on every attempt rather than reusing a possibly-consumed one. + final boolean proceeded = + new ClusterCachePropagator(env.getConfigManager()) + .propagate( + targets -> { + final DataNodeAsyncRequestContext + clientHandler = + new DataNodeAsyncRequestContext<>( + CnToDnAsyncRequestType.INVALIDATE_MATCHED_SCHEMA_CACHE, + new TInvalidateMatchedSchemaCacheReq(patternTreeBytes.duplicate()) + .setNeedLock(needLock), + targets); + CnToDnInternalServiceAsyncRequestManager.getInstance() + .sendAsyncRequestWithRetry(clientHandler); + return clientHandler.getResponseMap(); + }); + if (!proceeded) { // All dataNodes must clear the related schemaEngine cache - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error( - ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_TIMESERIES, - requestMessage); - setFailure.accept( - new ProcedureException( - new MetadataException(ProcedureMessages.INVALIDATE_SCHEMAENGINE_CACHE_FAILED))); - return; - } + LOGGER.error( + ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_TIMESERIES, requestMessage); + setFailure.accept( + new ProcedureException( + new MetadataException(ProcedureMessages.INVALIDATE_SCHEMAENGINE_CACHE_FAILED))); } } From f87dae6a5d8d8419b4184285b9e77a19d4de6d1e Mon Sep 17 00:00:00 2001 From: JackieTien97 Date: Wed, 3 Jun 2026 13:54:20 +0800 Subject: [PATCH 12/17] Centralize schema-cache invalidation through ClusterCachePropagator Add SchemaUtils.invalidateMatchedSchemaCache() as the single place that broadcasts INVALIDATE_MATCHED_SCHEMA_CACHE via 
ClusterCachePropagator (proceed once every unreachable DataNode is provably self-fenced) with the buffer.duplicate() safety against the propagator's re-broadcast on WAIT. Route all five INVALIDATE_MATCHED_SCHEMA_CACHE callers through it: - AlterLogicalViewProcedure, DeleteLogicalViewProcedure (views) - DeactivateTemplateProcedure (template) - DeleteTimeSeriesProcedure, AlterTimeSeriesDataTypeProcedure (static helpers, also used by AlterEncodingCompressorProcedure) - refactored off their inline broadcasts onto the shared helper. Each keeps its own error message and runs before its physical delete/alter step, so the 'delete/alter only after PROCEED' ordering holds. The region-task broadcasts (CONSTRUCT_*_BLACK_LIST, ALTER_VIEW, ALTER_TIMESERIES_DATATYPE) are deliberately untouched - they go through region consensus. Affected procedure tests still pass. --- cluster-metadata-ha-fencing-design.md | 308 ++++++++++++++++ .../schema/AlterLogicalViewProcedure.java | 31 +- .../AlterTimeSeriesDataTypeProcedure.java | 32 +- .../schema/DeactivateTemplateProcedure.java | 37 +- .../schema/DeleteLogicalViewProcedure.java | 30 +- .../schema/DeleteTimeSeriesProcedure.java | 26 +- .../procedure/impl/schema/SchemaUtils.java | 33 ++ table-model-ddl-ha-analysis.md | 342 ++++++++++++++++++ 8 files changed, 723 insertions(+), 116 deletions(-) create mode 100644 cluster-metadata-ha-fencing-design.md create mode 100644 table-model-ddl-ha-analysis.md diff --git a/cluster-metadata-ha-fencing-design.md b/cluster-metadata-ha-fencing-design.md new file mode 100644 index 0000000000000..eab7d82969b6d --- /dev/null +++ b/cluster-metadata-ha-fencing-design.md @@ -0,0 +1,308 @@ +# ConfigNode→DataNode 元数据广播的高可用改造:租约 + Fencing 统一方案 + +> 状态:待 review +> 关联文档:`table-model-ddl-ha-analysis.md`(表模型 DDL 的根因分析,本文是其推广到全集群的通用方案) +> 适用版本:2.0.x(master) +> 修订记录: +> - v2 — 移除 epoch 代次与 CN 侧 laggards,简化为"两支柱(Lease + Verdict)"。 +> - v3 — 移除 `tier_b_fail_closed` / `enable_metadata_lease_fencing` 配置;放行判定改用 `hbAge` 信号。 
+> - v4 — `T_fence` 默认 20s(与 `failureDetectorFixedThresholdInMs` 对齐);Tier-A 内拆"加性即时放行 / 破坏类等待"。 +> - **v5(本版,依据第二轮 code-grounded review)**: +> 1. **修正 hbAge 安全证明**:现有样本记录的是心跳**发送时刻**(echoed `heartbeatTimestamp`),且 `onError` 用**当前时刻**写 Unknown 样本——都不能用于判 fence。改为 CN 侧新增**专用"最近成功收到心跳响应时刻"**(收到响应时用 CN 本地时钟打点,仅成功更新、绝不被失败样本前移),并显式声明有界延迟 + 心跳连接双向对称假设(§2.3/§2.6)。 +> 2. **取消"加性即时跳过"**(评审点 2/4):它会制造"未 fenced 的 laggard"(Running 但漏广播、又被跳过 → 新 schema 长期不可见)。改为**所有 Tier-A 统一"未 ack 必须 ack 或 已证实 fenced"**。代价:任一 DN 不可达时所有 Tier-A 都等 `T_proceed`(放弃 v4 的加性快路径)。 +> 3. **Tier-B 按 加资源 vs 撤资源/降级/控制 再分**(评审点 3):DROP UDF/Trigger/Plugin、SET SYSTEM STATUS ReadOnly 这类按 Tier-A 强一致处理(陈旧 DN 会继续跑旧资源 / 继续写入,不是良性漂移)。 +> 4. **能力位先于一切判定**(评审点 4):不支持 fencing 的旧 DN 一律 UNSAFE。 +> 5. **FENCED-SAFE 条件收紧**(评审点 5):`Removing` 不等于已 fence;须"已移出路由、不再接受 client"或"DN 显式 ack fence/shutdown"。 + +## 0. TL;DR + +- 范围:**整个 CN→所有 DN 的元数据广播**,穷尽排查出**约 30 个操作**(四种失败语义 HARD-FAIL / RETRY-THEN-FAIL / SOFT / TIMEOUT-ABANDON)。 +- 共性:CN 提交元数据后广播"失效/更新"给**所有**已注册 DN;DN 上一批**被 CN 推送维护的本地缓存**被读写关键路径**直接信任**——"活着但分区"的陈旧 DN 会产生脏数据 / 错误结果 / 安全漏洞。 +- 现状缺陷:把"DN 真宕机"(缓存随进程消失,安全)与"DN 活着但分区"(缓存仍在,危险)混为一谈而一并失败(牺牲可用性);同时对 SOFT/TIMEOUT 类又静默放过(留下长期不一致与权限漏洞)。 +- 统一方案两根支柱: + 1. **Lease/Fence(DN 侧)**:DN 的 CN-推送缓存只在"持有有效心跳租约"期间可信;`T_fence` 内收不到 CN 心跳即自我隔离(作废缓存 + Tier-A fail-closed),恢复时 **DN 自驱 resync** 后再解除。 + 2. **Verdict(CN 侧)**:把 ~20 处散落的"任一失败即 setFailure"收敛到一个统一判定器:未 ack 的 DN **要么 ack,要么已证实 fenced(或已移出路由)**,否则 WAIT/FAIL;据此输出 PROCEED / WAIT / FAIL。 +- **不引入 epoch/laggards**(你已确认的取舍):靠"不跳过未 ack 的 Running DN"+ 修正后的"成功响应接收时刻"信号 + 有界延迟/连接对称假设来保证正确性(§2.3)。代价:任一 DN 不可达时所有 Tier-A DDL 等 `T_proceed≈25s`(全员存活恒零等待)。 + +--- + +## 1. 
问题范围:到底有哪些操作"这样" + +排查口径:CN 侧获取 `getRegisteredDataNodeLocations()`(所有已注册 DN,**不按存活过滤**)→ 广播失效/更新 RPC(异步 `CnToDnInternalServiceAsyncRequestManager.sendAsyncRequestWithRetry`,或同步 `ConfigNodeProcedureEnv.invalidateCache` / `SyncDataNodeClientPool`)→ 根据响应决定成败。 + +> 异步重试上限 `MAX_RETRY_NUM = 6`(`AsyncRequestManager.java:56`);DN 只有回 SUCCESS 才从重试集合移除(`AsyncRequestContext` Javadoc L51),不可达 DN 由 `DataNodeTSStatusRPCHandler.onError`(L73-90)写入一个 `EXECUTE_STATEMENT_ERROR` 状态,故调用方 squash 后必见失败 → 触发各自的失败处理。 + +### 1.1 全量操作清单(按失败语义分组) + +**A. HARD-FAIL — 任一 DN 不可达即整体失败 + 回滚(正确性敏感,本提案主目标)** + +| 操作 | 类 / 入口 | 广播 RPC | 目标 DN 缓存 | +|---|---|---|---| +| 表模型 DDL(CREATE/ADD/DROP/SET/RENAME/DROP TABLE 等) | `procedure/impl/schema/table/*`(见上一篇) | `UPDATE_TABLE` / `INVALIDATE_TABLE_CACHE` / `INVALIDATE_COLUMN_CACHE` | `DataNodeTableCache` / `TableDeviceSchemaCache` | +| DELETE TIMESERIES | `DeleteTimeSeriesProcedure`(`CLEAN_DATANODE_SCHEMA_CACHE` L126-128,helper L194-221) | `INVALIDATE_MATCHED_SCHEMA_CACHE` | 树 schema cache | +| DELETE DATABASE | `DeleteDatabaseProcedure`(`INVALIDATE_CACHE` L100-109)→ `ConfigNodeProcedureEnv.invalidateCache` L164-221(**同步串行**,Unknown 重试 10×500ms 后 false) | `INVALIDATE_PARTITION_CACHE` + `INVALIDATE_SCHEMA_CACHE` | 分区 cache + 树/表 schema cache | +| DEACTIVATE TEMPLATE | `DeactivateTemplateProcedure`(L183-206) | `INVALIDATE_MATCHED_SCHEMA_CACHE` | 树 schema cache | +| UNSET TEMPLATE | `UnsetTemplateProcedure`(`executeInvalidateCache` L157-181) | `UPDATE_TEMPLATE`(`INVALIDATE_TEMPLATE_SET_INFO`) | `ClusterTemplateManager` | +| SET TEMPLATE(pre/commit/rollback 三处) | `SetTemplateProcedure`(L223-242 / L408-433 / L510-538) | `UPDATE_TEMPLATE`(`ADD_TEMPLATE_PRE_SET_INFO`/`COMMIT_TEMPLATE_SET_INFO`/`INVALIDATE_TEMPLATE_SET_INFO`) | `ClusterTemplateManager` | +| SET / UNSET TTL | `SetTTLProcedure`(`UPDATE_DATANODE_CACHE` L90-129;rollback L222-250) | `SET_TTL` | `DataNodeTTLCache` | +| ALTER LOGICAL VIEW | `AlterLogicalViewProcedure`(两次 `invalidateCache` 
L100-103、L193;helper L125-146) | `INVALIDATE_MATCHED_SCHEMA_CACHE` | 树 schema/view cache | +| DELETE LOGICAL VIEW | `DeleteLogicalViewProcedure`(L169-189) | `INVALIDATE_MATCHED_SCHEMA_CACHE` | 树 schema/view cache | +| ALTER ENCODING/COMPRESSOR | `AlterEncodingCompressorProcedure`(`CLEAR_CACHE` L134-138,复用 DeleteTimeSeries helper) | `INVALIDATE_MATCHED_SCHEMA_CACHE` | 树 schema cache | +| ALTER TIMESERIES DATATYPE | `AlterTimeSeriesDataTypeProcedure`(`CLEAR_CACHE` L119-133,helper L246-273;**不支持回滚**) | `INVALIDATE_MATCHED_SCHEMA_CACHE` | 树 schema cache | +| DELETE DEVICES(表模型) | `DeleteDevicesProcedure`(L224-245) | `INVALIDATE_MATCHED_TABLE_DEVICE_CACHE` | `TableDeviceSchemaCache` | +| CREATE/DROP FUNCTION(UDF) | `UDFManager.createFunction/dropFunction`(**非 procedure**,L135-138 / L180-183 squash 即返回错误) | `CREATE_FUNCTION` / `DROP_FUNCTION` | `UDFManagementService` | +| CREATE/DROP/ACTIVE TRIGGER | `CreateTriggerProcedure` / `DropTriggerProcedure`(env L553-595) | `*_TRIGGER_INSTANCE` | `TriggerManagementService` | +| CREATE/DROP PIPE PLUGIN | `CreatePipePluginProcedure`(L206-215)/ `DropPipePluginProcedure`(L173-180) | `CREATE_PIPE_PLUGIN` / `DROP_PIPE_PLUGIN` | `PipePluginAgent` | +| SET SPACE / THROTTLE QUOTA | `ClusterQuotaManager`(L88-96 / L196-203,**非 procedure**,squash 即返回) | `SET_SPACE_QUOTA` / `SET_THROTTLE_QUOTA` | DN 配额执行缓存 | +| SET SYSTEM STATUS / LOAD CONFIG / SET CONFIG / START-STOP REPAIR | `NodeManager`(L1044-1196,**非 procedure**) | `SET_SYSTEM_STATUS` / `LOAD_CONFIGURATION` / `SET_CONFIGURATION` / `*_REPAIR_DATA` | DN 运行态/配置 | + +**B. RETRY-THEN-FAIL — 重试 1 次后失败(已部分容忍,但仍会失败)** + +| 操作 | 类 | 广播 RPC | +|---|---|---| +| CREATE/DROP/ALTER TOPIC、CREATE/DROP CONSUMER GROUP | `AbstractOperateSubscriptionProcedure` 子类(RETRY_THRESHOLD=1) | `TOPIC_PUSH_*` / `CONSUMER_GROUP_PUSH_*` | + +**C. 
SOFT — 仅告警,靠后台 reconcile(已可用,但一致性靠周期任务兜底)** + +| 操作 | 类 | 说明 | +|---|---|---| +| CREATE/START/STOP/DROP/ALTER PIPE | `AbstractOperatePipeProcedureV2` 子类 | 失败 warn,靠 `PipeMetaSyncProcedure` 心跳周期 reconcile | +| `extendSchemaTemplate` | `ClusterSchemaManager` L1103-1190(**非 procedure**) | 返回错误状态但不回滚,CN 与 DN 可漂移 | + +**D. TIMEOUT-ABANDON — 超时后静默丢弃(隐患最大)** + +| 操作 | 类 | 说明 | +|---|---|---| +| GRANT/REVOKE/DROP USER·ROLE/ALTER USER 等权限变更 | `AuthOperationProcedure`(L98-127) | 每 DN 单独同步重试,超 `datanodeTokenTimeoutMS`(默认 **180s**)即**静默移除**该 DN。该 DN 权限缓存**长期陈旧**直到重启或下次相关 auth 操作——**安全正确性漏洞** | + +> 排除项(确认不属于本问题):`CreateCQProcedure`(无 DN 广播,仅写 CN consensus + 本地调度);区域级 RPC(`CREATE_*_REGION`/`DELETE_REGION`/`CHANGE_REGION_LEADER` 等,只发副本所在 DN,走数据面 quorum,不在本提案范围)。 + +### 1.2 DN 上"被 CN 推送维护"的缓存全集(即需要被租约/fencing 覆盖的对象) + +| 缓存 / 注册表 | 类 | 由哪些 CN RPC 改写 | 失效后回源方式 | 类型 | +|---|---|---|---|---| +| 表 schema | `DataNodeTableCache` | `UPDATE_TABLE`/`INVALIDATE_TABLE_CACHE`/`INVALIDATE_SCHEMA_CACHE` | 回 CN 拉 + 启动 `init(tableInfo)`(`DataNode.java:523`) | 注册表型 | +| 表设备属性/last | `TableDeviceSchemaCache` | `INVALIDATE_TABLE_CACHE`/`INVALIDATE_COLUMN_CACHE`/`INVALIDATE_MATCHED_TABLE_DEVICE_CACHE`/`INVALIDATE_LAST_CACHE` | 懒加载,回 schema region | 懒加载型 | +| 树 schema/last/view | `TreeDeviceSchemaCacheManager` | `INVALIDATE_SCHEMA_CACHE`/`INVALIDATE_MATCHED_SCHEMA_CACHE`/`INVALIDATE_LAST_CACHE` | 懒加载,回 schema region | 懒加载型 | +| 权限 | `AuthorityChecker`/`AuthorityFetcher` | `INVALIDATE_PERMISSION_CACHE`(同步) | 懒加载回 CN;心跳触发 `refreshToken()`(`DataNodeInternalRPCServiceImpl:2273`) | 懒加载型 | +| 模板 | `ClusterTemplateManager` | `UPDATE_TEMPLATE`/`INVALIDATE_SCHEMA_CACHE` | 启动 `updateTemplateSetInfo`(`DataNode.java:497`) | 注册表型 | +| TTL | `DataNodeTTLCache` | `SET_TTL` | 启动 `initTTLInformation`(`DataNode.java:516`) | 注册表型 | +| 分区路由 | `ClusterPartitionFetcher` | `INVALIDATE_PARTITION_CACHE`/`UPDATE_REGION_ROUTE_MAP` | 懒加载回 CN | 懒加载型 | +| UDF | `UDFManagementService` | `CREATE_FUNCTION`/`DROP_FUNCTION` | 启动 
`prepareUDFResources`(`DataNode.java:784`,下载 jar) | 注册表型 | +| Trigger | `TriggerManagementService` | `*_TRIGGER_INSTANCE` | 启动 `prepareTriggerResources` | 注册表型 | +| Pipe Plugin | `PipePluginAgent` | `CREATE/DROP_PIPE_PLUGIN` | 启动 `preparePipeResources` | 注册表型 | +| 配额 | DN 配额执行缓存 | `SET_SPACE/THROTTLE_QUOTA` | 无主动回源(只能等下次推送) | 注册表型(需补回源) | + +> "懒加载型 vs 注册表型"决定了 fence 恢复时的 resync 方式(§2.5):懒加载型只需作废、后续按需回源即可;注册表型需在恢复服务前主动重拉。 + +**关键事实(决定方案可行性,已据 review 核对代码更正)**: +- 心跳为 **CN→DN**,仅 Raft leader 发送(`HeartbeatService.java:128`),默认 **1s** 一次(`heartbeatIntervalInMs=1000`)。 +- DN 失联判定:默认 `FixedDetector`,阈值 **20s**(`failureDetectorFixedThresholdInMs=20000`)→ `NodeStatus.Unknown`;可选 Phi-Accrual。`NodeStatus` 仅 `Running/Unknown/Removing/ReadOnly`。 +- **⚠ 现有心跳样本时间戳不能直接用于判 fence**:`HeartbeatService.genHeartbeatReq` 写 `heartbeatTimestamp = System.nanoTime()`(**发送时刻**),DN 原样回填,`NodeHeartbeatSample(resp)` 记录的就是这个**发送时刻**(`NodeHeartbeatSample.java:55`);并且 `DataNodeHeartbeatHandler.onError` 在连接断开时用 **当前时刻** 写一个 `Unknown` 样本(`super(System.nanoTime())`)。→ 直接复用会导致:延迟心跳使 CN 以为 hbAge 已超而 DN 刚续约;失败样本不断把时间戳前移使 hbAge 反而不增长。**故本方案 CN 侧新增专用"最近成功收到心跳响应时刻",见 §2.3/§2.6。** +- 心跳**当前不带任何"元数据版本"字段**;DN 重启 resync 是**纯 pull**(`storeRuntimeConfigurations` L486-530)。 +- 真宕机 DN 重启必走 resync 重建缓存,**不可能用旧缓存服务**;缺的唯一一环就是"**活着但分区**的 DN 没有任何机制主动停用旧缓存"。 + +--- + +## 2. 
统一机制设计:Lease + Fence(DN 侧)+ Verdict(CN 侧) + +### 2.1 一句话模型 + +> **DN 的"CN-推送缓存"只有在它"持有有效心跳租约"时才可用于服务;租约失效即 fence(作废 + Tier-A fail-closed),fence 恢复时由 DN 自己先 resync 再解除。CN 在确信某未 ack 的 DN"已被租约隔离 / 已移出路由"后才跳过它继续提交,否则等待或失败。** + +| 支柱 | 解决的失效场景 | 机制 | +|---|---|---| +| **Lease/Fence(DN 侧)** | DN **分区或宕机**(收不到心跳) | DN 本地计时(DN 自己的 receive-time):`now - lastCnHeartbeat > T_fence` → fence;恢复时 DN 自驱 resync | +| **Verdict(CN 侧)** | CN 决定能否跳过未 ack 的 DN | 据 CN 侧"最近成功收到响应时刻"算 `hbAge`,结合能力位/路由状态判定 → PROCEED / WAIT / FAIL | + +### 2.2 正确性分层(Tier) + +- **Tier-A(脏数据 / 错误结果 / 安全)**:表 schema、树 schema、设备属性/last、模板、view、datatype/encoding、TTL、**权限**、分区路由。Fence = **作废 + fail-closed**:隔离期间相关读写/鉴权回 CN 现拉,CN 不可达则**拒绝**(宁可不可用,绝不写脏 / 不返脏 / 不放行越权)。CN 侧未 ack 的 DN 须"ack 或 已证实 fenced"才放行。 +- **Tier-B(功能/一致性,非静默数据损坏)**,再按方向细分(评审点 3): + - **B-加资源(soft)**:CREATE FUNCTION/TRIGGER/PIPE PLUGIN、CREATE TOPIC/CONSUMER、配额上调等。漏掉 → 依赖该资源的操作自然失败(如 DN 没有该 UDF → 该查询失败),**不 fail-closed**,靠恢复 resync 收敛。 + - **B-撤资源/降级/控制(按 Tier-A 强一致)**:**DROP** FUNCTION/TRIGGER/PIPE PLUGIN(陈旧 DN 会**继续执行**已删资源)、**SET SYSTEM STATUS = ReadOnly**(陈旧 DN 会**继续接受写入**)、disable/控制类。这些不是良性漂移,须与 Tier-A 同等:"未 ack 必须 ack 或 已证实 fenced",必要时 fail-closed(fence 期间该 DN 本就拒服务)。 + +### 2.3 未 ack DN 的处理:为什么"统一等待/失败"且不引入 epoch/laggards + +回应评审点 1/2/4。纯靠 hbAge 时序"推断 DN 已 fence"不够稳健,且"跳过未 ack DN"会制造**未 fenced 的 laggard**(如 DN 漏掉 CREATE TABLE/ADD COLUMN 广播但心跳仍 Running——它不会自我 fence,而 `DataNodeTableCache.getTable` 仅在 pre-update map 命中时才回 CN 拉取,否则直接返回/抛不存在 `DataNodeTableCache.java:329`,导致新 schema 长期不可见)。本版采用更保守、更简单的规则: + +1. **取消"加性即时跳过"**:所有 Tier-A 统一走"未 ack 的 DN 必须 ack 或 已证实 fenced(或已移出路由)"。 + - 未 ack 且 **心跳仍新鲜(仍可能在服务)** → UNSAFE → **WAIT/FAIL**,绝不跳过 → 不会留下陈旧 laggard。 + - 未 ack 且 **不可达**(收不到心跳)→ `T_fence` 后自我 fence;CN 据"已证实 fenced"放行;该 DN 恢复时由本地 fence 标志驱动 resync 自愈。 + - 由此 ADD COLUMN 不再需要单独证明"旧 schema 下不带新列的写入语义安全"(也无法对"新增 TAG 列改变设备身份"等情形一概证明)——它和其他 Tier-A 一样必须 ack-或-fenced。 +2. 
**修正 CN 侧"联系"信号**:新增**专用"最近成功收到心跳响应时刻" `lastResp(dn)`**——CN 在**收到成功响应时**用 **CN 本地时钟**打点,**只在成功时更新,绝不被 `onError`/Unknown 样本前移**(与现有 load-cache 样本分离,不复用 §1.2 所述的发送时刻样本)。由因果关系 `dn_renew ≤ lastResp`(DN 先收到请求并续约 → 回响应 → CN 收到),故 `lastResp` 是 DN 续约时刻的**可靠上界**。 +3. **不引入 epoch/laggards**:既不跳过未 ack 的活跃 DN,就不存在"Running 但陈旧被放行"需要 epoch 兜底;不可达 DN 由 fence + 恢复 resync 自愈,CN 无需记 laggards。**代价**:放弃 v4 的加性快路径——任一 DN 不可达时所有 Tier-A 都等 `T_proceed`(§2.6)。 + +**显式前提假设(必须写明)**:上述时序论证依赖 +- **(a) 有界延迟**:在途心跳/RPC 在 Δ 内送达或被丢弃; +- **(b) 心跳连接双向对称**:心跳是同一连接上的请求/响应,"DN 收到请求并续约" ⟺ "CN 收到响应"(modulo Δ)。于是"DN 在续约" ⟺ "CN 在持续收到响应" ⟺ "`hbAge` 小" ⟺ "不判 FENCED-SAFE"。 +- 在 (a)(b) 下,"单向可达 + 选择性丢失失效广播"(CN→DN 投递心跳却丢弃失效、DN→CN 响应丢失)这一可能让"DN 续约却漏失效、而 CN 误判 fenced"的危险态不会发生。**若要去除该假设**,需引入 epoch/token 正向确认 currency(本版按选择不做,作为未来增强备选)。 + +### 2.4 CN 侧统一判定器(取代 ~20 处散落的 setFailure) + +``` +enum Verdict { PROCEED, WAIT, FAIL } + +// lastResp(dn): CN 本地时钟记录的"最近一次成功收到该 DN 心跳响应"时刻; +// 仅成功响应更新,绝不被 onError/Unknown 样本前移(§2.3,评审点 1)。 +// hbAge(dn) = now - lastResp(dn)。 +// 统一规则:没有"加性跳过";能力位先判(评审点 4);FENCED-SAFE 条件收紧(评审点 5)。 +Verdict propagate(payload, opts): + resp = sendAsyncRequestWithRetry(allRegisteredDNs, rpc) + for dn not ACKED: + if isRetiredFromRouting(dn) or ackedFenceOrShutdown(dn): + -> SAFE_GONE // 已移出路由/不再接受 client,或显式 ack fence/shutdown(评审点 5) + elif not supportsFencing(dn): -> UNSAFE // 旧 DN 不会自我 fence → 回退严格语义(能力位先判,评审点 4) + elif hbAge(dn) >= T_proceed: -> FENCED_SAFE // 已证实自我 fence(T_proceed = T_fence + margin) + else: -> UNSAFE // 心跳仍新鲜(仍可能服务)或瞬时错误 → 再等 + if all ACKED / SAFE_GONE / FENCED_SAFE: return PROCEED + if waited > maxWait: return FAIL // 半坏 DN:心跳新鲜却持续失败 → 失败(保守正确) + return WAIT // 循环本状态;hbAge 随时间增长,到阈值转 FENCED_SAFE +``` + +要点: +- **能力位先判**:不支持 fencing 的 DN 永远不会被判 FENCED-SAFE(评审点 4,解决 v4 自相矛盾)。 +- **FENCED-SAFE 仅来自**:`hbAge ≥ T_proceed`(且支持 fencing)**或** 已移出路由/显式 ack——`Removing` 本身不算(评审点 5,`Removing` 仍可能服务 client)。 +- **不再有"加性跳过"**:未 ack 的活跃 DN 一律 UNSAFE → WAIT/FAIL(评审点 2)。 +- **PROCEED**:所有未 ack 
的 DN 都已 ack / SAFE_GONE / FENCED_SAFE 才提交。 +- **WAIT**:存在 UNSAFE(hbAge 未到 `T_proceed`,可能仍在服务)。循环等待至其 ack / 变 FENCED_SAFE / 超 `maxWait`。 +- **FAIL**:超 `maxWait` 仍有 UNSAFE(典型:心跳通但广播 RPC 持续失败的半坏 DN)→ 失败,维持现状语义。 + +各 procedure 把"任一失败即 setFailure"替换为"调用 `propagate(...)` 并按 Verdict 驱动状态机";`DeleteDatabaseProcedure`(同步串行)与 `AuthOperationProcedure`(180s 静默丢弃)统一切到这套——**修复权限 D 类漏洞**:不再静默丢弃,未 ack 的活跃 DN 会 WAIT/FAIL,不可达 DN fence 后权限缓存作废、恢复 resync。 + +### 2.5 DN 侧改造 + +1. **记录租约(DN 本地,sound)**:`getDataNodeHeartBeat`(`DataNodeInternalRPCServiceImpl.java:2226`)记录 `lastCnHeartbeatNanos = System.nanoTime()`(DN 自己的 receive-time,与 CN 侧信号无关)。DN 用它判自己是否该 fence——这一侧从来不是问题所在;评审点 1 针对的是 CN 侧的推断信号。 +2. **Fence 触发**:采用**惰性检查**(读写/鉴权入口处 `now - lastCnHeartbeatNanos > T_fence` 即视为 fenced),无需后台线程、无并发作废与读者竞争。(已实现:`MetadataLeaseManager.isFenced()`。) +3. **Tier-A fail-closed 注入点**(fenced 期间,**Phase 1 已全部落地、各带 TDD**): + - **表 schema**:`DataNodeTableCache.getTableInWrite/getTable` 抛 `INTERNAL_REQUEST_RETRY_ERROR`(推送型缓存、无回源,只能硬失败)。`TableHeaderSchemaValidator` 的所有读都经此路径,故**自动覆盖**,无需单独注入。 + - **树 schema**:`TreeDeviceSchemaCacheManager` 六个读方法统一经 `getDeviceSchemaOrMissWhenFenced(...)`,fenced 时报 **cache miss → 回源到 quorum 支撑的 SchemaRegion**(读穿透型缓存,回源即权威;比硬失败更可用:SchemaRegion 多数派可达即成功,不可达才 fail-closed)。恢复时 `cleanUp` 作废分区期间未复读的旧条目。 + - **权限**:`ClusterAuthorityFetcher.checkCacheAvailable()` 在 `isFenced()` 时丢弃权限缓存并回源 CN(分区时回源失败→拒绝)。补上了原 `refreshToken()` 超时机制的盲区(它仅在"心跳恢复"那刻才标记失效,分区进行中不触发)。 + - **TTL**:`MultiTsFileDeviceIterator.nextDevice()` fenced 时用无穷大 TTL(compaction 不按 TTL 删除),**仅压制 compaction 删除路径**(查询/写入 TTL 行为不变),防止陈旧 TTL 造成不可逆数据删除。 + - **分区缓存**:未覆盖(低风险,已有分区不变更、miss 即回源 CN,分区时自然 fail-closed);按需再评估。 +4. **恢复时 DN 自驱 resync(事件驱动)**:心跳恢复(fenced→active 的那次心跳)触发已注册的 recovery listener(已实现:`MetadataLeaseManager.addLeaseRecoveryListener`,`DataNodeTableCache` 注册 `invalidateAll`)。懒加载型缓存作废后按需回源;注册表型主动重拉(复用启动 resync 路径)。listener 同步执行:解除 fence 前缓存已作废,无窗口。 +5. 
**重启**:已全量 resync(`storeRuntimeConfigurations`),开机即一次完整追平。 + +### 2.6 时序正确性(lease ordering) + +记 `T_hb`=心跳间隔(默认 1s)、`T_fence`=DN 自我隔离阈值、`T_proceed = T_fence + margin`=CN 判 FENCED-SAFE 所需 `hbAge`。 + +**安全不变式**:CN 提交一次 Tier-A 变更(尤其**不可逆物理删除**)时,对每个未 ack 的 DN,要么它已 ack,要么它**已证实 fenced 或已移出路由**(不再用旧缓存服务)。 + +**证明(用修正后的信号)**:`lastResp(dn)` 是 CN 收到成功响应的本地时刻,由因果关系 `dn_renew ≤ lastResp`(DN 续约在前、CN 收响应在后)。DN 在 `dn_renew + T_fence` 自我 fence。故当 `hbAge = now - lastResp ≥ T_fence + margin` 时,`now ≥ lastResp + T_fence ≥ dn_renew + T_fence`,DN 必已 fence。取 `T_proceed = T_fence + margin` 即满足。 +- 这条之所以成立,关键是用**响应接收时刻**(≥ DN 续约)而非 §1.2 的**发送时刻**(< DN 续约,会让 CN 过早判 fenced);且 `lastResp` **绝不被失败样本前移**(否则 hbAge 永不增长)。 +- **延迟心跳**:被延迟的心跳即便让 DN 晚续约,CN 也只会更晚收到其响应、`lastResp` 更晚 → 不会过早判 FENCED-SAFE。✓ +- **leader 切换**:新 leader 无历史 `lastResp`,须把每个 DN 的 `lastResp` 初始化为**取得 leadership 的时刻**,从而至少等 `T_proceed` 才可能判 FENCED-SAFE(覆盖旧 leader 残留在途心跳对 DN 的续约)。✓ +- **残余风险**:见 §2.3 的 (a)(b) 假设;"单向可达+选择性丢失"在该假设下被排除。 + +`margin` 覆盖:①DN fence 检查粒度(惰性检查下≈0,定时检查则 1 周期)②GC/调度抖动 ③`lastResp` 至多 1 个心跳的认知粒度。默认 **≈5s**(`max(5000, 2×T_hb + fence检查间隔)`)。 + +**为什么 `T_fence` 取 20s**:`T_fence` 决定"健康 DN 多久没收到心跳就自我 fence(fail-closed)"。过小(如 5s)会让健康 DN 偶发 GC/抖动即被误 fence。取 **20s(与 `failureDetectorFixedThresholdInMs` 对齐)** 使"DN 自我 fence 时刻"≈"集群本来就判它 dead 时刻",不新增误判区间。 + +**代价(本版明确接受)**:取消加性快路径后,**任一 DN 不可达时,所有 Tier-A DDL 都要等 `T_proceed≈25s`**(全员存活时恒零等待)。这是为"无 epoch、规则统一、不留 laggard"付出的可用性代价。 + +### 2.7 不可逆删除的特殊编排 + +涉及物理删除数据/属性的操作(表 DROP COLUMN 的 `EXECUTE_ON_REGIONS`、DELETE TIMESERIES、DELETE DATABASE、DELETE DEVICES):删除步骤必须排在"判定器 PROCEED(所有未 ack DN 均 ack / SAFE_GONE / FENCED_SAFE)"**之后**。隔离 DN 在恢复 resync 前处于 fence、不接受相关写入,故删除后不会写出"幽灵列/幽灵设备"。 + +> 数据面可用性(region 多数派)与本提案正交:物理删除经 region consensus,本就需 quorum;本提案只解决"缓存失效广播"这层的可用性。 + +--- + +## 3. 
各类操作的落地处理 + +| 类别 | 现状 | 方案后(v5) | +|---|---|---| +| 全部 Tier-A 表/树 schema DDL(CREATE/ADD/DROP/RENAME/SET)、view、datatype/encoding、模板、DELETE TS/DB/DEVICES、TTL | HARD-FAIL | 统一判定器:未 ack 的 DN 须 ack / SAFE_GONE / FENCED_SAFE 才 PROCEED,否则 WAIT/FAIL;不可逆删除编排见 §2.7;隔离 DN 恢复自驱 resync。**无加性快路径**——CREATE/ADD COLUMN 同样等待(避免未 fenced laggard) | +| 权限(grant/revoke/...) | **TIMEOUT-ABANDON(静默漏洞)** | 切判定器:不再 180s 静默丢弃;未 ack 活跃 DN → WAIT/FAIL,不可达 DN fence+恢复 resync;fence 期间鉴权 fail-closed(撤权立即生效)。**修复安全漏洞** | +| **B-加资源**:CREATE FUNCTION/TRIGGER/PIPE PLUGIN、CREATE TOPIC/CONSUMER、配额上调 | HARD-FAIL / RETRY-FAIL | Tier-B-soft:判定器可 PROCEED(缺资源者自然失败);恢复 resync 重拉 | +| **B-撤资源/降级/控制**:DROP FUNCTION/TRIGGER/PIPE PLUGIN、SET SYSTEM STATUS=ReadOnly、disable/控制 | HARD-FAIL | **按 Tier-A 强一致**:未 ack 须 ack 或 已证实 fenced(陈旧 DN 否则会继续跑已删资源 / 继续写入)。SET ReadOnly 漏达的 DN 必须被等到 ack 或 fence,不可静默放行 | +| Pipe task | SOFT(已 reconcile) | 维持现有周期 reconcile(或并入 fence-恢复 resync) | +| LOAD/SET CONFIGURATION | HARD-FAIL | 视具体项:影响正确性/安全的按强一致;纯性能项可 soft | + +--- + +## 4. 
接口与配置改动清单 + +**Thrift(`iotdb-protocol/thrift-datanode/.../datanode.thrift`)** +- **不需要 epoch 字段。** 仅 `TDataNodeHeartbeatResp` 增 `optional bool supportsMetadataLeaseFencing`(或按 DN 版本号推断),供 CN 滚动升级期判断能否对该 DN 判 FENCED-SAFE(§5)。 +- "最近成功收到响应时刻"是 **CN 本地量**,无需协议字段。 + +**ConfigNode** +- `HeartbeatService` / 心跳成功回调中新增并维护每 DN 的 `lastSuccessfulHeartbeatResponseNanos`(CN 本地时钟,**仅成功响应更新**;与 load-cache 的 `NodeHeartbeatSample` 分离,不被 `onError`/Unknown 前移);leader 切换时初始化为取得 leadership 的时刻;记录 `supportsFencing`。 +- 新增 `ClusterCachePropagator`(§2.4 判定器),接入:`SchemaUtils.preRelease/commitRelease/rollback`、`DeleteTimeSeriesProcedure`/`AlterTimeSeriesDataTypeProcedure` 的 invalidateCache、模板/view/TTL procedure、`ConfigNodeProcedureEnv.invalidateCache`(DeleteDatabase)、`AuthOperationProcedure`、`UDFManager`、trigger/pipe-plugin env、`ClusterQuotaManager`、`NodeManager`(status/config 中的强一致项)。 + +**DataNode** +- `getDataNodeHeartBeat`:记录 `lastCnHeartbeatNanos`(已实现);回填 `supportsMetadataLeaseFencing=true`。 +- `MetadataLeaseManager`(已实现 `isFenced`/recovery listener);Tier-A 注入点 fail-closed 均已实现(表 schema `DataNodeTableCache`、树 schema `TreeDeviceSchemaCacheManager`、权限 `ClusterAuthorityFetcher`、TTL 的 compaction 删除路径;`TableHeaderSchemaValidator` 经表缓存自动覆盖,分区缓存暂缓,详见 §2.5 第 3 点)。 +- 注册表型缓存的"恢复重拉"实现。 + +**配置(`CommonConfig`)** +- `metadata_lease_fence_ms`(`T_fence`,**唯一主旋钮**;默认 **20000**,与 `failureDetectorFixedThresholdInMs` 对齐)。已实现。 +- `T_proceed = T_fence + margin`,`margin` 内部派生(默认 ≈5s)。 +- 判定器 WAIT 的最大等待/重试上限。 +- **不设** `tier_b_fail_closed` / `enable_metadata_lease_fencing`。 + +--- + +## 5. 兼容性(升级后自动生效,无开关/无人工灰度) + +- **升级后直接生效,无需开关**。 +- **滚动升级期安全靠自动能力检测**:CN 判 FENCED-SAFE 的前提是"该 DN 会自我 fence"。**旧 DN 不会 fence、也不上报能力位**: + - DN 心跳回填 `supportsMetadataLeaseFencing=true`(或按版本号推断);CN 记录每 DN 最近上报的能力位。 + - 判定器**能力位先判**:不支持的 DN 一律 UNSAFE → 对它回退**现状严格语义**(任一不可达即失败)。已升级 DN 走新路径、未升级 DN 走旧严格路径,逐 DN 自动切换,正确性不破(此版已与 §2.4 顺序统一,消除 v4 矛盾)。 + - 全部升级完毕后所有 DN 走新路径。 +- **回滚**:仅一个自动能力位 + 新增逻辑,回退旧版本即恢复原行为。 + +--- + +## 6. 
风险与权衡 + +- **取消加性快路径的延迟代价**:任一 DN 不可达时,**所有 Tier-A DDL(含 CREATE/ADD COLUMN)都等 `T_proceed≈25s`**;全员存活恒零等待。这是为"无 epoch、规则统一、不留 laggard"接受的代价(你已确认)。若日后需要给 CREATE 这类提速,可再评估 epoch 正向确认方案。 +- **时序假设(§2.3 (a)(b))**:依赖有界延迟 + 心跳连接双向对称;"单向可达 + 选择性丢失失效广播"在该假设外不保证安全。需在文档/运维约束中写明;若环境不满足,则需 epoch/token。 +- **CN 侧信号必须新增且独立**:绝不能复用记录发送时刻、且被 `onError` 前移的现有样本(否则证明不成立)。leader 切换须重置 `lastResp`。 +- **半坏 DN(心跳通、广播 RPC 断)**:保守失败(WAIT 超时 → FAIL)。 +- **少数派分区读写不可用**:被隔离 DN 对 Tier-A fail-closed,牺牲少数侧可用性换正确性(CP)。 +- **权限 fail-closed 可用性**:撤权立即生效 vs CN 短暂不可达不至全拒,需分别配置宽限。 +- **配额无回源**:DN 侧需新增主动拉取,否则恢复 resync 无处落地。 + +--- + +## 7. 分阶段实施计划 + +**Phase 0 — 观测、能力位、CN 侧 lastResp 信号(无行为变更)** +- DN 记录 `lastCnHeartbeatNanos`(已实现)、回填能力位;CN 新增并维护**独立的 `lastSuccessfulHeartbeatResponseNanos`**(仅成功更新、leader 切换重置)与能力位;加监控指标。**不 fence、不放行。** + +**Phase 1 — DN 自我 fencing + Tier-A fail-closed(正确性基石)✅ 已完成** +- `MetadataLeaseManager` + 惰性 fence 检查 + recovery 自驱 resync;表 schema、树 schema、权限、TTL(compaction) 的 fail-closed 注入**均已实现并各带 TDD**(见 §2.5.3)。`TableHeaderSchemaValidator` 经表缓存自动覆盖;分区缓存暂缓。此阶段不改 CN 放行逻辑——即使 CN 仍严格,DN 端已能在分区时自保。 + +**Phase 2 — CN 统一判定器 + Tier-A 放行(兑现可用性)🚧 进行中** +- ✅ `ClusterCachePropagator`(`propagateOnce` 判定 + `propagate` 重试循环;能力位先判、FENCED-SAFE 收紧;8 个单测)。 +- ✅ 生命周期挂钩:`notifyLeaderReady` → `onLeadershipAcquired`(重夺 leadership 重置 `lastResp`)、`removeDataNodePersistence` → `removeDataNode`。 +- ✅ **首个 procedure 接入(模板)**:`CreateTableProcedure` 的 PRE_RELEASE 改走 `ClusterCachePropagator.propagate`(`SchemaUtils.preUpdateTableReq` + `broadcastTableUpdate` 返回全量响应;旧 `preReleaseTable` 退化为只返回失败的薄封装,其余调用方不受影响)。COMMIT_RELEASE 维持 best-effort warn。 +- ✅ **端到端 IT**(`IoTDBTableDDLHAIT`,1C3D):停 1 DN 后 CREATE TABLE 仍成功;新增 IT 框架 `setMetadataLeaseFenceMs`。 +- ⏳ 待办:按模板接入其余 ~19 个 Tier-A procedure(AddTableColumn/DropTableColumn/RenameTable/SetTableProperties、DeleteTimeSeries、AlterTimeSeriesDataType、模板、view、TTL、DeleteDatabase 同步路径);不可逆删除编排。**统一规则、无加性快路径**。 + +**Phase 3 — 权限与 Tier-B 收编** +- `AuthOperationProcedure` 改造(修复 180s 
静默漏洞);Tier-B 按 加资源(soft) / 撤资源·降级·控制(强一致) 分别接入;配额回源补齐。 + +**测试(含 review 要求的专项)** +- 1C3D 停 1 DN:所有 Tier-A 操作在 `T_proceed` 后成功(Phase 2/3 后)。 +- **延迟心跳**:心跳在 `T_proceed` 后才送达 DN(DN 晚续约)→ CN 不得在 DN fence 前判 FENCED-SAFE(验证用的是响应接收时刻)。 +- **leader 切换**:新 leader 不得凭旧 `lastResp` 过早判 FENCED-SAFE;旧 leader 残留心跳续约的 DN 不被误放行。 +- **heartbeat onError 连续刷新**:连接断开持续产生 Unknown 样本时,新增的 `lastResp` **不被前移**,`hbAge` 正常增长。 +- **活着但分区**:`T_fence` 后该 DN Tier-A fail-closed、不产生脏数据/脏读/越权;恢复后自驱 resync。 +- **未 fenced laggard 回归**:DN 漏掉 CREATE/ADD COLUMN 广播但心跳 Running → 判定器 WAIT/FAIL(不跳过),不得出现"新 schema 在该 DN 长期不可见"。 +- **权限**:撤权后分区 DN 拒绝越权。 +- **B-撤资源**:DROP UDF/Trigger/Plugin 后陈旧 DN 不得继续执行旧资源;SET ReadOnly 漏达的 DN 不得继续接受写入。 +- **不可逆删除并发写**:隔离 DN 不写幽灵列/设备。 +- **滚动升级**:半升级态(含旧 DN)回退严格语义,正确性不破。 +- 仅运行新增/改动相关 IT。 + +--- + +## 8. 附:与上一篇及已落地代码的关系 + +`table-model-ddl-ha-analysis.md` 是表模型特例与起点;本文推广为覆盖全部 CN→DN 元数据广播的统一框架。 + +已落地(worktree 分支):DN 侧 `MetadataLeaseManager`(含 recovery listener)、心跳记录 `lastCnHeartbeatNanos`、`metadata_lease_fence_ms` 配置、heartbeat-age 指标,以及 `DataNodeTableCache` 的 fail-closed + 恢复作废(均 TDD)。这些属于 Phase 0/1 的 DN 侧,**不受本次 review 影响**(评审点 1 针对的是尚未实现的 CN 侧判定信号)。CN 侧判定器(Phase 2)按本 v5 实现:统一规则、能力位先判、独立的响应接收时刻信号、FENCED-SAFE 收紧。 diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterLogicalViewProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterLogicalViewProcedure.java index 8c8d2019f4de8..b5783e908a944 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterLogicalViewProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterLogicalViewProcedure.java @@ -31,8 +31,6 @@ import org.apache.iotdb.commons.path.PathPatternTree; import org.apache.iotdb.commons.schema.view.viewExpression.ViewExpression; import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; -import 
org.apache.iotdb.confignode.client.async.CnToDnInternalServiceAsyncRequestManager; -import org.apache.iotdb.confignode.client.async.handlers.DataNodeAsyncRequestContext; import org.apache.iotdb.confignode.i18n.ProcedureMessages; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.confignode.procedure.exception.ProcedureException; @@ -42,7 +40,6 @@ import org.apache.iotdb.db.exception.BatchProcessException; import org.apache.iotdb.db.exception.metadata.view.ViewNotExistException; import org.apache.iotdb.mpp.rpc.thrift.TAlterViewReq; -import org.apache.iotdb.mpp.rpc.thrift.TInvalidateMatchedSchemaCacheReq; import org.apache.iotdb.rpc.TSStatusCode; import org.apache.tsfile.utils.ReadWriteIOUtils; @@ -123,27 +120,15 @@ protected Flow executeFromState( } private void invalidateCache(final ConfigNodeProcedureEnv env) { - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.INVALIDATE_MATCHED_SCHEMA_CACHE, - new TInvalidateMatchedSchemaCacheReq(patternTreeBytes), - dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - final Map statusMap = clientHandler.getResponseMap(); - for (final TSStatus status : statusMap.values()) { + if (!SchemaUtils.invalidateMatchedSchemaCache( + env.getConfigManager(), patternTreeBytes, false)) { // all dataNodes must clear the related schemaengine cache - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error( - ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_VIEW, - viewPathToSourceMap.keySet()); - setFailure( - new ProcedureException( - new MetadataException( - ProcedureMessages.INVALIDATE_VIEW_SCHEMAENGINE_CACHE_FAILED))); - return; - } + LOGGER.error( + ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_VIEW, 
+ viewPathToSourceMap.keySet()); + setFailure( + new ProcedureException( + new MetadataException(ProcedureMessages.INVALIDATE_VIEW_SCHEMAENGINE_CACHE_FAILED))); } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterTimeSeriesDataTypeProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterTimeSeriesDataTypeProcedure.java index 26ea988f98e72..d223263e4935b 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterTimeSeriesDataTypeProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterTimeSeriesDataTypeProcedure.java @@ -28,8 +28,6 @@ import org.apache.iotdb.commons.path.PathDeserializeUtil; import org.apache.iotdb.commons.path.PathPatternTree; import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; -import org.apache.iotdb.confignode.client.async.CnToDnInternalServiceAsyncRequestManager; -import org.apache.iotdb.confignode.client.async.handlers.DataNodeAsyncRequestContext; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeAlterTimeSeriesPlan; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeEnrichedPlan; import org.apache.iotdb.confignode.i18n.ProcedureMessages; @@ -42,7 +40,6 @@ import org.apache.iotdb.consensus.exception.ConsensusException; import org.apache.iotdb.db.exception.metadata.PathNotExistException; import org.apache.iotdb.mpp.rpc.thrift.TAlterTimeSeriesReq; -import org.apache.iotdb.mpp.rpc.thrift.TInvalidateMatchedSchemaCacheReq; import org.apache.iotdb.pipe.api.exception.PipeException; import org.apache.iotdb.rpc.RpcUtils; import org.apache.iotdb.rpc.TSStatusCode; @@ -249,26 +246,17 @@ public static void invalidateCache( final String requestMessage, final Consumer setFailure, final boolean needLock) { - final Map dataNodeLocationMap = - 
env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.INVALIDATE_MATCHED_SCHEMA_CACHE, - new TInvalidateMatchedSchemaCacheReq(measurementPathBytes).setNeedLock(needLock), - dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - final Map statusMap = clientHandler.getResponseMap(); - for (final TSStatus status : statusMap.values()) { + // Proceed past provably-fenced DataNodes instead of hard-failing on the first unreachable one + // (see SchemaUtils.invalidateMatchedSchemaCache). Runs before the physical datatype change, so + // the "alter only after PROCEED" ordering holds. + if (!SchemaUtils.invalidateMatchedSchemaCache( + env.getConfigManager(), measurementPathBytes, needLock)) { // All dataNodes must clear the related schemaEngine cache - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error( - ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_TIMESERIES, - requestMessage); - setFailure.accept( - new ProcedureException( - new MetadataException(ProcedureMessages.INVALIDATE_SCHEMAENGINE_CACHE_FAILED))); - return; - } + LOGGER.error( + ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_TIMESERIES, requestMessage); + setFailure.accept( + new ProcedureException( + new MetadataException(ProcedureMessages.INVALIDATE_SCHEMAENGINE_CACHE_FAILED))); } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeactivateTemplateProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeactivateTemplateProcedure.java index ab4913da04d81..b94f6d76642a2 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeactivateTemplateProcedure.java +++ 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeactivateTemplateProcedure.java @@ -30,8 +30,6 @@ import org.apache.iotdb.commons.path.PathPatternTree; import org.apache.iotdb.commons.schema.template.Template; import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; -import org.apache.iotdb.confignode.client.async.CnToDnInternalServiceAsyncRequestManager; -import org.apache.iotdb.confignode.client.async.handlers.DataNodeAsyncRequestContext; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeDeactivateTemplatePlan; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeEnrichedPlan; import org.apache.iotdb.confignode.i18n.ProcedureMessages; @@ -44,7 +42,6 @@ import org.apache.iotdb.mpp.rpc.thrift.TConstructSchemaBlackListWithTemplateReq; import org.apache.iotdb.mpp.rpc.thrift.TDeactivateTemplateReq; import org.apache.iotdb.mpp.rpc.thrift.TDeleteDataForDeleteSchemaReq; -import org.apache.iotdb.mpp.rpc.thrift.TInvalidateMatchedSchemaCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TRollbackSchemaBlackListWithTemplateReq; import org.apache.iotdb.pipe.api.exception.PipeException; import org.apache.iotdb.rpc.TSStatusCode; @@ -181,29 +178,17 @@ protected List processResponseOfOneDataNode( private void invalidateCache(final ConfigNodeProcedureEnv env) { // if no target timeseries, return directly - if (!timeSeriesPatternTree.isEmpty()) { - Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.INVALIDATE_MATCHED_SCHEMA_CACHE, - new TInvalidateMatchedSchemaCacheReq(timeSeriesPatternTreeBytes), - dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance() - .sendAsyncRequestWithRetry(clientHandler); - Map statusMap = clientHandler.getResponseMap(); - for (TSStatus status : statusMap.values()) { - // all 
dataNodes must clear the related schema cache - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error( - ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMA_CACHE_OF_TEMPLATE_TIMESERIES, - requestMessage); - setFailure( - new ProcedureException( - new MetadataException(ProcedureMessages.INVALIDATE_SCHEMA_CACHE_FAILED))); - return; - } - } + if (!timeSeriesPatternTree.isEmpty() + && !SchemaUtils.invalidateMatchedSchemaCache( + env.getConfigManager(), timeSeriesPatternTreeBytes, false)) { + // all dataNodes must clear the related schema cache + LOGGER.error( + ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMA_CACHE_OF_TEMPLATE_TIMESERIES, + requestMessage); + setFailure( + new ProcedureException( + new MetadataException(ProcedureMessages.INVALIDATE_SCHEMA_CACHE_FAILED))); + return; } setNextState(DeactivateTemplateState.DELETE_DATA); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteLogicalViewProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteLogicalViewProcedure.java index 4f63e96840c20..b1996e8ee92e7 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteLogicalViewProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteLogicalViewProcedure.java @@ -27,8 +27,6 @@ import org.apache.iotdb.commons.path.PartialPath; import org.apache.iotdb.commons.path.PathPatternTree; import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; -import org.apache.iotdb.confignode.client.async.CnToDnInternalServiceAsyncRequestManager; -import org.apache.iotdb.confignode.client.async.handlers.DataNodeAsyncRequestContext; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeDeleteLogicalViewPlan; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeEnrichedPlan; import 
org.apache.iotdb.confignode.i18n.ProcedureMessages; @@ -41,7 +39,6 @@ import org.apache.iotdb.db.exception.metadata.view.ViewNotExistException; import org.apache.iotdb.mpp.rpc.thrift.TConstructViewSchemaBlackListReq; import org.apache.iotdb.mpp.rpc.thrift.TDeleteViewSchemaReq; -import org.apache.iotdb.mpp.rpc.thrift.TInvalidateMatchedSchemaCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TRollbackViewSchemaBlackListReq; import org.apache.iotdb.pipe.api.exception.PipeException; import org.apache.iotdb.rpc.TSStatusCode; @@ -167,26 +164,15 @@ protected List processResponseOfOneDataNode( } private void invalidateCache(final ConfigNodeProcedureEnv env) { - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.INVALIDATE_MATCHED_SCHEMA_CACHE, - new TInvalidateMatchedSchemaCacheReq(patternTreeBytes), - dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - final Map statusMap = clientHandler.getResponseMap(); - for (final TSStatus status : statusMap.values()) { + if (!SchemaUtils.invalidateMatchedSchemaCache( + env.getConfigManager(), patternTreeBytes, false)) { // all dataNodes must clear the related schemaengine cache - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error( - ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_VIEW, requestMessage); - setFailure( - new ProcedureException( - new MetadataException( - ProcedureMessages.INVALIDATE_VIEW_SCHEMAENGINE_CACHE_FAILED))); - return; - } + LOGGER.error( + ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_VIEW, requestMessage); + setFailure( + new ProcedureException( + new MetadataException(ProcedureMessages.INVALIDATE_VIEW_SCHEMAENGINE_CACHE_FAILED))); + return; } setNextState(DeleteLogicalViewState.DELETE_VIEW_SCHEMA); diff --git 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteTimeSeriesProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteTimeSeriesProcedure.java index 8bd396c72daf2..2d3d99e5fd94b 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteTimeSeriesProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteTimeSeriesProcedure.java @@ -27,12 +27,9 @@ import org.apache.iotdb.commons.path.PartialPath; import org.apache.iotdb.commons.path.PathPatternTree; import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; -import org.apache.iotdb.confignode.client.async.CnToDnInternalServiceAsyncRequestManager; -import org.apache.iotdb.confignode.client.async.handlers.DataNodeAsyncRequestContext; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeDeleteTimeSeriesPlan; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeEnrichedPlan; import org.apache.iotdb.confignode.i18n.ProcedureMessages; -import org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.confignode.procedure.exception.ProcedureException; import org.apache.iotdb.confignode.procedure.impl.StateMachineProcedure; @@ -43,7 +40,6 @@ import org.apache.iotdb.mpp.rpc.thrift.TConstructSchemaBlackListReq; import org.apache.iotdb.mpp.rpc.thrift.TDeleteDataForDeleteSchemaReq; import org.apache.iotdb.mpp.rpc.thrift.TDeleteTimeSeriesReq; -import org.apache.iotdb.mpp.rpc.thrift.TInvalidateMatchedSchemaCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TRollbackSchemaBlackListReq; import org.apache.iotdb.pipe.api.exception.PipeException; import org.apache.iotdb.rpc.TSStatusCode; @@ -201,25 +197,9 @@ public static void invalidateCache( // Proceed once every unreachable 
DataNode is provably self-fenced (it fails closed on its // schema cache and resyncs on recovery, so it cannot serve the to-be-deleted/altered series), // instead of hard-failing on the first unreachable DataNode. This runs before the physical - // delete in the state machine, so the "delete only after PROCEED" ordering holds. The - // ConfigCachePropagator may re-broadcast while waiting, so build a fresh request (with a - // duplicated buffer) on every attempt rather than reusing a possibly-consumed one. - final boolean proceeded = - new ClusterCachePropagator(env.getConfigManager()) - .propagate( - targets -> { - final DataNodeAsyncRequestContext - clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.INVALIDATE_MATCHED_SCHEMA_CACHE, - new TInvalidateMatchedSchemaCacheReq(patternTreeBytes.duplicate()) - .setNeedLock(needLock), - targets); - CnToDnInternalServiceAsyncRequestManager.getInstance() - .sendAsyncRequestWithRetry(clientHandler); - return clientHandler.getResponseMap(); - }); - if (!proceeded) { + // delete in the state machine, so the "delete only after PROCEED" ordering holds. 
+ if (!SchemaUtils.invalidateMatchedSchemaCache( + env.getConfigManager(), patternTreeBytes, needLock)) { // All dataNodes must clear the related schemaEngine cache LOGGER.error( ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_TIMESERIES, requestMessage); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java index 6011981c6ba11..d0a4075070ae7 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java @@ -36,6 +36,7 @@ import org.apache.iotdb.confignode.consensus.request.ConfigPhysicalPlan; import org.apache.iotdb.confignode.i18n.ProcedureMessages; import org.apache.iotdb.confignode.manager.ConfigManager; +import org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.consensus.exception.ConsensusException; import org.apache.iotdb.db.exception.metadata.PathNotExistException; @@ -43,6 +44,7 @@ import org.apache.iotdb.mpp.rpc.thrift.TCheckSchemaRegionUsingTemplateResp; import org.apache.iotdb.mpp.rpc.thrift.TCountPathsUsingTemplateReq; import org.apache.iotdb.mpp.rpc.thrift.TCountPathsUsingTemplateResp; +import org.apache.iotdb.mpp.rpc.thrift.TInvalidateMatchedSchemaCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TUpdateTableReq; import org.apache.iotdb.rpc.RpcUtils; import org.apache.iotdb.rpc.TSStatusCode; @@ -337,6 +339,37 @@ public static Map rollbackPreRelease( configManager.getNodeManager().getRegisteredDataNodeLocations())); } + /** + * Broadcast an INVALIDATE_MATCHED_SCHEMA_CACHE to all DataNodes through {@link + * ClusterCachePropagator}: proceed once every unreachable DataNode is provably self-fenced (it + * 
fails closed on its schema cache and resyncs on recovery, so it cannot serve the + * deleted/altered series), instead of hard-failing on the first unreachable DataNode. Returns + * whether it is safe to proceed; the caller maps {@code false} to its own failure. + * + * <p>
The propagator may re-broadcast while waiting for unacked DataNodes, so a fresh request with + * a duplicated buffer is built on each attempt — a consumed buffer can never be re-sent as an + * empty (and silently-successful) invalidation. + */ + public static boolean invalidateMatchedSchemaCache( + final ConfigManager configManager, + final ByteBuffer patternTreeBytes, + final boolean needLock) { + return new ClusterCachePropagator(configManager) + .propagate( + targets -> { + final DataNodeAsyncRequestContext + clientHandler = + new DataNodeAsyncRequestContext<>( + CnToDnAsyncRequestType.INVALIDATE_MATCHED_SCHEMA_CACHE, + new TInvalidateMatchedSchemaCacheReq(patternTreeBytes.duplicate()) + .setNeedLock(needLock), + targets); + CnToDnInternalServiceAsyncRequestManager.getInstance() + .sendAsyncRequestWithRetry(clientHandler); + return clientHandler.getResponseMap(); + }); + } + public static TSStatus executeInConsensusLayer( final ConfigPhysicalPlan plan, final ConfigNodeProcedureEnv env, final Logger logger) { TSStatus status; diff --git a/table-model-ddl-ha-analysis.md b/table-model-ddl-ha-analysis.md new file mode 100644 index 0000000000000..20283a2ced8a7 --- /dev/null +++ b/table-model-ddl-ha-analysis.md @@ -0,0 +1,342 @@ +# 表模型 DDL 操作在任一 DataNode 宕机时失败的根因分析与解决方案 + +> 状态:待 review +> 作者:(草拟) +> 适用版本:2.0.x(master) + +## 0. 
TL;DR + +- **现象**:表模型的 `CREATE TABLE` / `ALTER TABLE`(加列、删列、改属性、改名)/ `DROP TABLE` 等 DDL,在集群中**任意一个 DataNode(DN)不可达**时都会执行失败并回滚。这与"多副本高可用"的预期相悖——按直觉,挂 1 个 DN 不应阻塞元数据变更。 +- **根因(正确性)**:这些 DDL 在 ConfigNode(CN)侧以 Procedure 执行,其中有一步会把"缓存失效 / 预发布"RPC **广播给集群里所有已注册的 DN**,并要求**每一个 DN 都返回 SUCCESS** 才继续;只要有一个 DN 不可达(重试 6 次后仍失败),整个 Procedure 就 `setFailure` 并回滚。 +- **为什么要这么强(不是 bug,是设计)**:DN 上对表模型有几类**本地缓存**(表 schema 缓存 `DataNodeTableCache`、设备属性/last 值缓存 `TableDeviceSchemaCache`)。写入校验和部分查询**直接读本地缓存、不回 CN 核对**。如果某个 DN 持有过期缓存且没被清理,在**网络分区**下它仍可能用旧 schema 接受写入 / 返回旧值,从而产生**与已提交 schema 不一致的脏数据**(类型错乱、幽灵列、删列后又写入等)。因此当前实现选择了"宁可失败也不放过任何一个 DN"。 +- **关键洞察**:真正危险的只有"**DN 活着但与 CN 分区**"这一种情况。**真正宕机的 DN 内存缓存已经没了**,重启时会从 CN 重新拉取全量 schema(`DataNode.java:523` 的 `DataNodeTableCache.init(...)`),不可能用旧缓存服务请求。当前实现把"宕机"和"分区"混为一谈,对"宕机"这种本来安全的情况也一并失败,才造成了可用性损失。 +- **解决方向**:给 DN 的表缓存加一个**与 CN 心跳绑定的"租约/fencing"机制**——DN 一旦在 `T_fence` 内收不到 CN 心跳,就**自行作废表缓存并对依赖缓存的表操作 fail-closed**。这样"不可达"就等价于"安全",CN 侧的 DDL 便可以**跳过已确认隔离/宕机的 DN 继续执行**,从而在挂掉少数 DN 时仍保持 DDL 可用,且不牺牲正确性。 + +--- + +## 1. 问题现象 + +在一个多 DN 集群(例如 1 CN + 3 DN,数据多副本)中,停掉任意 1 个 DN 后执行下列表模型语句,会直接报错失败(而非降级成功): + +- `CREATE TABLE` / `CREATE VIEW` +- `ALTER TABLE ... ADD COLUMN` +- `ALTER TABLE ... DROP COLUMN` +- `ALTER TABLE ... SET PROPERTIES`(如 TTL) +- `ALTER TABLE ... RENAME COLUMN` / `RENAME TABLE` +- `DROP TABLE` +- 对应的 view 变体、`CREATE/DROP DATABASE` 等 + +报错信息形如 `Pre create table failed` / `pre release add table column failed` / `... must clear the related schema cache` 等。 + +> 注:树模型的 `DELETE TIMESERIES`、`DELETE DATABASE` 也有**相同**的"所有 DN 必须可达"约束(见 §2.6),本文聚焦表模型,但方案对树模型同样适用。 + +--- + +## 2. 
根因分析 + +### 2.1 总体执行链路 + +表模型 DDL 的执行入口在 CN 的 Procedure 框架,相关类位于: + +``` +iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/ +├── CreateTableProcedure.java +├── DropTableProcedure.java +├── AddTableColumnProcedure.java +├── DropTableColumnProcedure.java +├── RenameTableColumnProcedure.java +├── RenameTableProcedure.java +├── SetTablePropertiesProcedure.java +├── AlterTableColumnDataTypeProcedure.java +├── AbstractAlterOrDropTableProcedure.java ← 所有 alter/drop 的基类 +└── view/... ← view 变体 +``` + +这些 Procedure 的共同点:**在真正提交元数据变更之前 / 删除数据之前,必须先让所有 DN 把相关本地缓存清掉或进入"待更新"态**。这一步通过向所有 DN 广播 RPC 完成。 + +### 2.2 关键代码:广播给"所有已注册 DN",任一失败即整体失败 + +以"加列"为例,`AddTableColumnProcedure` 的状态机是: + +``` +COLUMN_CHECK → PRE_RELEASE → ADD_COLUMN → COMMIT_RELEASE +``` + +`PRE_RELEASE` 步调用基类 `AbstractAlterOrDropTableProcedure.preRelease(env)`,进而调用 `SchemaUtils.preReleaseTable(...)`。后者是整个问题的核心: + +```java +// SchemaUtils.java (≈ L243-262) preReleaseTable +final Map dataNodeLocationMap = + configManager.getNodeManager().getRegisteredDataNodeLocations(); // ← 所有已注册 DN +final DataNodeAsyncRequestContext clientHandler = + new DataNodeAsyncRequestContext<>(CnToDnAsyncRequestType.UPDATE_TABLE, req, dataNodeLocationMap); +CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); +return clientHandler.getResponseMap().entrySet().stream() + .filter(e -> e.getValue().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); // ← 返回"失败的 DN" +``` + +- `getRegisteredDataNodeLocations()`(`NodeManager.java` ≈ L688-697)返回**所有曾经注册过的 DN,不区分当前是否存活**。 +- `sendAsyncRequestWithRetry` 内部最多重试 `MAX_RETRY_NUM = 6` 次(`AsyncRequestManager.java`)。一个 DN 只有返回 SUCCESS 才会从重试集合移除;不可达的 DN(连接拒绝 / 超时)由 `DataNodeTSStatusRPCHandler.onError` 写入一个**错误 TSStatus**,并**保留在重试集合**里,6 轮耗尽后仍是失败项。 + +Procedure 拿到非空的 `failedResults` 后: + +```java +// AbstractAlterOrDropTableProcedure.java (≈ 
L96-101) +if (!failedResults.isEmpty()) { + // All dataNodes must clear the related schema cache ← 设计者的注释 + setFailure(new ProcedureException(new MetadataException(...))); + return; +} +``` + +`setFailure` 把 Procedure 置为 `FAILED`,ProcedureExecutor 随后触发**回滚**。结果就是:**只要有 1 个 DN 不可达,DDL 失败**。 + +> 同样的 "All dataNodes must clear the related schema cache / schemaEngine cache" 注释与 `setFailure` 逻辑,硬编码在至少 5 处:`AbstractAlterOrDropTableProcedure`(正向 L96-101 与回滚 L144-148)、`CreateTableProcedure`(L153-169)、`DropTableProcedure`(L141-167)、`DropTableColumnProcedure`(L152-188)。这是一个**全集群一致的硬约束**,不是个别遗漏。 + +### 2.3 DataNode 上与表模型相关的几类缓存 + +DDL 之所以要广播失效,是因为 DN 上确实缓存了 schema,且**关键路径直接信任本地缓存**。 + +| 缓存 | 类 / 字段 | Key | 内容 | 谁来读(危险路径) | +|---|---|---|---|---| +| 表 schema(已提交) | `DataNodeTableCache.databaseTableMap`(L64) | db, table | `TsTable`:列定义、列类别(TAG/ATTRIBUTE/FIELD/TIME)、数据类型、表属性(TTL 等) | **写入校验**:`getTableInWrite`(L316)直接读,不回 CN | +| 表 schema(变更中) | `DataNodeTableCache.preUpdateTableMap`(L67) | db, table | `(TsTable, version)`:DDL 进行中的"待更新"占位 | 查询取 schema 的 `getTable`(L329)遇到它会**回 CN 重新拉取** | +| 设备属性 | `TableDeviceSchemaCache`(dualKeyCache,table 模型 `deviceSchema`) | db, table, deviceID | `Map`:每个设备的 ATTRIBUTE 列名→值 | 查询属性:`getDeviceAttribute`(L141)直接读,命中即返回,不回 schema region | +| last 值 | `TableDeviceSchemaCache`(table 模型 `lastCache`) | db, table, deviceID, measurement | `TimeValuePair`:每列最后一个点 | `LAST` 查询直接读 | + +补充:设备 ATTRIBUTE 的**权威存储**是 schema region 内的 `DeviceAttributeStore`(按 snapshot 持久化),但**查询读属性走的是 `TableDeviceSchemaCache` 缓存**——命中缓存就不回 schema region。这正是属性脏读的来源。 + +DN 侧接收 CN 广播的 RPC handler 都在 `DataNodeInternalRPCServiceImpl`: + +- `updateTable`(L1813):按子类型分发 `PRE_UPDATE_TABLE` / `ROLLBACK_UPDATE_TABLE` / `COMMIT_UPDATE_TABLE`,驱动 `DataNodeTableCache` 的两阶段协议。 +- `invalidateTableCache`(L1842):整表失效(drop table)。 +- `invalidateColumnCache`(L2033):单列失效(drop column)。 +- `deleteColumnData`(L2051):物理删除列数据(drop column 第三步)。 + +所有失效 handler 都会先拿 
`SchemaLockType.VALIDATE_VS_DELETION_TABLE` 的**写锁**。 + +### 2.4 两阶段协议与 `VALIDATE_VS_DELETION_TABLE` 锁 + +DDL 用一个 **pre-release / commit-release / rollback** 的两阶段(对 drop 列是"先失效缓存、再删数据、最后提交"的三阶段)协议来保证跨 DN 的原子性: + +- **PRE_RELEASE**:广播 `PRE_UPDATE_TABLE`,每个 DN 把新 `TsTable` 放入 `preUpdateTableMap`,使得**取 schema 的读路径**在变更窗口内回 CN 拉最新版本。**任一 DN 失败 → 整体失败**。 +- **(中间)** 在 CN consensus 提交真正的元数据变更。 +- **COMMIT_RELEASE**:广播 `COMMIT_UPDATE_TABLE`,DN 把表从 `preUpdateTableMap` 落到 `databaseTableMap`。**失败只告警、不失败 Procedure**(见 §2.5 的关键不对称)。 + +`VALIDATE_VS_DELETION_TABLE` 锁的协议在 `SchemaLockType.java`(L52-62)写得很清楚: + +``` +1. 写入 / load TsFile 校验 schema 前,加读锁; +2. 完成后释放读锁; +3. 表相关删除时,作废 device cache 前,加写锁; +4. 完成失效后释放写锁。 +``` + +这把锁只保证**单个 DN 内**"写入校验"与"缓存失效"互斥。"所有 DN 都必须 ack"这一条,则是把这种互斥**提升到集群级**:CN 不提交元数据变更,直到确信每个 DN 都拆掉了旧缓存。 + +### 2.5 关键不对称:PRE 必须全可达,COMMIT 却允许失败 + +这是理解问题、也是设计方案的关键线索: + +| 步骤 | 一个 DN 不可达时 | 原因 | +|---|---|---| +| PRE_RELEASE / INVALIDATE_CACHE | **整体失败 + 回滚** | 元数据还没变。必须保证"没有任何活着的 DN 还揣着旧缓存跨过这次变更" | +| COMMIT_RELEASE | **仅告警,Procedure 成功** | 元数据已提交。漏掉 commit 的 DN 只是 `preUpdateTableMap` 里留了个占位,其读路径会持续回 CN 拉最新版 → 最终一致、且安全 | + +**结论**:系统其实**已经容忍** DN 在 commit 阶段缺席(最终一致)。真正的硬约束只在**变更前的失效/预发布**这一步。而这一步要求"全员可达"的唯一目的,是**防止某个活着的 DN 带着旧缓存跨过变更点**。这恰恰是我们可以用 fencing 来替代的部分。 + +### 2.6 为什么"必须所有 DN"——三个正确性场景 + +如果允许某个**活着但分区**的 DN 漏掉失效,可能产生的脏数据: + +1. **写路径用过期表 schema(类型/类别错乱)** + 写入校验入口 `TableHeaderSchemaValidator`(`validateInsertNodeMeasurements` L343 / `validateTableHeaderSchema4TsFile` L102)先加 `VALIDATE_VS_DELETION_TABLE` 读锁,然后 `DataNodeTableCache.getTableInWrite(...)`(L123 / L363)**直接读 `databaseTableMap`**。该方法对 FIELD 列的数据类型校验"交给上层",类型不一致不会在这层拦截。 + 设想列 `pressure` 在 CN 上由 `FLOAT` 改成 `DOUBLE`,但分区 DN 仍缓存 `FLOAT`:路由到该 DN 的写入会以 `DOUBLE` 落盘,而该 DN 的列定义却是 `FLOAT`——后续按 `FLOAT` 解码即得到**错误数值**,且**对客户端无任何报错**(静默类型损坏)。若错配发生在 TAG 与 FIELD 之间,物理存储路径完全不同(tag 进 deviceID、field 进 measurement),会产生**正常查询无法触及的结构性脏数据**。 + +2. 
**属性缓存脏读** + 查询读属性 `TableDeviceSchemaFetcher.tryGetTableDeviceInCache`(L413-452)→ `cache.getDeviceAttribute`(L424)**纯内存命中即返回**,不回 schema region。`DROP COLUMN`(属性列)经 `invalidateColumnCache` 把该列从每个设备的属性 map 移除;若某分区 DN 漏掉这次失效,它仍会把已删除列的旧值当作有效属性返回——**返回一个 schema 里已不存在的列的值**。 + +3. **DROP COLUMN:物理删除 + 幽灵数据(最危险)** + `DropTableColumnProcedure` 状态机: + + ``` + CHECK_AND_INVALIDATE_COLUMN → INVALIDATE_CACHE → EXECUTE_ON_REGIONS → DROP_COLUMN + ``` + + 顺序保证是:**先让所有 DN 失效缓存(此后没有新写入能写进该列)→ 再物理删除 TsFile/属性数据 → 最后在 CN 提交删列**。 + 若某 DN 漏掉 `INVALIDATE_CACHE` 且系统仍继续:数据被物理删除、CN schema 已删列,但该 DN 缓存里**该列仍存在**;新写入路由到它会**通过校验**并把该列数据写进 WAL/memtable——于是出现"存储里有、schema 里没有"的**幽灵列数据**;查询扫到这些字节会静默跳过或解码报错。属性列情形更糟:脏值落在 schema region 持久化存储里,副本间**持久性数据分叉**,难以自动 reconcile。 + +> 这三点正是用户所说"DN 上有几种 Cache,不清理则在网络分区时可能产生脏数据"的具体机理。 + +### 2.7 现状的不合理之处 + +把上面拼起来,问题的本质是: + +> **当前实现用"所有已注册 DN 必须同步 ack 缓存失效"来保证正确性,却没有区分"DN 真宕机"(缓存已随进程消失,本质安全)与"DN 活着但与 CN 分区"(缓存仍在,真正危险)。对前者本可放行,却一并判失败,于是牺牲了 DDL 的高可用。** + +证据: + +- 一个**真正宕机**的 DN,重启后必然走注册流程,从 CN 的 `runtimeConfiguration.getTableInfo()` 重建 `DataNodeTableCache`(`DataNode.java:523` 的 `init(...)`),**不可能用旧缓存服务任何请求**。它在宕机期间也不服务任何请求。对它而言,"等它 ack 失效"在逻辑上是多余的。 +- CN 侧其实**已经知道** DN 是否可达:`DataNodeHeartbeatCache` 通过 Phi-Accrual `failureDetector` 把失联 DN 标为 `Unknown`。树模型的 `DeleteDatabaseProcedure` 走的 `ConfigNodeProcedureEnv.invalidateCache`(L164-221)已经会**检查 NodeStatus、对 `Unknown` 重试 10 次 / 5s** ——说明"失效时参考 NodeStatus"这条路代码里已有先例,只是最终仍是"超时即失败",没有走到"放行"。 + +缺的那一环是:**没有任何机制保证一个"活着但分区"的 DN 会主动停止使用旧缓存**。只要补上这一环(DN 自我 fencing),"不可达"就能安全地等价于"已隔离",CN 就能放心放行。 + +--- + +## 3. 解决方案 + +### 3.1 设计目标 + +1. **正确性不回退**:任何已提交的表 schema 变更之后,集群中**不存在**任何 DN 用过期缓存接受写入或返回旧值(尤其是 DROP 列物理删除之前,必须保证没有 DN 还能写该列)。 +2. **可用性提升**:挂掉**少数** DN(典型:3 副本挂 1)时,表模型 DDL 仍能成功。 +3. **复用现有设施**:尽量基于现有心跳 / NodeStatus / 注册 resync,不引入新的重协议。 +4. 
**常态零额外开销**:全员存活时路径与现状一致,无新增等待。 + +### 3.2 核心思想:把"缓存有效性"绑定到"与 ConfigNode 的租约" + +引入一个概念:**DN 的表模型缓存只有在它"持有 CN 租约"期间才可信**。租约就用现有心跳承载——DN 持续收到 CN 心跳即续约;一旦在 `T_fence` 内收不到心跳,租约过期,DN 必须**自我隔离(self-fencing)**。 + +于是: + +- **DN 真宕机** → 进程没了,缓存没了,重启 resync,安全。 +- **DN 活着但分区** → `T_fence` 后租约过期,自我隔离(作废表缓存 + 对依赖缓存的表操作 fail-closed),不再可能产生脏数据。 +- 两种情况下,"CN 联系不上的 DN"在 `T_fence` 之后都**保证不会用旧缓存服务请求**。CN 据此放行。 + +### 3.3 组件一:DataNode 自我隔离(self-fencing)——新增 + +这是方案中**唯一全新的机制**,也是正确性的基石。 + +1. **记录最后心跳时间**:在 `getDataNodeHeartBeat`(`DataNodeInternalRPCServiceImpl.java:2226`)里记录"最近一次收到 CN 心跳"的单调时钟时间戳(DN 当前**不**记录,但 handler 就在那,改动很小)。 +2. **后台 fencing 检查**:DN 起一个轻量定时任务,若 `now - lastHeartbeatFromCN > T_fence`,进入 **FENCED** 态: + - 作废 `DataNodeTableCache`(`databaseTableMap` + `preUpdateTableMap`)与 `TableDeviceSchemaCache`(属性 + last)。 + - 设 `tableSchemaFenced = true`。 +3. **FENCED 态下 fail-closed**: + - 写入校验(`TableHeaderSchemaValidator`)与取 schema(`getTableInWrite` / `getTable`)在 FENCED 态下**不信任本地缓存**:要么回 CN 现拉,CN 不可达则**直接拒绝该操作**(fail-closed,宁可不可用也不写脏);要么干脆对表写入/查询返回"schema 暂不可用,请重试"。 + - 属性 / last 缓存查询同理:FENCED 态视为 miss,回源;回不了源则失败。 +4. 
**续约即恢复**:恢复收到 CN 心跳后,**先强制 resync**(组件二)再清除 FENCED 态。 + +> 失败语义:分区少数侧的客户端在 `T_fence` 后会被 fail-closed 拒绝表读写——这正是 CP 系统对少数派分区的**正确**行为;多数侧(含 CN)保持可用。 + +### 3.4 组件二:重连后强制 resync——增强现有路径 + +DN 从 FENCED 恢复(或重启注册)时,在**对外服务表请求之前**必须把缓存与 CN 对齐: + +- 复用现有注册 resync:重启路径已通过 `DataNodeTableCache.init(runtimeConfiguration.getTableInfo())`(`DataNode.java:523`)重建缓存。 +- 对"未重启、仅心跳恢复"的 FENCED→恢复路径,新增一次**主动全量拉取**(沿用 `getTable` 已有的 `fetchTables`/`ClusterConfigTaskExecutor` 回 CN 的能力):拉到当前 schema 版本后再清 FENCED。 +- 可选优化:在心跳响应里带一个**单调递增的 schema epoch**;DN 比对本地 epoch,落后才触发全量拉取,常态只续约不拉数据。 + +### 3.5 组件三:ConfigNode DDL 容忍"已隔离/已宕机"的 DN——核心改动 + +改造 §2.2 那一步"广播失效 + 任一失败即 setFailure"的逻辑。把失效广播的结果分三类处理,而不是一刀切失败: + +对每个未返回 SUCCESS 的 DN,查其 `NodeStatus` / 最近成功联系时间: + +| DN 情况 | 判定 | 处理 | +|---|---|---| +| `Running`(可达),但 RPC 报错 | 真错误(如 DN 内部异常) | 重试;仍失败则**失败 Procedure**(与现状一致) | +| `Unknown` / 失联,且失联时长 < `T_proceed` | 可能还没自我隔离 | **等待**至 `T_proceed`(或在此期间它恢复并 ack) | +| 失联时长(`hbAge`)≥ `T_proceed` | 保证已自我隔离或已宕机 | **视为安全,放行**;该 DN 恢复时由其**自驱 resync**,CN 无需记录(见通用方案 `cluster-metadata-ha-fencing-design.md` §2.3) | +| 已被移除 / 确认下线 | 不再是集群成员 | 放行 | + +放行后: + +- DDL 照常提交元数据(并对 DROP 列继续物理删除——此时已保证无活着的旧缓存 DN)。 +- 把"该 DN 需要 resync"持久化(或依赖组件二的 DN 自恢复 resync)。该 DN 恢复时被强制对齐后才重新服务(组件二保证它在对齐前处于 FENCED,不会脏读/脏写)。 + +> 这本质上是把树模型 `ConfigNodeProcedureEnv.invalidateCache`(L164-221)已有的"NodeStatus 感知 + 重试",从"超时即失败"改成"确认隔离后放行"。 + +### 3.6 时序与正确性论证(lease ordering) + +记号: +- `T_hb`:心跳间隔。 +- `T_fence`:DN 自我隔离阈值(收不到心跳超过它就 fence)。 +- `T_proceed`:CN 判定"该失联 DN 已安全隔离"所需的失联时长。 + +**安全不变式**:CN 在提交变更(及 DROP 列物理删除)时,对每个未 ack 的 DN,要么它已 ack,要么它**已经自我隔离**。 + +**为什么 `T_proceed > T_fence + margin` 即可保证**: +心跳方向是 CN→DN。CN 对某 DN 的"最近一次成功联系"时刻 `t_cn`,在真实时间上**不早于** DN"最近一次收到心跳"的时刻 `t_dn`(DN 先收到、CN 才拿到响应)。DN 在 `t_dn + T_fence` 自我隔离。因此只要 CN 自 `t_cn` 起又过了 `T_fence + margin`(`margin` 覆盖时钟漂移、DN fence 检查周期、网络抖动),就能确信 `now ≥ t_dn + T_fence`,即该 DN **已隔离**。取 `T_proceed = T_fence + margin` 成立。 + +**与 commit 不对称的呼应**:§2.5 已说明系统本就容忍 commit 
阶段缺席(最终一致)。本方案只是把"变更前失效"这步从"全员强同步"放宽为"全员 ack 或 已隔离",正确性边界没有降低——因为"已隔离"DN 与"已清缓存"DN 对外行为等价(都不会用旧缓存服务)。 + +**DROP 列的特别说明**:物理删除(`EXECUTE_ON_REGIONS`)必须在"全员 ack 或 已隔离"**之后**才执行。隔离 DN 在恢复 resync 前处于 FENCED、不接受该表写入,故不会在删除后再写出幽灵列。顺序不变,安全。 + +### 3.7 各操作的处理要点 + +| 操作 | 现状失败点 | 方案后 | +|---|---|---| +| CREATE TABLE | PRE_RELEASE 广播 `PRE_UPDATE_TABLE` 全员必达 | 失联且已隔离的 DN 放行;它恢复 resync 时自然学到新表 | +| ADD COLUMN / SET PROPERTIES / RENAME | `preRelease` 全员必达 | 同上;隔离 DN 恢复后拉到新 schema | +| DROP COLUMN | `INVALIDATE_CACHE` 全员必达,且卡住后续物理删除 | 隔离 DN 放行后再删数据;隔离 DN 恢复前 FENCED,不会写幽灵列 | +| DROP TABLE | `invalidateTableCache` 全员必达 | 同 DROP COLUMN | +| COMMIT_RELEASE | 本就只告警 | 不变 | + +### 3.8 备选方案(讨论用) + +**备选 A:写路径 schema 版本号 fencing(更彻底但更侵入)** +给每张表一个在 CN consensus 提交的单调 schema 版本 `V`,并下沉到数据写入路径:每次写入用所用 schema 的 `V` 打戳,region 侧(持有权威已提交 `V`)拒绝**低版本**写入。这样即使某 DN 缓存过期,它的写入也会在 region 层被版本校验拦下,从根上杜绝脏写。 +- 优点:不依赖时间/租约推理,纯版本号比较,最严格。 +- 缺点:需把表 schema 版本贯穿到数据写入 consensus 路径(当前数据 region 并不知道表 schema 版本),改动面大;对读路径脏读仍需另行处理。 +- 建议:作为长期演进选项,可与组件一/三组合(fencing 解决可用性,版本号兜底正确性)。 + +**备选 B:仅缩小广播范围** +有人可能想"只对受影响 schema region 的副本所在 DN 广播"。**不可行**:表缓存存在于**每个** DN(任何 DN 都可能做查询协调者并缓存任意表 schema、也可能承载该表数据 region),不是只在副本 DN 上。所以无法用 quorum 替代全员。这条排除,正好反衬出组件一(让每个 DN 自我兜底)才是对的方向。 + +### 3.9 实施计划与涉及文件 + +**Phase 1:DataNode 自我 fencing(正确性基石,先落地)** +- `DataNodeInternalRPCServiceImpl.getDataNodeHeartBeat`(L2226):记录 `lastHeartbeatFromCnNanos`。 +- 新增 fencing 检查任务 + FENCED 状态(建议挂在 schema engine / `DataNodeTableCache` 附近)。 +- `DataNodeTableCache`:FENCED 态下 `getTableInWrite` / `getTable` 不信任本地缓存;提供 `fenceAll()` / `clearFence()`。 +- `TableDeviceSchemaCache`:FENCED 态下属性/last 查询按 miss 处理。 +- `TableHeaderSchemaValidator`(L102/L343):FENCED 态写入校验 fail-closed。 +- 配置项:`T_fence`(默认 **20s**,与 `failureDetectorFixedThresholdInMs` 对齐,避免误 fence 健康 DN)。表 DDL 中 **CREATE TABLE / ADD COLUMN 等加性操作即时放行**(陈旧 DN 对未知实体天然 fail-closed),只有 DROP/RENAME/SET 等**破坏/语义变更类**在确有 DN 不可达时等 `T_proceed = T_fence + margin ≈ 25s`(`margin≈5s` 内部派生)。分类与论证见通用方案 §2.6/§3。 
+ +**Phase 2:DN 恢复 resync** +- 心跳响应增加 `schemaEpoch`(可选优化);DN 心跳恢复后落后才全量拉取。 +- 复用 `DataNode.java:523` 注册 resync;新增"心跳恢复"分支的主动拉取。 + +**Phase 3:ConfigNode DDL 放行逻辑(兑现可用性)** +- `SchemaUtils.preReleaseTable / commitReleaseTable / rollbackPreRelease`(≈L243-318):返回结果区分"真失败 / 失联可放行"。 +- `AbstractAlterOrDropTableProcedure.preRelease`(L89-110)、`CreateTableProcedure`(L153-169)、`DropTableProcedure`(L141-167)、`DropTableColumnProcedure`(L152-188):把"任一失败即 setFailure"改为"按 §3.5 表格分类处理"。 +- 引入基于 `hbAge` 的判定(参考并改造 `ConfigNodeProcedureEnv.invalidateCache` L164-221),落地"加性即时放行 / 破坏类等 `T_proceed = T_fence + margin`"。 +- 隔离 DN 恢复时由其**自驱 resync**,CN 无需记录 laggards(见通用方案 §2.3)。 + +**Phase 4(可选):备选 A 的写路径版本号兜底**——长期演进。 + +**测试** +- 复用 / 扩展现有 IT:在 1C3D 集群停 1 DN,验证 CREATE / ADD / DROP / SET / RENAME / DROP TABLE 均成功。 +- 注入"活着但分区"场景(阻断 DN↔CN 心跳但保留 DN↔client):验证 `T_fence` 后该 DN 对表读写 fail-closed,不产生脏数据;恢复后 resync 正确。 +- DROP 列并发写:验证隔离 DN 不会写出幽灵列。 +- 仅运行本次新增/改动的 IT,不跑全量。 + +### 3.10 风险与权衡 + +- **DDL 延迟**:仅当确有 DN 失联时,CN 需等待至 `T_proceed` 才放行;全员存活时无额外等待。可接受。 +- **少数派读不可用**:分区少数侧的 DN 在 `T_fence` 后对表读写 fail-closed,牺牲该侧可用性换正确性——符合 CP 取舍。若未来要给"可容忍轻微陈旧"的读开口子,可在备选 A 的版本号体系下单独放宽,但默认 fail-closed。 +- **`T_fence` 取值**:默认 20s(与失败检测阈值对齐,避免误 fence)。破坏类操作的下线等待 ~25s 主要由加性快速路径抵消;全员存活恒零等待。详见通用方案 §2.6。 +- **时钟假设**:论证用单调时钟与保守 margin,不依赖跨节点钟同步。 + +--- + +## 4. 附:关键文件索引 + +CN 侧: +- `confignode/.../procedure/impl/schema/SchemaUtils.java` —— `preReleaseTable`/`commitReleaseTable`/`rollbackPreRelease`(≈L243-318) +- `confignode/.../procedure/impl/schema/table/AbstractAlterOrDropTableProcedure.java` —— `preRelease`/`commitRelease`/`rollbackPreRelease`,"All dataNodes must clear..." 
注释 +- `confignode/.../procedure/impl/schema/table/{CreateTable,DropTable,AddTableColumn,DropTableColumn,SetTableProperties,RenameTableColumn}Procedure.java` +- `confignode/.../procedure/env/ConfigNodeProcedureEnv.java` —— 树模型 `invalidateCache`(L164-221,NodeStatus 重试先例) +- `confignode/.../manager/node/NodeManager.java` —— `getRegisteredDataNodeLocations`(L688-697) +- `confignode/.../manager/load/cache/node/DataNodeHeartbeatCache.java` —— Phi-Accrual 失败检测 → `Unknown` +- `node-commons/.../client/request/AsyncRequestManager.java` —— `MAX_RETRY_NUM = 6` + +DN 侧: +- `datanode/.../schemaengine/table/DataNodeTableCache.java` —— 表 schema 缓存 + 两阶段协议(`databaseTableMap` L64 / `preUpdateTableMap` L67 / `getTableInWrite` L316 / `getTable` L329) +- `datanode/.../queryengine/plan/relational/metadata/fetcher/cache/TableDeviceSchemaCache.java` —— 属性 + last 缓存(`getDeviceAttribute` L141 / `invalidate` L614,L676) +- `datanode/.../queryengine/plan/relational/metadata/fetcher/TableDeviceSchemaFetcher.java` —— 属性读缓存路径(`tryGetTableDeviceInCache` L413-452) +- `datanode/.../queryengine/plan/relational/metadata/fetcher/TableHeaderSchemaValidator.java` —— 写入校验入口(L102 / L343) +- `datanode/.../queryengine/plan/analyze/lock/SchemaLockType.java` —— `VALIDATE_VS_DELETION_TABLE`(L52-62) +- `datanode/.../protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java` —— RPC handler:`updateTable` L1813 / `invalidateTableCache` L1842 / `invalidateColumnCache` L2033 / `deleteColumnData` L2051 / `getDataNodeHeartBeat` L2226 +- `datanode/.../schemaengine/schemaregion/attribute/DeviceAttributeStore.java` —— 属性权威存储(snapshot 持久化) +- `datanode/.../service/DataNode.java` —— 启动 resync(`DataNodeTableCache.init` L523) From 109c15552124cbae6a3eb8c4e9f908e11d2ed10c Mon Sep 17 00:00:00 2001 From: JackieTien97 Date: Wed, 3 Jun 2026 13:59:42 +0800 Subject: [PATCH 13/17] Wire SET/UNSET TEMPLATE through ClusterCachePropagator Add SchemaUtils.broadcastTemplateUpdate(cm, Supplier): the single place that broadcasts 
UPDATE_TEMPLATE via ClusterCachePropagator, proceeding once every unreachable DataNode is provably self-fenced. The request is rebuilt from the supplier on each attempt because the propagator may re-broadcast on WAIT and TUpdateTemplateReq's binary field is ByteBuffer-backed (reusing one request could re-send a consumed, empty payload). SetTemplateProcedure (ADD_TEMPLATE_PRE_SET_INFO forward step) and UnsetTemplateProcedure (INVALIDATE_TEMPLATE_SET_INFO) now use it instead of hard-failing on the first unreachable DataNode; both keep their own messages and their state advance / throw-on-failure semantics. The region-task validation (VALIDATE_TIMESERIES_EXISTENCE) is unchanged. Template procedure tests pass. --- .../procedure/impl/schema/SchemaUtils.java | 26 +++++++++++ .../impl/schema/SetTemplateProcedure.java | 43 ++++++++----------- .../impl/schema/UnsetTemplateProcedure.java | 39 ++++++++--------- 3 files changed, 62 insertions(+), 46 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java index d0a4075070ae7..4e7b43669a587 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java @@ -46,6 +46,7 @@ import org.apache.iotdb.mpp.rpc.thrift.TCountPathsUsingTemplateResp; import org.apache.iotdb.mpp.rpc.thrift.TInvalidateMatchedSchemaCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TUpdateTableReq; +import org.apache.iotdb.mpp.rpc.thrift.TUpdateTemplateReq; import org.apache.iotdb.rpc.RpcUtils; import org.apache.iotdb.rpc.TSStatusCode; @@ -62,6 +63,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.function.Supplier; import java.util.stream.Collectors; public class SchemaUtils { @@ -370,6 
+372,30 @@ public static boolean invalidateMatchedSchemaCache( }); } + /** + * Broadcast an UPDATE_TEMPLATE to all DataNodes through {@link ClusterCachePropagator}: proceed + * once every unreachable DataNode is provably self-fenced (it fails closed on its template cache + * and resyncs on recovery), instead of hard-failing on the first unreachable DataNode. Returns + * whether it is safe to proceed. + * + * <p>
The request is rebuilt from {@code requestSupplier} on every attempt: the propagator may + * re-broadcast while waiting, and {@code TUpdateTemplateReq}'s binary field is backed by a {@link + * ByteBuffer}, so reusing one request could re-send a consumed (empty) payload. + */ + public static boolean broadcastTemplateUpdate( + final ConfigManager configManager, final Supplier requestSupplier) { + return new ClusterCachePropagator(configManager) + .propagate( + targets -> { + final DataNodeAsyncRequestContext clientHandler = + new DataNodeAsyncRequestContext<>( + CnToDnAsyncRequestType.UPDATE_TEMPLATE, requestSupplier.get(), targets); + CnToDnInternalServiceAsyncRequestManager.getInstance() + .sendAsyncRequestWithRetry(clientHandler); + return clientHandler.getResponseMap(); + }); + } + public static TSStatus executeInConsensusLayer( final ConfigPhysicalPlan plan, final ConfigNodeProcedureEnv env, final Logger logger) { TSStatus status; diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTemplateProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTemplateProcedure.java index 55fffedad6145..8f24fe92eefe7 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTemplateProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTemplateProcedure.java @@ -215,30 +215,25 @@ private void preReleaseTemplate(final ConfigNodeProcedureEnv env) { return; } - final TUpdateTemplateReq req = new TUpdateTemplateReq(); - req.setType(TemplateInternalRPCUpdateType.ADD_TEMPLATE_PRE_SET_INFO.toByte()); - req.setTemplateInfo( - TemplateInternalRPCUtil.generateAddTemplateSetInfoBytes(template, templateSetPath)); - - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - new 
DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.UPDATE_TEMPLATE, req, dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - final Map statusMap = clientHandler.getResponseMap(); - for (final Map.Entry entry : statusMap.entrySet()) { - if (entry.getValue().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.warn( - ProcedureMessages.FAILED_TO_SYNC_TEMPLATE_PRE_SET_INFO_ON_PATH_TO, - templateName, - templateSetPath, - dataNodeLocationMap.get(entry.getKey())); - setFailure( - new ProcedureException( - new MetadataException(ProcedureMessages.PRE_SET_TEMPLATE_FAILED))); - return; - } + // Proceed once every unreachable DataNode is provably self-fenced (it fails closed on its + // template cache and resyncs on recovery) instead of hard-failing on the first unreachable one. + if (!SchemaUtils.broadcastTemplateUpdate( + env.getConfigManager(), + () -> { + final TUpdateTemplateReq req = new TUpdateTemplateReq(); + req.setType(TemplateInternalRPCUpdateType.ADD_TEMPLATE_PRE_SET_INFO.toByte()); + req.setTemplateInfo( + TemplateInternalRPCUtil.generateAddTemplateSetInfoBytes(template, templateSetPath)); + return req; + })) { + LOGGER.warn( + ProcedureMessages.FAILED_TO_SYNC_TEMPLATE_PRE_SET_INFO_ON_PATH_TO, + templateName, + templateSetPath, + "an unreachable DataNode is not provably fenced"); + setFailure( + new ProcedureException(new MetadataException(ProcedureMessages.PRE_SET_TEMPLATE_FAILED))); + return; } setNextState(SetTemplateState.VALIDATE_TIMESERIES_EXISTENCE); } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/UnsetTemplateProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/UnsetTemplateProcedure.java index 1fd7aefb33065..d7bb9d0894660 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/UnsetTemplateProcedure.java +++ 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/UnsetTemplateProcedure.java @@ -155,29 +155,24 @@ private void invalidateCache(final ConfigNodeProcedureEnv env) { } private void executeInvalidateCache(final ConfigNodeProcedureEnv env) throws ProcedureException { - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final TUpdateTemplateReq invalidateTemplateSetInfoReq = new TUpdateTemplateReq(); - invalidateTemplateSetInfoReq.setType( - TemplateInternalRPCUpdateType.INVALIDATE_TEMPLATE_SET_INFO.toByte()); - invalidateTemplateSetInfoReq.setTemplateInfo(getInvalidateTemplateSetInfo()); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.UPDATE_TEMPLATE, - invalidateTemplateSetInfoReq, - dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - final Map statusMap = clientHandler.getResponseMap(); - for (final TSStatus status : statusMap.values()) { + // Proceed once every unreachable DataNode is provably self-fenced (it fails closed on its + // template cache and resyncs on recovery) instead of hard-failing on the first unreachable one. 
+ if (!SchemaUtils.broadcastTemplateUpdate( + env.getConfigManager(), + () -> { + final TUpdateTemplateReq invalidateTemplateSetInfoReq = new TUpdateTemplateReq(); + invalidateTemplateSetInfoReq.setType( + TemplateInternalRPCUpdateType.INVALIDATE_TEMPLATE_SET_INFO.toByte()); + invalidateTemplateSetInfoReq.setTemplateInfo(getInvalidateTemplateSetInfo()); + return invalidateTemplateSetInfoReq; + })) { // all dataNodes must clear the related template cache - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error( - ProcedureMessages.FAILED_TO_INVALIDATE_TEMPLATE_CACHE_OF_TEMPLATE_SET_ON, - template.getName(), - path); - throw new ProcedureException( - new MetadataException(ProcedureMessages.INVALIDATE_TEMPLATE_CACHE_FAILED)); - } + LOGGER.error( + ProcedureMessages.FAILED_TO_INVALIDATE_TEMPLATE_CACHE_OF_TEMPLATE_SET_ON, + template.getName(), + path); + throw new ProcedureException( + new MetadataException(ProcedureMessages.INVALIDATE_TEMPLATE_CACHE_FAILED)); } } From 67a9746ff70f98cf34d6b5f1fd61aa08a29fdb28 Mon Sep 17 00:00:00 2001 From: JackieTien97 Date: Wed, 3 Jun 2026 14:14:15 +0800 Subject: [PATCH 14/17] Wire SetTTL DataNode broadcast through ClusterCachePropagator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SetTTLProcedure's UPDATE_DATANODE_CACHE step (and the symmetric rollback restore) broadcast SET_TTL to all DataNodes after the authoritative ConfigNode write. Both used to hard-fail on the first unreachable DataNode, which also forced a full rollback of the committed TTL whenever any DataNode was down. Both now go through a new overridable broadcastTTLAndDecide() seam backed by ClusterCachePropagator: proceed once every unreachable DataNode is provably self-fenced (it fails closed on TTL in compaction and resyncs on recovery), and fail (→ rollback) only when a live DataNode is genuinely unacked. 
TSetTTLReq has no ByteBuffer field, so the request is reused safely across the propagator's re-broadcasts. The test overrides broadcastTTLAndDecide instead of sendTTLRequest, keeping the rollback-on-live-failure scenario deterministic and fast (no real propagator sleep). All 6 SetTTL tests pass. --- .../impl/schema/SetTTLProcedure.java | 58 +++++++------------ .../impl/schema/SetTTLProcedureTest.java | 29 ++-------- 2 files changed, 25 insertions(+), 62 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedure.java index dca79a02366f6..31641528269af 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedure.java @@ -34,6 +34,7 @@ import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeEnrichedPlan; import org.apache.iotdb.confignode.i18n.ConfigNodeMessages; import org.apache.iotdb.confignode.i18n.ProcedureMessages; +import org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.confignode.procedure.exception.ProcedureException; import org.apache.iotdb.confignode.procedure.impl.StateMachineProcedure; @@ -115,13 +116,8 @@ void setConfigNodeTTL(final ConfigNodeProcedureEnv env) { } void updateDataNodeTTL(final ConfigNodeProcedureEnv env) { - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - sendTTLRequest( - dataNodeLocationMap, - buildSetTTLReq(plan.getPathPattern(), plan.getTTL(), plan.isDataBase())); - if (hasFailedDataNode(clientHandler)) { + if (!broadcastTTLAndDecide( + env, 
buildSetTTLReq(plan.getPathPattern(), plan.getTTL(), plan.isDataBase()))) { LOGGER.error(ProcedureMessages.FAILED_TO_UPDATE_TTL_CACHE_OF_DATANODE); setFailure( new ProcedureException( @@ -129,6 +125,17 @@ void updateDataNodeTTL(final ConfigNodeProcedureEnv env) { } } + /** + * Broadcast the TTL update to all DataNodes and decide whether it is safe to proceed: proceed + * once every unreachable DataNode is provably self-fenced (it fails closed on TTL in compaction + * and resyncs on recovery) instead of hard-failing on the first unreachable DataNode. + * Package-private and overridable for tests. + */ + boolean broadcastTTLAndDecide(final ConfigNodeProcedureEnv env, final TSetTTLReq req) { + return new ClusterCachePropagator(env.getConfigManager()) + .propagate(targets -> sendTTLRequest(targets, req).getResponseMap()); + } + private void capturePreviousTTLState(final ConfigNodeProcedureEnv env) { if (previousTTLStateCaptured) { return; @@ -168,19 +175,6 @@ private TSetTTLReq buildSetTTLReq( Collections.singletonList(String.join(".", pathPattern)), ttl, isDataBase); } - private boolean hasFailedDataNode( - final DataNodeAsyncRequestContext clientHandler) { - if (!clientHandler.getRequestIndices().isEmpty()) { - return true; - } - for (TSStatus status : clientHandler.getResponseMap().values()) { - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - return true; - } - } - return false; - } - private long getTTLOrDefault(final ConfigNodeProcedureEnv env, final String[] pathPattern) { final long ttl = env.getConfigManager().getTTLManager().getTTL(pathPattern); return ttl == TTLCache.NULL_TTL ? 
TTL_NOT_EXIST : ttl; @@ -220,30 +214,20 @@ private void restoreTTLOnConfigNode( } private void rollbackDataNodeTTL(final ConfigNodeProcedureEnv env) throws ProcedureException { - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - restoreTTLOnDataNodes(dataNodeLocationMap, plan.getPathPattern(), previousTTL); + restoreTTLOnDataNodes(env, plan.getPathPattern(), previousTTL); if (plan.isDataBase()) { restoreTTLOnDataNodes( - dataNodeLocationMap, - getDatabaseWildcardPathPattern(plan.getPathPattern()), - previousDatabaseWildcardTTL); + env, getDatabaseWildcardPathPattern(plan.getPathPattern()), previousDatabaseWildcardTTL); } } private void restoreTTLOnDataNodes( - final Map dataNodeLocationMap, - final String[] pathPattern, - final long ttl) + final ConfigNodeProcedureEnv env, final String[] pathPattern, final long ttl) throws ProcedureException { - if (dataNodeLocationMap.isEmpty()) { - return; - } - final DataNodeAsyncRequestContext clientHandler = - sendTTLRequest( - dataNodeLocationMap, - buildSetTTLReq(pathPattern, ttl == TTL_NOT_EXIST ? TTLCache.NULL_TTL : ttl, false)); - if (hasFailedDataNode(clientHandler)) { + // Same proceed-past-fenced semantics as the forward update: a down DataNode must not block + // rollback (it resyncs TTL on recovery); only a live unacked DataNode fails it. + if (!broadcastTTLAndDecide( + env, buildSetTTLReq(pathPattern, ttl == TTL_NOT_EXIST ? 
TTLCache.NULL_TTL : ttl, false))) { throw new ProcedureException( new MetadataException( "Rollback dataNode ttl cache failed for " + String.join(".", pathPattern))); diff --git a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedureTest.java b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedureTest.java index cb09c23659c39..a1813c1642cca 100644 --- a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedureTest.java +++ b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedureTest.java @@ -25,8 +25,6 @@ import org.apache.iotdb.commons.exception.IllegalPathException; import org.apache.iotdb.commons.path.PartialPath; import org.apache.iotdb.commons.schema.ttl.TTLCache; -import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; -import org.apache.iotdb.confignode.client.async.handlers.DataNodeAsyncRequestContext; import org.apache.iotdb.confignode.consensus.request.write.database.SetTTLPlan; import org.apache.iotdb.confignode.manager.ConfigManager; import org.apache.iotdb.confignode.manager.TTLManager; @@ -355,30 +353,11 @@ TSStatus writeConfigNodePlan(final ConfigNodeProcedureEnv env, final SetTTLPlan } @Override - DataNodeAsyncRequestContext sendTTLRequest( - final Map dataNodeLocationMap, final TSetTTLReq req) { + boolean broadcastTTLAndDecide(final ConfigNodeProcedureEnv env, final TSetTTLReq req) { requests.add(copyRequest(req)); - - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.SET_TTL, copyRequest(req), dataNodeLocationMap); - final List requestIds = new ArrayList<>(clientHandler.getNodeLocationMap().keySet()); - final boolean shouldFail = failFirstDataNodeUpdate && requestCount++ == 0; - - for (Integer requestId : requestIds) { - clientHandler - .getResponseMap() - .put( - requestId, - new 
TSStatus( - shouldFail - ? TSStatusCode.EXECUTE_STATEMENT_ERROR.getStatusCode() - : TSStatusCode.SUCCESS_STATUS.getStatusCode())); - if (!shouldFail) { - clientHandler.getNodeLocationMap().remove(requestId); - } - } - return clientHandler; + // Simulate a live, un-acked DataNode on the first broadcast: the propagator verdict is FAIL + // (which triggers rollback). Later broadcasts (the rollback restore) proceed. + return !(failFirstDataNodeUpdate && requestCount++ == 0); } private SetTTLPlan copyPlan(final SetTTLPlan plan) { From 8f58c4e2365eb823fa198b97cd20501328563268 Mon Sep 17 00:00:00 2001 From: JackieTien97 Date: Wed, 3 Jun 2026 14:21:46 +0800 Subject: [PATCH 15/17] Wire DeleteDatabase sync cache-invalidation through ClusterCachePropagator ConfigNodeProcedureEnv.invalidateCache (the DeleteDatabase INVALIDATE_CACHE step) synchronously invalidates partition+schema cache on every DataNode. It used to poll an Unknown DataNode for 5s then hard-fail, so a single down DataNode broke DROP DATABASE. It now runs through ClusterCachePropagator: a per-round closure synchronously invalidates each reachable DataNode (SUCCESS only if both partition and schema succeed) and reports Unknown/erroring DataNodes as not-acked WITHOUT sync-sending to them (a sync send to a dead DataNode would block on connect timeouts). The propagator then proceeds once every not-acked DataNode is provably self-fenced (it fails closed and resyncs on recovery) and fails only on a live unacked DataNode. This runs before DELETE_DATABASE_SCHEMA, so the delete-after-PROCEED ordering holds. Removed the now-dead Unknown-polling loop and unused getNodeManager() helper. DeleteDatabaseProcedureTest passes. 
--- .../procedure/env/ConfigNodeProcedureEnv.java | 106 +++++++++--------- 1 file changed, 51 insertions(+), 55 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java index 960d0a7977f51..6b3428cffeaa7 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java @@ -22,7 +22,6 @@ import org.apache.iotdb.common.rpc.thrift.TConfigNodeLocation; import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; import org.apache.iotdb.common.rpc.thrift.TConsensusGroupType; -import org.apache.iotdb.common.rpc.thrift.TDataNodeConfiguration; import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; import org.apache.iotdb.common.rpc.thrift.TRegionReplicaSet; import org.apache.iotdb.common.rpc.thrift.TSStatus; @@ -46,6 +45,7 @@ import org.apache.iotdb.confignode.exception.AddPeerException; import org.apache.iotdb.confignode.manager.ConfigManager; import org.apache.iotdb.confignode.manager.consensus.ConsensusManager; +import org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator; import org.apache.iotdb.confignode.manager.load.LoadManager; import org.apache.iotdb.confignode.manager.load.cache.region.RegionHeartbeatSample; import org.apache.iotdb.confignode.manager.node.NodeManager; @@ -95,7 +95,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.ReentrantLock; import java.util.stream.Collectors; @@ -162,62 +161,63 @@ public void preDeleteDatabase( * @throws TException Thrift IOE */ public boolean invalidateCache(final String databaseName) throws IOException, TException { - final List allDataNodes = 
getNodeManager().getRegisteredDataNodes(); final TInvalidateCacheReq invalidateCacheReq = new TInvalidateCacheReq(); invalidateCacheReq.setStorageGroup(true); invalidateCacheReq.setFullPath(databaseName); - for (final TDataNodeConfiguration dataNodeConfiguration : allDataNodes) { - final int dataNodeId = dataNodeConfiguration.getLocation().getDataNodeId(); - - // If the node is not alive, retry for up to 10 times - NodeStatus nodeStatus = getLoadManager().getNodeStatus(dataNodeId); - int retryNum = 10; - if (nodeStatus == NodeStatus.Unknown) { - for (int i = 0; i < retryNum && nodeStatus == NodeStatus.Unknown; i++) { - try { - TimeUnit.MILLISECONDS.sleep(500); - } catch (final InterruptedException e) { - LOG.error("Sleep failed in ConfigNodeProcedureEnv: ", e); - Thread.currentThread().interrupt(); - break; - } - nodeStatus = getLoadManager().getNodeStatus(dataNodeId); - } - } + // Proceed once every unreachable DataNode is provably self-fenced (it fails closed on its + // caches + // and resyncs on recovery), instead of hard-failing whenever any DataNode is Unknown. This runs + // before the physical database-schema delete in the state machine, so the "delete only after + // PROCEED" ordering holds. (throws kept for source compatibility with callers.) + return new ClusterCachePropagator(configManager) + .propagate(targets -> invalidateDatabaseCacheOnce(targets, invalidateCacheReq)); + } - if (nodeStatus == NodeStatus.Unknown) { - LOG.warn( - "Invalidate cache failed, because DataNode {} is Unknown", - dataNodeConfiguration.getLocation().getInternalEndPoint()); - return false; + /** + * One broadcast round of the database cache invalidation over {@code targets}: synchronously + * invalidate partition then schema cache on each DataNode and report SUCCESS for a DataNode only + * if both succeeded. 
Unknown DataNodes are not contacted (a sync send would block on connect + * timeouts) and are reported as not-acked, so the {@link ClusterCachePropagator} can decide + * whether they are provably fenced. + */ + private Map invalidateDatabaseCacheOnce( + final Map targets, final TInvalidateCacheReq invalidateCacheReq) { + final Map result = new HashMap<>(); + for (final Map.Entry entry : targets.entrySet()) { + final int dataNodeId = entry.getKey(); + final TSStatus notAcked = new TSStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR.getStatusCode()); + if (getLoadManager().getNodeStatus(dataNodeId) == NodeStatus.Unknown) { + result.put(dataNodeId, notAcked); + continue; } - - // Always invalidate PartitionCache first - final TSStatus invalidatePartitionStatus = - (TSStatus) - SyncDataNodeClientPool.getInstance() - .sendSyncRequestToDataNodeWithRetry( - dataNodeConfiguration.getLocation().getInternalEndPoint(), - invalidateCacheReq, - CnToDnSyncRequestType.INVALIDATE_PARTITION_CACHE); - - final TSStatus invalidateSchemaStatus = - (TSStatus) - SyncDataNodeClientPool.getInstance() - .sendSyncRequestToDataNodeWithRetry( - dataNodeConfiguration.getLocation().getInternalEndPoint(), - invalidateCacheReq, - CnToDnSyncRequestType.INVALIDATE_SCHEMA_CACHE); - - if (!verifySucceed(invalidatePartitionStatus, invalidateSchemaStatus)) { - LOG.error( - "Invalidate cache failed, invalidate partition cache status is {}, invalidate schemaengine cache status is {}", - invalidatePartitionStatus, - invalidateSchemaStatus); - return false; + try { + // Always invalidate PartitionCache first. 
+ final TSStatus invalidatePartitionStatus = + (TSStatus) + SyncDataNodeClientPool.getInstance() + .sendSyncRequestToDataNodeWithRetry( + entry.getValue().getInternalEndPoint(), + invalidateCacheReq, + CnToDnSyncRequestType.INVALIDATE_PARTITION_CACHE); + final TSStatus invalidateSchemaStatus = + (TSStatus) + SyncDataNodeClientPool.getInstance() + .sendSyncRequestToDataNodeWithRetry( + entry.getValue().getInternalEndPoint(), + invalidateCacheReq, + CnToDnSyncRequestType.INVALIDATE_SCHEMA_CACHE); + result.put( + dataNodeId, + verifySucceed(invalidatePartitionStatus, invalidateSchemaStatus) + ? invalidatePartitionStatus + : notAcked); + } catch (final Exception e) { + LOG.warn( + "Invalidate cache failed for DataNode {}", entry.getValue().getInternalEndPoint(), e); + result.put(dataNodeId, notAcked); } } - return true; + return result; } public boolean verifySucceed(TSStatus... status) { @@ -876,10 +876,6 @@ private ConsensusManager getConsensusManager() { return configManager.getConsensusManager(); } - private NodeManager getNodeManager() { - return configManager.getNodeManager(); - } - private ClusterSchemaManager getClusterSchemaManager() { return configManager.getClusterSchemaManager(); } From 1e2535530cd1474d40fb2ae8932686f4a0a9cdc5 Mon Sep 17 00:00:00 2001 From: JackieTien97 Date: Wed, 3 Jun 2026 14:32:34 +0800 Subject: [PATCH 16/17] Fix AuthOperationProcedure silent permission-staleness hole The DATANODE_AUTHCACHE_INVALIDING step broadcast INVALIDATE_PERMISSION_CACHE and, after datanode_token_timeout_ms, SILENTLY DROPPED any still-unacked DataNode from the list - leaving a live DataNode serving a just-revoked permission until its own token timeout. (Phase 1 already closed the fenced- DataNode case via DN-side fail-closed; this closes the live-transiently-unacked case.) 
The step now runs through ClusterCachePropagator over the live registered DataNodes: it proceeds once every unreachable DataNode is provably self-fenced (it fails closed on auth and resyncs on recovery) and fails only when a live DataNode stays unacked - never silently dropping one. Unknown DataNodes are not sync-contacted (avoids connect-timeout stalls) and are reported as not-acked for the verdict. Fields/serialization are unchanged for procedure-restart compatibility (dataNodesToInvalid is now vestigial). AuthOperationProcedureTest passes. --- .../impl/sync/AuthOperationProcedure.java | 81 +++++++++++++------ 1 file changed, 57 insertions(+), 24 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/sync/AuthOperationProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/sync/AuthOperationProcedure.java index 9011267525361..8eca6e9ec44ab 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/sync/AuthOperationProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/sync/AuthOperationProcedure.java @@ -20,7 +20,9 @@ package org.apache.iotdb.confignode.procedure.impl.sync; import org.apache.iotdb.common.rpc.thrift.TDataNodeConfiguration; +import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; import org.apache.iotdb.common.rpc.thrift.TSStatus; +import org.apache.iotdb.commons.cluster.NodeStatus; import org.apache.iotdb.commons.conf.CommonConfig; import org.apache.iotdb.commons.conf.CommonDescriptor; import org.apache.iotdb.commons.exception.IoTDBException; @@ -31,6 +33,7 @@ import org.apache.iotdb.confignode.consensus.request.write.auth.AuthorPlan; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeEnrichedPlan; import org.apache.iotdb.confignode.i18n.ProcedureMessages; +import org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator; import 
org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.confignode.procedure.exception.ProcedureException; import org.apache.iotdb.confignode.procedure.impl.node.AbstractNodeProcedure; @@ -49,8 +52,9 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; -import java.util.Iterator; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Objects; import static org.apache.iotdb.confignode.procedure.state.auth.AuthOperationProcedureState.DATANODE_AUTHCACHE_INVALIDING; @@ -96,34 +100,25 @@ protected Flow executeFromState(ConfigNodeProcedureEnv env, AuthOperationProcedu writePlan(env); return Flow.HAS_MORE_STATE; case DATANODE_AUTHCACHE_INVALIDING: - TInvalidatePermissionCacheReq req = new TInvalidatePermissionCacheReq(); - TSStatus status; + final TInvalidatePermissionCacheReq req = new TInvalidatePermissionCacheReq(); req.setUsername(user); req.setRoleName(role); - Iterator> it = dataNodesToInvalid.iterator(); - while (it.hasNext()) { - Pair pair = it.next(); - if (pair.getRight() + this.timeoutMS < System.currentTimeMillis()) { - it.remove(); - continue; - } - status = - (TSStatus) - SyncDataNodeClientPool.getInstance() - .sendSyncRequestToDataNodeWithRetry( - pair.getLeft().getLocation().getInternalEndPoint(), - req, - CnToDnSyncRequestType.INVALIDATE_PERMISSION_CACHE); - if (status.getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - it.remove(); - } - } - if (dataNodesToInvalid.isEmpty()) { + // Proceed once every unreachable DataNode is provably self-fenced (it fails closed on + // auth + // and resyncs on recovery), instead of silently dropping a DataNode after a timeout, + // which + // left a live but un-acked DataNode serving the just-revoked permission. Fail only when a + // live DataNode stays un-acked. (dataNodesToInvalid is retained for serialization + // compatibility but no longer drives this step.) 
+ if (new ClusterCachePropagator(env.getConfigManager()) + .propagate(targets -> invalidatePermissionCacheOnce(env, targets, req))) { LOGGER.info(ProcedureMessages.AUTH_PROCEDURE_CLEAN_DATANODE_CACHE_SUCCESSFULLY); return Flow.NO_MORE_STATE; - } else { - setNextState(AuthOperationProcedureState.DATANODE_AUTHCACHE_INVALIDING); } + setFailure( + new ProcedureException( + String.format( + ProcedureMessages.FAIL_TO_EXECUTE_PLAN_AT_STATE, plan.toString(), state))); break; } } catch (Exception e) { @@ -171,6 +166,44 @@ private void writePlan(ConfigNodeProcedureEnv env) { } } + /** + * One broadcast round of the permission-cache invalidation over {@code targets}: synchronously + * invalidate each reachable DataNode and report its status. Unknown DataNodes are not contacted + * (a sync send would block on connect timeouts) and are reported as not-acked, so the {@link + * ClusterCachePropagator} can decide whether they are provably fenced. + */ + private Map invalidatePermissionCacheOnce( + final ConfigNodeProcedureEnv env, + final Map targets, + final TInvalidatePermissionCacheReq req) { + final Map result = new HashMap<>(); + for (final Map.Entry entry : targets.entrySet()) { + final int dataNodeId = entry.getKey(); + final TSStatus notAcked = new TSStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR.getStatusCode()); + if (env.getConfigManager().getLoadManager().getNodeStatus(dataNodeId) == NodeStatus.Unknown) { + result.put(dataNodeId, notAcked); + continue; + } + try { + result.put( + dataNodeId, + (TSStatus) + SyncDataNodeClientPool.getInstance() + .sendSyncRequestToDataNodeWithRetry( + entry.getValue().getInternalEndPoint(), + req, + CnToDnSyncRequestType.INVALIDATE_PERMISSION_CACHE)); + } catch (final Exception e) { + LOGGER.warn( + "Invalidate permission cache failed for DataNode {}", + entry.getValue().getInternalEndPoint(), + e); + result.put(dataNodeId, notAcked); + } + } + return result; + } + @Override protected boolean isRollbackSupported(AuthOperationProcedureState 
state) { return state == AuthOperationProcedureState.INIT; From cc0cbcc1432c804ac4939a3c536279e4994f5d57 Mon Sep 17 00:00:00 2001 From: JackieTien97 Date: Wed, 3 Jun 2026 15:04:32 +0800 Subject: [PATCH 17/17] Remove internal design notes from version control These were accidentally committed by a 'git add -A'; they are informal design notes, not part of the change, and carry no Apache license header. --- cluster-metadata-ha-fencing-design.md | 308 ----------------------- table-model-ddl-ha-analysis.md | 342 -------------------------- 2 files changed, 650 deletions(-) delete mode 100644 cluster-metadata-ha-fencing-design.md delete mode 100644 table-model-ddl-ha-analysis.md diff --git a/cluster-metadata-ha-fencing-design.md b/cluster-metadata-ha-fencing-design.md deleted file mode 100644 index eab7d82969b6d..0000000000000 --- a/cluster-metadata-ha-fencing-design.md +++ /dev/null @@ -1,308 +0,0 @@ -# ConfigNode→DataNode 元数据广播的高可用改造:租约 + Fencing 统一方案 - -> 状态:待 review -> 关联文档:`table-model-ddl-ha-analysis.md`(表模型 DDL 的根因分析,本文是其推广到全集群的通用方案) -> 适用版本:2.0.x(master) -> 修订记录: -> - v2 — 移除 epoch 代次与 CN 侧 laggards,简化为"两支柱(Lease + Verdict)"。 -> - v3 — 移除 `tier_b_fail_closed` / `enable_metadata_lease_fencing` 配置;放行判定改用 `hbAge` 信号。 -> - v4 — `T_fence` 默认 20s(与 `failureDetectorFixedThresholdInMs` 对齐);Tier-A 内拆"加性即时放行 / 破坏类等待"。 -> - **v5(本版,依据第二轮 code-grounded review)**: -> 1. **修正 hbAge 安全证明**:现有样本记录的是心跳**发送时刻**(echoed `heartbeatTimestamp`),且 `onError` 用**当前时刻**写 Unknown 样本——都不能用于判 fence。改为 CN 侧新增**专用"最近成功收到心跳响应时刻"**(收到响应时用 CN 本地时钟打点,仅成功更新、绝不被失败样本前移),并显式声明有界延迟 + 心跳连接双向对称假设(§2.3/§2.6)。 -> 2. **取消"加性即时跳过"**(评审点 2/4):它会制造"未 fenced 的 laggard"(Running 但漏广播、又被跳过 → 新 schema 长期不可见)。改为**所有 Tier-A 统一"未 ack 必须 ack 或 已证实 fenced"**。代价:任一 DN 不可达时所有 Tier-A 都等 `T_proceed`(放弃 v4 的加性快路径)。 -> 3. **Tier-B 按 加资源 vs 撤资源/降级/控制 再分**(评审点 3):DROP UDF/Trigger/Plugin、SET SYSTEM STATUS ReadOnly 这类按 Tier-A 强一致处理(陈旧 DN 会继续跑旧资源 / 继续写入,不是良性漂移)。 -> 4. **能力位先于一切判定**(评审点 4):不支持 fencing 的旧 DN 一律 UNSAFE。 -> 5. 
**FENCED-SAFE 条件收紧**(评审点 5):`Removing` 不等于已 fence;须"已移出路由、不再接受 client"或"DN 显式 ack fence/shutdown"。 - -## 0. TL;DR - -- 范围:**整个 CN→所有 DN 的元数据广播**,穷尽排查出**约 30 个操作**(四种失败语义 HARD-FAIL / RETRY-THEN-FAIL / SOFT / TIMEOUT-ABANDON)。 -- 共性:CN 提交元数据后广播"失效/更新"给**所有**已注册 DN;DN 上一批**被 CN 推送维护的本地缓存**被读写关键路径**直接信任**——"活着但分区"的陈旧 DN 会产生脏数据 / 错误结果 / 安全漏洞。 -- 现状缺陷:把"DN 真宕机"(缓存随进程消失,安全)与"DN 活着但分区"(缓存仍在,危险)混为一谈而一并失败(牺牲可用性);同时对 SOFT/TIMEOUT 类又静默放过(留下长期不一致与权限漏洞)。 -- 统一方案两根支柱: - 1. **Lease/Fence(DN 侧)**:DN 的 CN-推送缓存只在"持有有效心跳租约"期间可信;`T_fence` 内收不到 CN 心跳即自我隔离(作废缓存 + Tier-A fail-closed),恢复时 **DN 自驱 resync** 后再解除。 - 2. **Verdict(CN 侧)**:把 ~20 处散落的"任一失败即 setFailure"收敛到一个统一判定器:未 ack 的 DN **要么 ack,要么已证实 fenced(或已移出路由)**,否则 WAIT/FAIL;据此输出 PROCEED / WAIT / FAIL。 -- **不引入 epoch/laggards**(你已确认的取舍):靠"不跳过未 ack 的 Running DN"+ 修正后的"成功响应接收时刻"信号 + 有界延迟/连接对称假设来保证正确性(§2.3)。代价:任一 DN 不可达时所有 Tier-A DDL 等 `T_proceed≈25s`(全员存活恒零等待)。 - ---- - -## 1. 问题范围:到底有哪些操作"这样" - -排查口径:CN 侧获取 `getRegisteredDataNodeLocations()`(所有已注册 DN,**不按存活过滤**)→ 广播失效/更新 RPC(异步 `CnToDnInternalServiceAsyncRequestManager.sendAsyncRequestWithRetry`,或同步 `ConfigNodeProcedureEnv.invalidateCache` / `SyncDataNodeClientPool`)→ 根据响应决定成败。 - -> 异步重试上限 `MAX_RETRY_NUM = 6`(`AsyncRequestManager.java:56`);DN 只有回 SUCCESS 才从重试集合移除(`AsyncRequestContext` Javadoc L51),不可达 DN 由 `DataNodeTSStatusRPCHandler.onError`(L73-90)写入一个 `EXECUTE_STATEMENT_ERROR` 状态,故调用方 squash 后必见失败 → 触发各自的失败处理。 - -### 1.1 全量操作清单(按失败语义分组) - -**A. 
HARD-FAIL — 任一 DN 不可达即整体失败 + 回滚(正确性敏感,本提案主目标)** - -| 操作 | 类 / 入口 | 广播 RPC | 目标 DN 缓存 | -|---|---|---|---| -| 表模型 DDL(CREATE/ADD/DROP/SET/RENAME/DROP TABLE 等) | `procedure/impl/schema/table/*`(见上一篇) | `UPDATE_TABLE` / `INVALIDATE_TABLE_CACHE` / `INVALIDATE_COLUMN_CACHE` | `DataNodeTableCache` / `TableDeviceSchemaCache` | -| DELETE TIMESERIES | `DeleteTimeSeriesProcedure`(`CLEAN_DATANODE_SCHEMA_CACHE` L126-128,helper L194-221) | `INVALIDATE_MATCHED_SCHEMA_CACHE` | 树 schema cache | -| DELETE DATABASE | `DeleteDatabaseProcedure`(`INVALIDATE_CACHE` L100-109)→ `ConfigNodeProcedureEnv.invalidateCache` L164-221(**同步串行**,Unknown 重试 10×500ms 后 false) | `INVALIDATE_PARTITION_CACHE` + `INVALIDATE_SCHEMA_CACHE` | 分区 cache + 树/表 schema cache | -| DEACTIVATE TEMPLATE | `DeactivateTemplateProcedure`(L183-206) | `INVALIDATE_MATCHED_SCHEMA_CACHE` | 树 schema cache | -| UNSET TEMPLATE | `UnsetTemplateProcedure`(`executeInvalidateCache` L157-181) | `UPDATE_TEMPLATE`(`INVALIDATE_TEMPLATE_SET_INFO`) | `ClusterTemplateManager` | -| SET TEMPLATE(pre/commit/rollback 三处) | `SetTemplateProcedure`(L223-242 / L408-433 / L510-538) | `UPDATE_TEMPLATE`(`ADD_TEMPLATE_PRE_SET_INFO`/`COMMIT_TEMPLATE_SET_INFO`/`INVALIDATE_TEMPLATE_SET_INFO`) | `ClusterTemplateManager` | -| SET / UNSET TTL | `SetTTLProcedure`(`UPDATE_DATANODE_CACHE` L90-129;rollback L222-250) | `SET_TTL` | `DataNodeTTLCache` | -| ALTER LOGICAL VIEW | `AlterLogicalViewProcedure`(两次 `invalidateCache` L100-103、L193;helper L125-146) | `INVALIDATE_MATCHED_SCHEMA_CACHE` | 树 schema/view cache | -| DELETE LOGICAL VIEW | `DeleteLogicalViewProcedure`(L169-189) | `INVALIDATE_MATCHED_SCHEMA_CACHE` | 树 schema/view cache | -| ALTER ENCODING/COMPRESSOR | `AlterEncodingCompressorProcedure`(`CLEAR_CACHE` L134-138,复用 DeleteTimeSeries helper) | `INVALIDATE_MATCHED_SCHEMA_CACHE` | 树 schema cache | -| ALTER TIMESERIES DATATYPE | `AlterTimeSeriesDataTypeProcedure`(`CLEAR_CACHE` L119-133,helper L246-273;**不支持回滚**) | `INVALIDATE_MATCHED_SCHEMA_CACHE` | 树 
schema cache | -| DELETE DEVICES(表模型) | `DeleteDevicesProcedure`(L224-245) | `INVALIDATE_MATCHED_TABLE_DEVICE_CACHE` | `TableDeviceSchemaCache` | -| CREATE/DROP FUNCTION(UDF) | `UDFManager.createFunction/dropFunction`(**非 procedure**,L135-138 / L180-183 squash 即返回错误) | `CREATE_FUNCTION` / `DROP_FUNCTION` | `UDFManagementService` | -| CREATE/DROP/ACTIVE TRIGGER | `CreateTriggerProcedure` / `DropTriggerProcedure`(env L553-595) | `*_TRIGGER_INSTANCE` | `TriggerManagementService` | -| CREATE/DROP PIPE PLUGIN | `CreatePipePluginProcedure`(L206-215)/ `DropPipePluginProcedure`(L173-180) | `CREATE_PIPE_PLUGIN` / `DROP_PIPE_PLUGIN` | `PipePluginAgent` | -| SET SPACE / THROTTLE QUOTA | `ClusterQuotaManager`(L88-96 / L196-203,**非 procedure**,squash 即返回) | `SET_SPACE_QUOTA` / `SET_THROTTLE_QUOTA` | DN 配额执行缓存 | -| SET SYSTEM STATUS / LOAD CONFIG / SET CONFIG / START-STOP REPAIR | `NodeManager`(L1044-1196,**非 procedure**) | `SET_SYSTEM_STATUS` / `LOAD_CONFIGURATION` / `SET_CONFIGURATION` / `*_REPAIR_DATA` | DN 运行态/配置 | - -**B. RETRY-THEN-FAIL — 重试 1 次后失败(已部分容忍,但仍会失败)** - -| 操作 | 类 | 广播 RPC | -|---|---|---| -| CREATE/DROP/ALTER TOPIC、CREATE/DROP CONSUMER GROUP | `AbstractOperateSubscriptionProcedure` 子类(RETRY_THRESHOLD=1) | `TOPIC_PUSH_*` / `CONSUMER_GROUP_PUSH_*` | - -**C. SOFT — 仅告警,靠后台 reconcile(已可用,但一致性靠周期任务兜底)** - -| 操作 | 类 | 说明 | -|---|---|---| -| CREATE/START/STOP/DROP/ALTER PIPE | `AbstractOperatePipeProcedureV2` 子类 | 失败 warn,靠 `PipeMetaSyncProcedure` 心跳周期 reconcile | -| `extendSchemaTemplate` | `ClusterSchemaManager` L1103-1190(**非 procedure**) | 返回错误状态但不回滚,CN 与 DN 可漂移 | - -**D. 
TIMEOUT-ABANDON — 超时后静默丢弃(隐患最大)** - -| 操作 | 类 | 说明 | -|---|---|---| -| GRANT/REVOKE/DROP USER·ROLE/ALTER USER 等权限变更 | `AuthOperationProcedure`(L98-127) | 每 DN 单独同步重试,超 `datanodeTokenTimeoutMS`(默认 **180s**)即**静默移除**该 DN。该 DN 权限缓存**长期陈旧**直到重启或下次相关 auth 操作——**安全正确性漏洞** | - -> 排除项(确认不属于本问题):`CreateCQProcedure`(无 DN 广播,仅写 CN consensus + 本地调度);区域级 RPC(`CREATE_*_REGION`/`DELETE_REGION`/`CHANGE_REGION_LEADER` 等,只发副本所在 DN,走数据面 quorum,不在本提案范围)。 - -### 1.2 DN 上"被 CN 推送维护"的缓存全集(即需要被租约/fencing 覆盖的对象) - -| 缓存 / 注册表 | 类 | 由哪些 CN RPC 改写 | 失效后回源方式 | 类型 | -|---|---|---|---|---| -| 表 schema | `DataNodeTableCache` | `UPDATE_TABLE`/`INVALIDATE_TABLE_CACHE`/`INVALIDATE_SCHEMA_CACHE` | 回 CN 拉 + 启动 `init(tableInfo)`(`DataNode.java:523`) | 注册表型 | -| 表设备属性/last | `TableDeviceSchemaCache` | `INVALIDATE_TABLE_CACHE`/`INVALIDATE_COLUMN_CACHE`/`INVALIDATE_MATCHED_TABLE_DEVICE_CACHE`/`INVALIDATE_LAST_CACHE` | 懒加载,回 schema region | 懒加载型 | -| 树 schema/last/view | `TreeDeviceSchemaCacheManager` | `INVALIDATE_SCHEMA_CACHE`/`INVALIDATE_MATCHED_SCHEMA_CACHE`/`INVALIDATE_LAST_CACHE` | 懒加载,回 schema region | 懒加载型 | -| 权限 | `AuthorityChecker`/`AuthorityFetcher` | `INVALIDATE_PERMISSION_CACHE`(同步) | 懒加载回 CN;心跳触发 `refreshToken()`(`DataNodeInternalRPCServiceImpl:2273`) | 懒加载型 | -| 模板 | `ClusterTemplateManager` | `UPDATE_TEMPLATE`/`INVALIDATE_SCHEMA_CACHE` | 启动 `updateTemplateSetInfo`(`DataNode.java:497`) | 注册表型 | -| TTL | `DataNodeTTLCache` | `SET_TTL` | 启动 `initTTLInformation`(`DataNode.java:516`) | 注册表型 | -| 分区路由 | `ClusterPartitionFetcher` | `INVALIDATE_PARTITION_CACHE`/`UPDATE_REGION_ROUTE_MAP` | 懒加载回 CN | 懒加载型 | -| UDF | `UDFManagementService` | `CREATE_FUNCTION`/`DROP_FUNCTION` | 启动 `prepareUDFResources`(`DataNode.java:784`,下载 jar) | 注册表型 | -| Trigger | `TriggerManagementService` | `*_TRIGGER_INSTANCE` | 启动 `prepareTriggerResources` | 注册表型 | -| Pipe Plugin | `PipePluginAgent` | `CREATE/DROP_PIPE_PLUGIN` | 启动 `preparePipeResources` | 注册表型 | -| 配额 | DN 配额执行缓存 | `SET_SPACE/THROTTLE_QUOTA` | 无主动回源(只能等下次推送) 
| 注册表型(需补回源) | - -> "懒加载型 vs 注册表型"决定了 fence 恢复时的 resync 方式(§2.5):懒加载型只需作废、后续按需回源即可;注册表型需在恢复服务前主动重拉。 - -**关键事实(决定方案可行性,已据 review 核对代码更正)**: -- 心跳为 **CN→DN**,仅 Raft leader 发送(`HeartbeatService.java:128`),默认 **1s** 一次(`heartbeatIntervalInMs=1000`)。 -- DN 失联判定:默认 `FixedDetector`,阈值 **20s**(`failureDetectorFixedThresholdInMs=20000`)→ `NodeStatus.Unknown`;可选 Phi-Accrual。`NodeStatus` 仅 `Running/Unknown/Removing/ReadOnly`。 -- **⚠ 现有心跳样本时间戳不能直接用于判 fence**:`HeartbeatService.genHeartbeatReq` 写 `heartbeatTimestamp = System.nanoTime()`(**发送时刻**),DN 原样回填,`NodeHeartbeatSample(resp)` 记录的就是这个**发送时刻**(`NodeHeartbeatSample.java:55`);并且 `DataNodeHeartbeatHandler.onError` 在连接断开时用 **当前时刻** 写一个 `Unknown` 样本(`super(System.nanoTime())`)。→ 直接复用会导致:延迟心跳使 CN 以为 hbAge 已超而 DN 刚续约;失败样本不断把时间戳前移使 hbAge 反而不增长。**故本方案 CN 侧新增专用"最近成功收到心跳响应时刻",见 §2.3/§2.6。** -- 心跳**当前不带任何"元数据版本"字段**;DN 重启 resync 是**纯 pull**(`storeRuntimeConfigurations` L486-530)。 -- 真宕机 DN 重启必走 resync 重建缓存,**不可能用旧缓存服务**;缺的唯一一环就是"**活着但分区**的 DN 没有任何机制主动停用旧缓存"。 - ---- - -## 2. 
统一机制设计:Lease + Fence(DN 侧)+ Verdict(CN 侧) - -### 2.1 一句话模型 - -> **DN 的"CN-推送缓存"只有在它"持有有效心跳租约"时才可用于服务;租约失效即 fence(作废 + Tier-A fail-closed),fence 恢复时由 DN 自己先 resync 再解除。CN 在确信某未 ack 的 DN"已被租约隔离 / 已移出路由"后才跳过它继续提交,否则等待或失败。** - -| 支柱 | 解决的失效场景 | 机制 | -|---|---|---| -| **Lease/Fence(DN 侧)** | DN **分区或宕机**(收不到心跳) | DN 本地计时(DN 自己的 receive-time):`now - lastCnHeartbeat > T_fence` → fence;恢复时 DN 自驱 resync | -| **Verdict(CN 侧)** | CN 决定能否跳过未 ack 的 DN | 据 CN 侧"最近成功收到响应时刻"算 `hbAge`,结合能力位/路由状态判定 → PROCEED / WAIT / FAIL | - -### 2.2 正确性分层(Tier) - -- **Tier-A(脏数据 / 错误结果 / 安全)**:表 schema、树 schema、设备属性/last、模板、view、datatype/encoding、TTL、**权限**、分区路由。Fence = **作废 + fail-closed**:隔离期间相关读写/鉴权回 CN 现拉,CN 不可达则**拒绝**(宁可不可用,绝不写脏 / 不返脏 / 不放行越权)。CN 侧未 ack 的 DN 须"ack 或 已证实 fenced"才放行。 -- **Tier-B(功能/一致性,非静默数据损坏)**,再按方向细分(评审点 3): - - **B-加资源(soft)**:CREATE FUNCTION/TRIGGER/PIPE PLUGIN、CREATE TOPIC/CONSUMER、配额上调等。漏掉 → 依赖该资源的操作自然失败(如 DN 没有该 UDF → 该查询失败),**不 fail-closed**,靠恢复 resync 收敛。 - - **B-撤资源/降级/控制(按 Tier-A 强一致)**:**DROP** FUNCTION/TRIGGER/PIPE PLUGIN(陈旧 DN 会**继续执行**已删资源)、**SET SYSTEM STATUS = ReadOnly**(陈旧 DN 会**继续接受写入**)、disable/控制类。这些不是良性漂移,须与 Tier-A 同等:"未 ack 必须 ack 或 已证实 fenced",必要时 fail-closed(fence 期间该 DN 本就拒服务)。 - -### 2.3 未 ack DN 的处理:为什么"统一等待/失败"且不引入 epoch/laggards - -回应评审点 1/2/4。纯靠 hbAge 时序"推断 DN 已 fence"不够稳健,且"跳过未 ack DN"会制造**未 fenced 的 laggard**(如 DN 漏掉 CREATE TABLE/ADD COLUMN 广播但心跳仍 Running——它不会自我 fence,而 `DataNodeTableCache.getTable` 仅在 pre-update map 命中时才回 CN 拉取,否则直接返回/抛不存在 `DataNodeTableCache.java:329`,导致新 schema 长期不可见)。本版采用更保守、更简单的规则: - -1. **取消"加性即时跳过"**:所有 Tier-A 统一走"未 ack 的 DN 必须 ack 或 已证实 fenced(或已移出路由)"。 - - 未 ack 且 **心跳仍新鲜(仍可能在服务)** → UNSAFE → **WAIT/FAIL**,绝不跳过 → 不会留下陈旧 laggard。 - - 未 ack 且 **不可达**(收不到心跳)→ `T_fence` 后自我 fence;CN 据"已证实 fenced"放行;该 DN 恢复时由本地 fence 标志驱动 resync 自愈。 - - 由此 ADD COLUMN 不再需要单独证明"旧 schema 下不带新列的写入语义安全"(也无法对"新增 TAG 列改变设备身份"等情形一概证明)——它和其他 Tier-A 一样必须 ack-或-fenced。 -2. 
**修正 CN 侧"联系"信号**:新增**专用"最近成功收到心跳响应时刻" `lastResp(dn)`**——CN 在**收到成功响应时**用 **CN 本地时钟**打点,**只在成功时更新,绝不被 `onError`/Unknown 样本前移**(与现有 load-cache 样本分离,不复用 §1.2 所述的发送时刻样本)。由因果关系 `dn_renew ≤ lastResp`(DN 先收到请求并续约 → 回响应 → CN 收到),故 `lastResp` 是 DN 续约时刻的**可靠上界**。 -3. **不引入 epoch/laggards**:既不跳过未 ack 的活跃 DN,就不存在"Running 但陈旧被放行"需要 epoch 兜底;不可达 DN 由 fence + 恢复 resync 自愈,CN 无需记 laggards。**代价**:放弃 v4 的加性快路径——任一 DN 不可达时所有 Tier-A 都等 `T_proceed`(§2.6)。 - -**显式前提假设(必须写明)**:上述时序论证依赖 -- **(a) 有界延迟**:在途心跳/RPC 在 Δ 内送达或被丢弃; -- **(b) 心跳连接双向对称**:心跳是同一连接上的请求/响应,"DN 收到请求并续约" ⟺ "CN 收到响应"(modulo Δ)。于是"DN 在续约" ⟺ "CN 在持续收到响应" ⟺ "`hbAge` 小" ⟺ "不判 FENCED-SAFE"。 -- 在 (a)(b) 下,"单向可达 + 选择性丢失失效广播"(CN→DN 投递心跳却丢弃失效、DN→CN 响应丢失)这一可能让"DN 续约却漏失效、而 CN 误判 fenced"的危险态不会发生。**若要去除该假设**,需引入 epoch/token 正向确认 currency(本版按选择不做,作为未来增强备选)。 - -### 2.4 CN 侧统一判定器(取代 ~20 处散落的 setFailure) - -``` -enum Verdict { PROCEED, WAIT, FAIL } - -// lastResp(dn): CN 本地时钟记录的"最近一次成功收到该 DN 心跳响应"时刻; -// 仅成功响应更新,绝不被 onError/Unknown 样本前移(§2.3,评审点 1)。 -// hbAge(dn) = now - lastResp(dn)。 -// 统一规则:没有"加性跳过";能力位先判(评审点 4);FENCED-SAFE 条件收紧(评审点 5)。 -Verdict propagate(payload, opts): - resp = sendAsyncRequestWithRetry(allRegisteredDNs, rpc) - for dn not ACKED: - if isRetiredFromRouting(dn) or ackedFenceOrShutdown(dn): - -> SAFE_GONE // 已移出路由/不再接受 client,或显式 ack fence/shutdown(评审点 5) - elif not supportsFencing(dn): -> UNSAFE // 旧 DN 不会自我 fence → 回退严格语义(能力位先判,评审点 4) - elif hbAge(dn) >= T_proceed: -> FENCED_SAFE // 已证实自我 fence(T_proceed = T_fence + margin) - else: -> UNSAFE // 心跳仍新鲜(仍可能服务)或瞬时错误 → 再等 - if all ACKED / SAFE_GONE / FENCED_SAFE: return PROCEED - if waited > maxWait: return FAIL // 半坏 DN:心跳新鲜却持续失败 → 失败(保守正确) - return WAIT // 循环本状态;hbAge 随时间增长,到阈值转 FENCED_SAFE -``` - -要点: -- **能力位先判**:不支持 fencing 的 DN 永远不会被判 FENCED-SAFE(评审点 4,解决 v4 自相矛盾)。 -- **FENCED-SAFE 仅来自**:`hbAge ≥ T_proceed`(且支持 fencing)**或** 已移出路由/显式 ack——`Removing` 本身不算(评审点 5,`Removing` 仍可能服务 client)。 -- **不再有"加性跳过"**:未 ack 的活跃 DN 一律 UNSAFE → WAIT/FAIL(评审点 2)。 -- **PROCEED**:所有未 ack 
的 DN 都已 ack / SAFE_GONE / FENCED_SAFE 才提交。 -- **WAIT**:存在 UNSAFE(hbAge 未到 `T_proceed`,可能仍在服务)。循环等待至其 ack / 变 FENCED_SAFE / 超 `maxWait`。 -- **FAIL**:超 `maxWait` 仍有 UNSAFE(典型:心跳通但广播 RPC 持续失败的半坏 DN)→ 失败,维持现状语义。 - -各 procedure 把"任一失败即 setFailure"替换为"调用 `propagate(...)` 并按 Verdict 驱动状态机";`DeleteDatabaseProcedure`(同步串行)与 `AuthOperationProcedure`(180s 静默丢弃)统一切到这套——**修复权限 D 类漏洞**:不再静默丢弃,未 ack 的活跃 DN 会 WAIT/FAIL,不可达 DN fence 后权限缓存作废、恢复 resync。 - -### 2.5 DN 侧改造 - -1. **记录租约(DN 本地,sound)**:`getDataNodeHeartBeat`(`DataNodeInternalRPCServiceImpl.java:2226`)记录 `lastCnHeartbeatNanos = System.nanoTime()`(DN 自己的 receive-time,与 CN 侧信号无关)。DN 用它判自己是否该 fence——这一侧从来不是问题所在;评审点 1 针对的是 CN 侧的推断信号。 -2. **Fence 触发**:采用**惰性检查**(读写/鉴权入口处 `now - lastCnHeartbeatNanos > T_fence` 即视为 fenced),无需后台线程、无并发作废与读者竞争。(已实现:`MetadataLeaseManager.isFenced()`。) -3. **Tier-A fail-closed 注入点**(fenced 期间,**Phase 1 已全部落地、各带 TDD**): - - **表 schema**:`DataNodeTableCache.getTableInWrite/getTable` 抛 `INTERNAL_REQUEST_RETRY_ERROR`(推送型缓存、无回源,只能硬失败)。`TableHeaderSchemaValidator` 的所有读都经此路径,故**自动覆盖**,无需单独注入。 - - **树 schema**:`TreeDeviceSchemaCacheManager` 六个读方法统一经 `getDeviceSchemaOrMissWhenFenced(...)`,fenced 时报 **cache miss → 回源到 quorum 支撑的 SchemaRegion**(读穿透型缓存,回源即权威;比硬失败更可用:SchemaRegion 多数派可达即成功,不可达才 fail-closed)。恢复时 `cleanUp` 作废分区期间未复读的旧条目。 - - **权限**:`ClusterAuthorityFetcher.checkCacheAvailable()` 在 `isFenced()` 时丢弃权限缓存并回源 CN(分区时回源失败→拒绝)。补上了原 `refreshToken()` 超时机制的盲区(它仅在"心跳恢复"那刻才标记失效,分区进行中不触发)。 - - **TTL**:`MultiTsFileDeviceIterator.nextDevice()` fenced 时用无穷大 TTL(compaction 不按 TTL 删除),**仅压制 compaction 删除路径**(查询/写入 TTL 行为不变),防止陈旧 TTL 造成不可逆数据删除。 - - **分区缓存**:未覆盖(低风险,已有分区不变更、miss 即回源 CN,分区时自然 fail-closed);按需再评估。 -4. **恢复时 DN 自驱 resync(事件驱动)**:心跳恢复(fenced→active 的那次心跳)触发已注册的 recovery listener(已实现:`MetadataLeaseManager.addLeaseRecoveryListener`,`DataNodeTableCache` 注册 `invalidateAll`)。懒加载型缓存作废后按需回源;注册表型主动重拉(复用启动 resync 路径)。listener 同步执行:解除 fence 前缓存已作废,无窗口。 -5. 
**重启**:已全量 resync(`storeRuntimeConfigurations`),开机即一次完整追平。 - -### 2.6 时序正确性(lease ordering) - -记 `T_hb`=心跳间隔(默认 1s)、`T_fence`=DN 自我隔离阈值、`T_proceed = T_fence + margin`=CN 判 FENCED-SAFE 所需 `hbAge`。 - -**安全不变式**:CN 提交一次 Tier-A 变更(尤其**不可逆物理删除**)时,对每个未 ack 的 DN,要么它已 ack,要么它**已证实 fenced 或已移出路由**(不再用旧缓存服务)。 - -**证明(用修正后的信号)**:`lastResp(dn)` 是 CN 收到成功响应的本地时刻,由因果关系 `dn_renew ≤ lastResp`(DN 续约在前、CN 收响应在后)。DN 在 `dn_renew + T_fence` 自我 fence。故当 `hbAge = now - lastResp ≥ T_fence + margin` 时,`now ≥ lastResp + T_fence ≥ dn_renew + T_fence`,DN 必已 fence。取 `T_proceed = T_fence + margin` 即满足。 -- 这条之所以成立,关键是用**响应接收时刻**(≥ DN 续约)而非 §1.2 的**发送时刻**(< DN 续约,会让 CN 过早判 fenced);且 `lastResp` **绝不被失败样本前移**(否则 hbAge 永不增长)。 -- **延迟心跳**:被延迟的心跳即便让 DN 晚续约,CN 也只会更晚收到其响应、`lastResp` 更晚 → 不会过早判 FENCED-SAFE。✓ -- **leader 切换**:新 leader 无历史 `lastResp`,须把每个 DN 的 `lastResp` 初始化为**取得 leadership 的时刻**,从而至少等 `T_proceed` 才可能判 FENCED-SAFE(覆盖旧 leader 残留在途心跳对 DN 的续约)。✓ -- **残余风险**:见 §2.3 的 (a)(b) 假设;"单向可达+选择性丢失"在该假设下被排除。 - -`margin` 覆盖:①DN fence 检查粒度(惰性检查下≈0,定时检查则 1 周期)②GC/调度抖动 ③`lastResp` 至多 1 个心跳的认知粒度。默认 **≈5s**(`max(5000, 2×T_hb + fence检查间隔)`)。 - -**为什么 `T_fence` 取 20s**:`T_fence` 决定"健康 DN 多久没收到心跳就自我 fence(fail-closed)"。过小(如 5s)会让健康 DN 偶发 GC/抖动即被误 fence。取 **20s(与 `failureDetectorFixedThresholdInMs` 对齐)** 使"DN 自我 fence 时刻"≈"集群本来就判它 dead 时刻",不新增误判区间。 - -**代价(本版明确接受)**:取消加性快路径后,**任一 DN 不可达时,所有 Tier-A DDL 都要等 `T_proceed≈25s`**(全员存活时恒零等待)。这是为"无 epoch、规则统一、不留 laggard"付出的可用性代价。 - -### 2.7 不可逆删除的特殊编排 - -涉及物理删除数据/属性的操作(表 DROP COLUMN 的 `EXECUTE_ON_REGIONS`、DELETE TIMESERIES、DELETE DATABASE、DELETE DEVICES):删除步骤必须排在"判定器 PROCEED(所有未 ack DN 均 ack / SAFE_GONE / FENCED_SAFE)"**之后**。隔离 DN 在恢复 resync 前处于 fence、不接受相关写入,故删除后不会写出"幽灵列/幽灵设备"。 - -> 数据面可用性(region 多数派)与本提案正交:物理删除经 region consensus,本就需 quorum;本提案只解决"缓存失效广播"这层的可用性。 - ---- - -## 3. 
各类操作的落地处理 - -| 类别 | 现状 | 方案后(v5) | -|---|---|---| -| 全部 Tier-A 表/树 schema DDL(CREATE/ADD/DROP/RENAME/SET)、view、datatype/encoding、模板、DELETE TS/DB/DEVICES、TTL | HARD-FAIL | 统一判定器:未 ack 的 DN 须 ack / SAFE_GONE / FENCED_SAFE 才 PROCEED,否则 WAIT/FAIL;不可逆删除编排见 §2.7;隔离 DN 恢复自驱 resync。**无加性快路径**——CREATE/ADD COLUMN 同样等待(避免未 fenced laggard) | -| 权限(grant/revoke/...) | **TIMEOUT-ABANDON(静默漏洞)** | 切判定器:不再 180s 静默丢弃;未 ack 活跃 DN → WAIT/FAIL,不可达 DN fence+恢复 resync;fence 期间鉴权 fail-closed(撤权立即生效)。**修复安全漏洞** | -| **B-加资源**:CREATE FUNCTION/TRIGGER/PIPE PLUGIN、CREATE TOPIC/CONSUMER、配额上调 | HARD-FAIL / RETRY-FAIL | Tier-B-soft:判定器可 PROCEED(缺资源者自然失败);恢复 resync 重拉 | -| **B-撤资源/降级/控制**:DROP FUNCTION/TRIGGER/PIPE PLUGIN、SET SYSTEM STATUS=ReadOnly、disable/控制 | HARD-FAIL | **按 Tier-A 强一致**:未 ack 须 ack 或 已证实 fenced(陈旧 DN 否则会继续跑已删资源 / 继续写入)。SET ReadOnly 漏达的 DN 必须被等到 ack 或 fence,不可静默放行 | -| Pipe task | SOFT(已 reconcile) | 维持现有周期 reconcile(或并入 fence-恢复 resync) | -| LOAD/SET CONFIGURATION | HARD-FAIL | 视具体项:影响正确性/安全的按强一致;纯性能项可 soft | - ---- - -## 4. 
接口与配置改动清单 - -**Thrift(`iotdb-protocol/thrift-datanode/.../datanode.thrift`)** -- **不需要 epoch 字段。** 仅 `TDataNodeHeartbeatResp` 增 `optional bool supportsMetadataLeaseFencing`(或按 DN 版本号推断),供 CN 滚动升级期判断能否对该 DN 判 FENCED-SAFE(§5)。 -- "最近成功收到响应时刻"是 **CN 本地量**,无需协议字段。 - -**ConfigNode** -- `HeartbeatService` / 心跳成功回调中新增并维护每 DN 的 `lastSuccessfulHeartbeatResponseNanos`(CN 本地时钟,**仅成功响应更新**;与 load-cache 的 `NodeHeartbeatSample` 分离,不被 `onError`/Unknown 前移);leader 切换时初始化为取得 leadership 的时刻;记录 `supportsFencing`。 -- 新增 `ClusterCachePropagator`(§2.4 判定器),接入:`SchemaUtils.preRelease/commitRelease/rollback`、`DeleteTimeSeriesProcedure`/`AlterTimeSeriesDataTypeProcedure` 的 invalidateCache、模板/view/TTL procedure、`ConfigNodeProcedureEnv.invalidateCache`(DeleteDatabase)、`AuthOperationProcedure`、`UDFManager`、trigger/pipe-plugin env、`ClusterQuotaManager`、`NodeManager`(status/config 中的强一致项)。 - -**DataNode** -- `getDataNodeHeartBeat`:记录 `lastCnHeartbeatNanos`(已实现);回填 `supportsMetadataLeaseFencing=true`。 -- `MetadataLeaseManager`(已实现 `isFenced`/recovery listener);Tier-A 注入点 fail-closed(表 schema `DataNodeTableCache`、树 schema `TreeDeviceSchemaCacheManager`、权限 `ClusterAuthorityFetcher`、TTL(compaction)均已实现,见 §2.5.3;`TableHeaderSchemaValidator` 经表缓存自动覆盖;分区缓存暂缓)。 -- 注册表型缓存的"恢复重拉"实现。 - -**配置(`CommonConfig`)** -- `metadata_lease_fence_ms`(`T_fence`,**唯一主旋钮**;默认 **20000**,与 `failureDetectorFixedThresholdInMs` 对齐)。已实现。 -- `T_proceed = T_fence + margin`,`margin` 内部派生(默认 ≈5s)。 -- 判定器 WAIT 的最大等待/重试上限。 -- **不设** `tier_b_fail_closed` / `enable_metadata_lease_fencing`。 - ---- - -## 5. 兼容性(升级后自动生效,无开关/无人工灰度) - -- **升级后直接生效,无需开关**。 -- **滚动升级期安全靠自动能力检测**:CN 判 FENCED-SAFE 的前提是"该 DN 会自我 fence"。**旧 DN 不会 fence、也不上报能力位**: - - DN 心跳回填 `supportsMetadataLeaseFencing=true`(或按版本号推断);CN 记录每 DN 最近上报的能力位。 - - 判定器**能力位先判**:不支持的 DN 一律 UNSAFE → 对它回退**现状严格语义**(任一不可达即失败)。已升级 DN 走新路径、未升级 DN 走旧严格路径,逐 DN 自动切换,正确性不破(此版已与 §2.4 顺序统一,消除 v4 矛盾)。 - - 全部升级完毕后所有 DN 走新路径。 -- **回滚**:仅一个自动能力位 + 新增逻辑,回退旧版本即恢复原行为。 - ---- - -## 6. 
风险与权衡 - -- **取消加性快路径的延迟代价**:任一 DN 不可达时,**所有 Tier-A DDL(含 CREATE/ADD COLUMN)都等 `T_proceed≈25s`**;全员存活恒零等待。这是为"无 epoch、规则统一、不留 laggard"接受的代价(你已确认)。若日后需要给 CREATE 这类提速,可再评估 epoch 正向确认方案。 -- **时序假设(§2.3 (a)(b))**:依赖有界延迟 + 心跳连接双向对称;"单向可达 + 选择性丢失失效广播"在该假设外不保证安全。需在文档/运维约束中写明;若环境不满足,则需 epoch/token。 -- **CN 侧信号必须新增且独立**:绝不能复用记录发送时刻、且被 `onError` 前移的现有样本(否则证明不成立)。leader 切换须重置 `lastResp`。 -- **半坏 DN(心跳通、广播 RPC 断)**:保守失败(WAIT 超时 → FAIL)。 -- **少数派分区读写不可用**:被隔离 DN 对 Tier-A fail-closed,牺牲少数侧可用性换正确性(CP)。 -- **权限 fail-closed 可用性**:撤权立即生效 vs CN 短暂不可达不至全拒,需分别配置宽限。 -- **配额无回源**:DN 侧需新增主动拉取,否则恢复 resync 无处落地。 - ---- - -## 7. 分阶段实施计划 - -**Phase 0 — 观测、能力位、CN 侧 lastResp 信号(无行为变更)** -- DN 记录 `lastCnHeartbeatNanos`(已实现)、回填能力位;CN 新增并维护**独立的 `lastSuccessfulHeartbeatResponseNanos`**(仅成功更新、leader 切换重置)与能力位;加监控指标。**不 fence、不放行。** - -**Phase 1 — DN 自我 fencing + Tier-A fail-closed(正确性基石)✅ 已完成** -- `MetadataLeaseManager` + 惰性 fence 检查 + recovery 自驱 resync;表 schema、树 schema、权限、TTL(compaction) 的 fail-closed 注入**均已实现并各带 TDD**(见 §2.5.3)。`TableHeaderSchemaValidator` 经表缓存自动覆盖;分区缓存暂缓。此阶段不改 CN 放行逻辑——即使 CN 仍严格,DN 端已能在分区时自保。 - -**Phase 2 — CN 统一判定器 + Tier-A 放行(兑现可用性)🚧 进行中** -- ✅ `ClusterCachePropagator`(`propagateOnce` 判定 + `propagate` 重试循环;能力位先判、FENCED-SAFE 收紧;8 个单测)。 -- ✅ 生命周期挂钩:`notifyLeaderReady` → `onLeadershipAcquired`(重夺 leadership 重置 `lastResp`)、`removeDataNodePersistence` → `removeDataNode`。 -- ✅ **首个 procedure 接入(模板)**:`CreateTableProcedure` 的 PRE_RELEASE 改走 `ClusterCachePropagator.propagate`(`SchemaUtils.preUpdateTableReq` + `broadcastTableUpdate` 返回全量响应;旧 `preReleaseTable` 退化为只返回失败的薄封装,其余调用方不受影响)。COMMIT_RELEASE 维持 best-effort warn。 -- ✅ **端到端 IT**(`IoTDBTableDDLHAIT`,1C3D):停 1 DN 后 CREATE TABLE 仍成功;新增 IT 框架 `setMetadataLeaseFenceMs`。 -- ⏳ 待办:按模板接入其余 ~19 个 Tier-A procedure(AddTableColumn/DropTableColumn/RenameTable/SetTableProperties、DeleteTimeSeries、AlterTimeSeriesDataType、模板、view、TTL、DeleteDatabase 同步路径);不可逆删除编排。**统一规则、无加性快路径**。 - -**Phase 3 — 权限与 Tier-B 收编** -- `AuthOperationProcedure` 改造(修复 180s 
静默漏洞);Tier-B 按 加资源(soft) / 撤资源·降级·控制(强一致) 分别接入;配额回源补齐。 - -**测试(含 review 要求的专项)** -- 1C3D 停 1 DN:所有 Tier-A 操作在 `T_proceed` 后成功(Phase 2/3 后)。 -- **延迟心跳**:心跳在 `T_proceed` 后才送达 DN(DN 晚续约)→ CN 不得在 DN fence 前判 FENCED-SAFE(验证用的是响应接收时刻)。 -- **leader 切换**:新 leader 不得凭旧 `lastResp` 过早判 FENCED-SAFE;旧 leader 残留心跳续约的 DN 不被误放行。 -- **heartbeat onError 连续刷新**:连接断开持续产生 Unknown 样本时,新增的 `lastResp` **不被前移**,`hbAge` 正常增长。 -- **活着但分区**:`T_fence` 后该 DN Tier-A fail-closed、不产生脏数据/脏读/越权;恢复后自驱 resync。 -- **未 fenced laggard 回归**:DN 漏掉 CREATE/ADD COLUMN 广播但心跳 Running → 判定器 WAIT/FAIL(不跳过),不得出现"新 schema 在该 DN 长期不可见"。 -- **权限**:撤权后分区 DN 拒绝越权。 -- **B-撤资源**:DROP UDF/Trigger/Plugin 后陈旧 DN 不得继续执行旧资源;SET ReadOnly 漏达的 DN 不得继续接受写入。 -- **不可逆删除并发写**:隔离 DN 不写幽灵列/设备。 -- **滚动升级**:半升级态(含旧 DN)回退严格语义,正确性不破。 -- 仅运行新增/改动相关 IT。 - ---- - -## 8. 附:与上一篇及已落地代码的关系 - -`table-model-ddl-ha-analysis.md` 是表模型特例与起点;本文推广为覆盖全部 CN→DN 元数据广播的统一框架。 - -已落地(worktree 分支):DN 侧 `MetadataLeaseManager`(含 recovery listener)、心跳记录 `lastCnHeartbeatNanos`、`metadata_lease_fence_ms` 配置、heartbeat-age 指标,以及 `DataNodeTableCache` 的 fail-closed + 恢复作废(均 TDD)。这些属于 Phase 0/1 的 DN 侧,**不受本次 review 影响**(评审点 1 针对的是尚未实现的 CN 侧判定信号)。CN 侧判定器(Phase 2)按本 v5 实现:统一规则、能力位先判、独立的响应接收时刻信号、FENCED-SAFE 收紧。 diff --git a/table-model-ddl-ha-analysis.md b/table-model-ddl-ha-analysis.md deleted file mode 100644 index 20283a2ced8a7..0000000000000 --- a/table-model-ddl-ha-analysis.md +++ /dev/null @@ -1,342 +0,0 @@ -# 表模型 DDL 操作在任一 DataNode 宕机时失败的根因分析与解决方案 - -> 状态:待 review -> 作者:(草拟) -> 适用版本:2.0.x(master) - -## 0. 
TL;DR - -- **现象**:表模型的 `CREATE TABLE` / `ALTER TABLE`(加列、删列、改属性、改名)/ `DROP TABLE` 等 DDL,在集群中**任意一个 DataNode(DN)不可达**时都会执行失败并回滚。这与"多副本高可用"的预期相悖——按直觉,挂 1 个 DN 不应阻塞元数据变更。 -- **根因(正确性)**:这些 DDL 在 ConfigNode(CN)侧以 Procedure 执行,其中有一步会把"缓存失效 / 预发布"RPC **广播给集群里所有已注册的 DN**,并要求**每一个 DN 都返回 SUCCESS** 才继续;只要有一个 DN 不可达(重试 6 次后仍失败),整个 Procedure 就 `setFailure` 并回滚。 -- **为什么要这么强(不是 bug,是设计)**:DN 上对表模型有几类**本地缓存**(表 schema 缓存 `DataNodeTableCache`、设备属性/last 值缓存 `TableDeviceSchemaCache`)。写入校验和部分查询**直接读本地缓存、不回 CN 核对**。如果某个 DN 持有过期缓存且没被清理,在**网络分区**下它仍可能用旧 schema 接受写入 / 返回旧值,从而产生**与已提交 schema 不一致的脏数据**(类型错乱、幽灵列、删列后又写入等)。因此当前实现选择了"宁可失败也不放过任何一个 DN"。 -- **关键洞察**:真正危险的只有"**DN 活着但与 CN 分区**"这一种情况。**真正宕机的 DN 内存缓存已经没了**,重启时会从 CN 重新拉取全量 schema(`DataNode.java:523` 的 `DataNodeTableCache.init(...)`),不可能用旧缓存服务请求。当前实现把"宕机"和"分区"混为一谈,对"宕机"这种本来安全的情况也一并失败,才造成了可用性损失。 -- **解决方向**:给 DN 的表缓存加一个**与 CN 心跳绑定的"租约/fencing"机制**——DN 一旦在 `T_fence` 内收不到 CN 心跳,就**自行作废表缓存并对依赖缓存的表操作 fail-closed**。这样"不可达"就等价于"安全",CN 侧的 DDL 便可以**跳过已确认隔离/宕机的 DN 继续执行**,从而在挂掉少数 DN 时仍保持 DDL 可用,且不牺牲正确性。 - ---- - -## 1. 问题现象 - -在一个多 DN 集群(例如 1 CN + 3 DN,数据多副本)中,停掉任意 1 个 DN 后执行下列表模型语句,会直接报错失败(而非降级成功): - -- `CREATE TABLE` / `CREATE VIEW` -- `ALTER TABLE ... ADD COLUMN` -- `ALTER TABLE ... DROP COLUMN` -- `ALTER TABLE ... SET PROPERTIES`(如 TTL) -- `ALTER TABLE ... RENAME COLUMN` / `RENAME TABLE` -- `DROP TABLE` -- 对应的 view 变体、`CREATE/DROP DATABASE` 等 - -报错信息形如 `Pre create table failed` / `pre release add table column failed` / `... must clear the related schema cache` 等。 - -> 注:树模型的 `DELETE TIMESERIES`、`DELETE DATABASE` 也有**相同**的"所有 DN 必须可达"约束(见 §2.6),本文聚焦表模型,但方案对树模型同样适用。 - ---- - -## 2. 
根因分析 - -### 2.1 总体执行链路 - -表模型 DDL 的执行入口在 CN 的 Procedure 框架,相关类位于: - -``` -iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/ -├── CreateTableProcedure.java -├── DropTableProcedure.java -├── AddTableColumnProcedure.java -├── DropTableColumnProcedure.java -├── RenameTableColumnProcedure.java -├── RenameTableProcedure.java -├── SetTablePropertiesProcedure.java -├── AlterTableColumnDataTypeProcedure.java -├── AbstractAlterOrDropTableProcedure.java ← 所有 alter/drop 的基类 -└── view/... ← view 变体 -``` - -这些 Procedure 的共同点:**在真正提交元数据变更之前 / 删除数据之前,必须先让所有 DN 把相关本地缓存清掉或进入"待更新"态**。这一步通过向所有 DN 广播 RPC 完成。 - -### 2.2 关键代码:广播给"所有已注册 DN",任一失败即整体失败 - -以"加列"为例,`AddTableColumnProcedure` 的状态机是: - -``` -COLUMN_CHECK → PRE_RELEASE → ADD_COLUMN → COMMIT_RELEASE -``` - -`PRE_RELEASE` 步调用基类 `AbstractAlterOrDropTableProcedure.preRelease(env)`,进而调用 `SchemaUtils.preReleaseTable(...)`。后者是整个问题的核心: - -```java -// SchemaUtils.java (≈ L243-262) preReleaseTable -final Map dataNodeLocationMap = - configManager.getNodeManager().getRegisteredDataNodeLocations(); // ← 所有已注册 DN -final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>(CnToDnAsyncRequestType.UPDATE_TABLE, req, dataNodeLocationMap); -CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); -return clientHandler.getResponseMap().entrySet().stream() - .filter(e -> e.getValue().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) - .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); // ← 返回"失败的 DN" -``` - -- `getRegisteredDataNodeLocations()`(`NodeManager.java` ≈ L688-697)返回**所有曾经注册过的 DN,不区分当前是否存活**。 -- `sendAsyncRequestWithRetry` 内部最多重试 `MAX_RETRY_NUM = 6` 次(`AsyncRequestManager.java`)。一个 DN 只有返回 SUCCESS 才会从重试集合移除;不可达的 DN(连接拒绝 / 超时)由 `DataNodeTSStatusRPCHandler.onError` 写入一个**错误 TSStatus**,并**保留在重试集合**里,6 轮耗尽后仍是失败项。 - -Procedure 拿到非空的 `failedResults` 后: - -```java -// AbstractAlterOrDropTableProcedure.java (≈ 
L96-101) -if (!failedResults.isEmpty()) { - // All dataNodes must clear the related schema cache ← 设计者的注释 - setFailure(new ProcedureException(new MetadataException(...))); - return; -} -``` - -`setFailure` 把 Procedure 置为 `FAILED`,ProcedureExecutor 随后触发**回滚**。结果就是:**只要有 1 个 DN 不可达,DDL 失败**。 - -> 同样的 "All dataNodes must clear the related schema cache / schemaEngine cache" 注释与 `setFailure` 逻辑,硬编码在至少 5 处:`AbstractAlterOrDropTableProcedure`(正向 L96-101 与回滚 L144-148)、`CreateTableProcedure`(L153-169)、`DropTableProcedure`(L141-167)、`DropTableColumnProcedure`(L152-188)。这是一个**全集群一致的硬约束**,不是个别遗漏。 - -### 2.3 DataNode 上与表模型相关的几类缓存 - -DDL 之所以要广播失效,是因为 DN 上确实缓存了 schema,且**关键路径直接信任本地缓存**。 - -| 缓存 | 类 / 字段 | Key | 内容 | 谁来读(危险路径) | -|---|---|---|---|---| -| 表 schema(已提交) | `DataNodeTableCache.databaseTableMap`(L64) | db, table | `TsTable`:列定义、列类别(TAG/ATTRIBUTE/FIELD/TIME)、数据类型、表属性(TTL 等) | **写入校验**:`getTableInWrite`(L316)直接读,不回 CN | -| 表 schema(变更中) | `DataNodeTableCache.preUpdateTableMap`(L67) | db, table | `(TsTable, version)`:DDL 进行中的"待更新"占位 | 查询取 schema 的 `getTable`(L329)遇到它会**回 CN 重新拉取** | -| 设备属性 | `TableDeviceSchemaCache`(dualKeyCache,table 模型 `deviceSchema`) | db, table, deviceID | `Map`:每个设备的 ATTRIBUTE 列名→值 | 查询属性:`getDeviceAttribute`(L141)直接读,命中即返回,不回 schema region | -| last 值 | `TableDeviceSchemaCache`(table 模型 `lastCache`) | db, table, deviceID, measurement | `TimeValuePair`:每列最后一个点 | `LAST` 查询直接读 | - -补充:设备 ATTRIBUTE 的**权威存储**是 schema region 内的 `DeviceAttributeStore`(按 snapshot 持久化),但**查询读属性走的是 `TableDeviceSchemaCache` 缓存**——命中缓存就不回 schema region。这正是属性脏读的来源。 - -DN 侧接收 CN 广播的 RPC handler 都在 `DataNodeInternalRPCServiceImpl`: - -- `updateTable`(L1813):按子类型分发 `PRE_UPDATE_TABLE` / `ROLLBACK_UPDATE_TABLE` / `COMMIT_UPDATE_TABLE`,驱动 `DataNodeTableCache` 的两阶段协议。 -- `invalidateTableCache`(L1842):整表失效(drop table)。 -- `invalidateColumnCache`(L2033):单列失效(drop column)。 -- `deleteColumnData`(L2051):物理删除列数据(drop column 第三步)。 - -所有失效 handler 都会先拿 
`SchemaLockType.VALIDATE_VS_DELETION_TABLE` 的**写锁**。 - -### 2.4 两阶段协议与 `VALIDATE_VS_DELETION_TABLE` 锁 - -DDL 用一个 **pre-release / commit-release / rollback** 的两阶段(对 drop 列是"先失效缓存、再删数据、最后提交"的三阶段)协议来保证跨 DN 的原子性: - -- **PRE_RELEASE**:广播 `PRE_UPDATE_TABLE`,每个 DN 把新 `TsTable` 放入 `preUpdateTableMap`,使得**取 schema 的读路径**在变更窗口内回 CN 拉最新版本。**任一 DN 失败 → 整体失败**。 -- **(中间)** 在 CN consensus 提交真正的元数据变更。 -- **COMMIT_RELEASE**:广播 `COMMIT_UPDATE_TABLE`,DN 把表从 `preUpdateTableMap` 落到 `databaseTableMap`。**失败只告警、不失败 Procedure**(见 §2.5 的关键不对称)。 - -`VALIDATE_VS_DELETION_TABLE` 锁的协议在 `SchemaLockType.java`(L52-62)写得很清楚: - -``` -1. 写入 / load TsFile 校验 schema 前,加读锁; -2. 完成后释放读锁; -3. 表相关删除时,作废 device cache 前,加写锁; -4. 完成失效后释放写锁。 -``` - -这把锁只保证**单个 DN 内**"写入校验"与"缓存失效"互斥。"所有 DN 都必须 ack"这一条,则是把这种互斥**提升到集群级**:CN 不提交元数据变更,直到确信每个 DN 都拆掉了旧缓存。 - -### 2.5 关键不对称:PRE 必须全可达,COMMIT 却允许失败 - -这是理解问题、也是设计方案的关键线索: - -| 步骤 | 一个 DN 不可达时 | 原因 | -|---|---|---| -| PRE_RELEASE / INVALIDATE_CACHE | **整体失败 + 回滚** | 元数据还没变。必须保证"没有任何活着的 DN 还揣着旧缓存跨过这次变更" | -| COMMIT_RELEASE | **仅告警,Procedure 成功** | 元数据已提交。漏掉 commit 的 DN 只是 `preUpdateTableMap` 里留了个占位,其读路径会持续回 CN 拉最新版 → 最终一致、且安全 | - -**结论**:系统其实**已经容忍** DN 在 commit 阶段缺席(最终一致)。真正的硬约束只在**变更前的失效/预发布**这一步。而这一步要求"全员可达"的唯一目的,是**防止某个活着的 DN 带着旧缓存跨过变更点**。这恰恰是我们可以用 fencing 来替代的部分。 - -### 2.6 为什么"必须所有 DN"——三个正确性场景 - -如果允许某个**活着但分区**的 DN 漏掉失效,可能产生的脏数据: - -1. **写路径用过期表 schema(类型/类别错乱)** - 写入校验入口 `TableHeaderSchemaValidator`(`validateInsertNodeMeasurements` L343 / `validateTableHeaderSchema4TsFile` L102)先加 `VALIDATE_VS_DELETION_TABLE` 读锁,然后 `DataNodeTableCache.getTableInWrite(...)`(L123 / L363)**直接读 `databaseTableMap`**。该方法对 FIELD 列的数据类型校验"交给上层",类型不一致不会在这层拦截。 - 设想列 `pressure` 在 CN 上由 `FLOAT` 改成 `DOUBLE`,但分区 DN 仍缓存 `FLOAT`:路由到该 DN 的写入会以 `DOUBLE` 落盘,而该 DN 的列定义却是 `FLOAT`——后续按 `FLOAT` 解码即得到**错误数值**,且**对客户端无任何报错**(静默类型损坏)。若错配发生在 TAG 与 FIELD 之间,物理存储路径完全不同(tag 进 deviceID、field 进 measurement),会产生**正常查询无法触及的结构性脏数据**。 - -2. 
**属性缓存脏读** - 查询读属性 `TableDeviceSchemaFetcher.tryGetTableDeviceInCache`(L413-452)→ `cache.getDeviceAttribute`(L424)**纯内存命中即返回**,不回 schema region。`DROP COLUMN`(属性列)经 `invalidateColumnCache` 把该列从每个设备的属性 map 移除;若某分区 DN 漏掉这次失效,它仍会把已删除列的旧值当作有效属性返回——**返回一个 schema 里已不存在的列的值**。 - -3. **DROP COLUMN:物理删除 + 幽灵数据(最危险)** - `DropTableColumnProcedure` 状态机: - - ``` - CHECK_AND_INVALIDATE_COLUMN → INVALIDATE_CACHE → EXECUTE_ON_REGIONS → DROP_COLUMN - ``` - - 顺序保证是:**先让所有 DN 失效缓存(此后没有新写入能写进该列)→ 再物理删除 TsFile/属性数据 → 最后在 CN 提交删列**。 - 若某 DN 漏掉 `INVALIDATE_CACHE` 且系统仍继续:数据被物理删除、CN schema 已删列,但该 DN 缓存里**该列仍存在**;新写入路由到它会**通过校验**并把该列数据写进 WAL/memtable——于是出现"存储里有、schema 里没有"的**幽灵列数据**;查询扫到这些字节会静默跳过或解码报错。属性列情形更糟:脏值落在 schema region 持久化存储里,副本间**持久性数据分叉**,难以自动 reconcile。 - -> 这三点正是用户所说"DN 上有几种 Cache,不清理则在网络分区时可能产生脏数据"的具体机理。 - -### 2.7 现状的不合理之处 - -把上面拼起来,问题的本质是: - -> **当前实现用"所有已注册 DN 必须同步 ack 缓存失效"来保证正确性,却没有区分"DN 真宕机"(缓存已随进程消失,本质安全)与"DN 活着但与 CN 分区"(缓存仍在,真正危险)。对前者本可放行,却一并判失败,于是牺牲了 DDL 的高可用。** - -证据: - -- 一个**真正宕机**的 DN,重启后必然走注册流程,从 CN 的 `runtimeConfiguration.getTableInfo()` 重建 `DataNodeTableCache`(`DataNode.java:523` 的 `init(...)`),**不可能用旧缓存服务任何请求**。它在宕机期间也不服务任何请求。对它而言,"等它 ack 失效"在逻辑上是多余的。 -- CN 侧其实**已经知道** DN 是否可达:`DataNodeHeartbeatCache` 通过 Phi-Accrual `failureDetector` 把失联 DN 标为 `Unknown`。树模型的 `DeleteDatabaseProcedure` 走的 `ConfigNodeProcedureEnv.invalidateCache`(L164-221)已经会**检查 NodeStatus、对 `Unknown` 重试 10 次 / 5s** ——说明"失效时参考 NodeStatus"这条路代码里已有先例,只是最终仍是"超时即失败",没有走到"放行"。 - -缺的那一环是:**没有任何机制保证一个"活着但分区"的 DN 会主动停止使用旧缓存**。只要补上这一环(DN 自我 fencing),"不可达"就能安全地等价于"已隔离",CN 就能放心放行。 - ---- - -## 3. 解决方案 - -### 3.1 设计目标 - -1. **正确性不回退**:任何已提交的表 schema 变更之后,集群中**不存在**任何 DN 用过期缓存接受写入或返回旧值(尤其是 DROP 列物理删除之前,必须保证没有 DN 还能写该列)。 -2. **可用性提升**:挂掉**少数** DN(典型:3 副本挂 1)时,表模型 DDL 仍能成功。 -3. **复用现有设施**:尽量基于现有心跳 / NodeStatus / 注册 resync,不引入新的重协议。 -4. 
**常态零额外开销**:全员存活时路径与现状一致,无新增等待。 - -### 3.2 核心思想:把"缓存有效性"绑定到"与 ConfigNode 的租约" - -引入一个概念:**DN 的表模型缓存只有在它"持有 CN 租约"期间才可信**。租约就用现有心跳承载——DN 持续收到 CN 心跳即续约;一旦在 `T_fence` 内收不到心跳,租约过期,DN 必须**自我隔离(self-fencing)**。 - -于是: - -- **DN 真宕机** → 进程没了,缓存没了,重启 resync,安全。 -- **DN 活着但分区** → `T_fence` 后租约过期,自我隔离(作废表缓存 + 对依赖缓存的表操作 fail-closed),不再可能产生脏数据。 -- 两种情况下,"CN 联系不上的 DN"在 `T_fence` 之后都**保证不会用旧缓存服务请求**。CN 据此放行。 - -### 3.3 组件一:DataNode 自我隔离(self-fencing)——新增 - -这是方案中**唯一全新的机制**,也是正确性的基石。 - -1. **记录最后心跳时间**:在 `getDataNodeHeartBeat`(`DataNodeInternalRPCServiceImpl.java:2226`)里记录"最近一次收到 CN 心跳"的单调时钟时间戳(DN 当前**不**记录,但 handler 就在那,改动很小)。 -2. **后台 fencing 检查**:DN 起一个轻量定时任务,若 `now - lastHeartbeatFromCN > T_fence`,进入 **FENCED** 态: - - 作废 `DataNodeTableCache`(`databaseTableMap` + `preUpdateTableMap`)与 `TableDeviceSchemaCache`(属性 + last)。 - - 设 `tableSchemaFenced = true`。 -3. **FENCED 态下 fail-closed**: - - 写入校验(`TableHeaderSchemaValidator`)与取 schema(`getTableInWrite` / `getTable`)在 FENCED 态下**不信任本地缓存**:要么回 CN 现拉,CN 不可达则**直接拒绝该操作**(fail-closed,宁可不可用也不写脏);要么干脆对表写入/查询返回"schema 暂不可用,请重试"。 - - 属性 / last 缓存查询同理:FENCED 态视为 miss,回源;回不了源则失败。 -4. 
**续约即恢复**:恢复收到 CN 心跳后,**先强制 resync**(组件二)再清除 FENCED 态。 - -> 失败语义:分区少数侧的客户端在 `T_fence` 后会被 fail-closed 拒绝表读写——这正是 CP 系统对少数派分区的**正确**行为;多数侧(含 CN)保持可用。 - -### 3.4 组件二:重连后强制 resync——增强现有路径 - -DN 从 FENCED 恢复(或重启注册)时,在**对外服务表请求之前**必须把缓存与 CN 对齐: - -- 复用现有注册 resync:重启路径已通过 `DataNodeTableCache.init(runtimeConfiguration.getTableInfo())`(`DataNode.java:523`)重建缓存。 -- 对"未重启、仅心跳恢复"的 FENCED→恢复路径,新增一次**主动全量拉取**(沿用 `getTable` 已有的 `fetchTables`/`ClusterConfigTaskExecutor` 回 CN 的能力):拉到当前 schema 版本后再清 FENCED。 -- 可选优化:在心跳响应里带一个**单调递增的 schema epoch**;DN 比对本地 epoch,落后才触发全量拉取,常态只续约不拉数据。 - -### 3.5 组件三:ConfigNode DDL 容忍"已隔离/已宕机"的 DN——核心改动 - -改造 §2.2 那一步"广播失效 + 任一失败即 setFailure"的逻辑。把失效广播的结果分三类处理,而不是一刀切失败: - -对每个未返回 SUCCESS 的 DN,查其 `NodeStatus` / 最近成功联系时间: - -| DN 情况 | 判定 | 处理 | -|---|---|---| -| `Running`(可达),但 RPC 报错 | 真错误(如 DN 内部异常) | 重试;仍失败则**失败 Procedure**(与现状一致) | -| `Unknown` / 失联,且失联时长 < `T_proceed` | 可能还没自我隔离 | **等待**至 `T_proceed`(或在此期间它恢复并 ack) | -| 失联时长(`hbAge`)≥ `T_proceed` | 保证已自我隔离或已宕机 | **视为安全,放行**;该 DN 恢复时由其**自驱 resync**,CN 无需记录(见通用方案 `cluster-metadata-ha-fencing-design.md` §2.3) | -| 已被移除 / 确认下线 | 不再是集群成员 | 放行 | - -放行后: - -- DDL 照常提交元数据(并对 DROP 列继续物理删除——此时已保证无活着的旧缓存 DN)。 -- 把"该 DN 需要 resync"持久化(或依赖组件二的 DN 自恢复 resync)。该 DN 恢复时被强制对齐后才重新服务(组件二保证它在对齐前处于 FENCED,不会脏读/脏写)。 - -> 这本质上是把树模型 `ConfigNodeProcedureEnv.invalidateCache`(L164-221)已有的"NodeStatus 感知 + 重试",从"超时即失败"改成"确认隔离后放行"。 - -### 3.6 时序与正确性论证(lease ordering) - -记号: -- `T_hb`:心跳间隔。 -- `T_fence`:DN 自我隔离阈值(收不到心跳超过它就 fence)。 -- `T_proceed`:CN 判定"该失联 DN 已安全隔离"所需的失联时长。 - -**安全不变式**:CN 在提交变更(及 DROP 列物理删除)时,对每个未 ack 的 DN,要么它已 ack,要么它**已经自我隔离**。 - -**为什么 `T_proceed ≥ T_fence + margin` 即可保证**: -心跳方向是 CN→DN。CN 对某 DN 的"最近一次成功联系"时刻 `t_cn`,在真实时间上**不早于** DN"最近一次收到心跳"的时刻 `t_dn`(DN 先收到、CN 才拿到响应)。DN 在 `t_dn + T_fence` 自我隔离。因此只要 CN 自 `t_cn` 起又过了 `T_fence + margin`(`margin` 覆盖时钟漂移、DN fence 检查周期、网络抖动),就能确信 `now ≥ t_dn + T_fence`,即该 DN **已隔离**。取 `T_proceed = T_fence + margin` 成立。 - -**与 commit 不对称的呼应**:§2.5 已说明系统本就容忍 commit 
阶段缺席(最终一致)。本方案只是把"变更前失效"这步从"全员强同步"放宽为"全员 ack 或 已隔离",正确性边界没有降低——因为"已隔离"DN 与"已清缓存"DN 对外行为等价(都不会用旧缓存服务)。 - -**DROP 列的特别说明**:物理删除(`EXECUTE_ON_REGIONS`)必须在"全员 ack 或 已隔离"**之后**才执行。隔离 DN 在恢复 resync 前处于 FENCED、不接受该表写入,故不会在删除后再写出幽灵列。顺序不变,安全。 - -### 3.7 各操作的处理要点 - -| 操作 | 现状失败点 | 方案后 | -|---|---|---| -| CREATE TABLE | PRE_RELEASE 广播 `PRE_UPDATE_TABLE` 全员必达 | 失联且已隔离的 DN 放行;它恢复 resync 时自然学到新表 | -| ADD COLUMN / SET PROPERTIES / RENAME | `preRelease` 全员必达 | 同上;隔离 DN 恢复后拉到新 schema | -| DROP COLUMN | `INVALIDATE_CACHE` 全员必达,且卡住后续物理删除 | 隔离 DN 放行后再删数据;隔离 DN 恢复前 FENCED,不会写幽灵列 | -| DROP TABLE | `invalidateTableCache` 全员必达 | 同 DROP COLUMN | -| COMMIT_RELEASE | 本就只告警 | 不变 | - -### 3.8 备选方案(讨论用) - -**备选 A:写路径 schema 版本号 fencing(更彻底但更侵入)** -给每张表一个在 CN consensus 提交的单调 schema 版本 `V`,并下沉到数据写入路径:每次写入用所用 schema 的 `V` 打戳,region 侧(持有权威已提交 `V`)拒绝**低版本**写入。这样即使某 DN 缓存过期,它的写入也会在 region 层被版本校验拦下,从根上杜绝脏写。 -- 优点:不依赖时间/租约推理,纯版本号比较,最严格。 -- 缺点:需把表 schema 版本贯穿到数据写入 consensus 路径(当前数据 region 并不知道表 schema 版本),改动面大;对读路径脏读仍需另行处理。 -- 建议:作为长期演进选项,可与组件一/三组合(fencing 解决可用性,版本号兜底正确性)。 - -**备选 B:仅缩小广播范围** -有人可能想"只对受影响 schema region 的副本所在 DN 广播"。**不可行**:表缓存存在于**每个** DN(任何 DN 都可能做查询协调者并缓存任意表 schema、也可能承载该表数据 region),不是只在副本 DN 上。所以无法用 quorum 替代全员。这条排除,正好反衬出组件一(让每个 DN 自我兜底)才是对的方向。 - -### 3.9 实施计划与涉及文件 - -**Phase 1:DataNode 自我 fencing(正确性基石,先落地)** -- `DataNodeInternalRPCServiceImpl.getDataNodeHeartBeat`(L2226):记录 `lastHeartbeatFromCnNanos`。 -- 新增 fencing 检查任务 + FENCED 状态(建议挂在 schema engine / `DataNodeTableCache` 附近)。 -- `DataNodeTableCache`:FENCED 态下 `getTableInWrite` / `getTable` 不信任本地缓存;提供 `fenceAll()` / `clearFence()`。 -- `TableDeviceSchemaCache`:FENCED 态下属性/last 查询按 miss 处理。 -- `TableHeaderSchemaValidator`(L102/L343):FENCED 态写入校验 fail-closed。 -- 配置项:`T_fence`(默认 **20s**,与 `failureDetectorFixedThresholdInMs` 对齐,避免误 fence 健康 DN)。表 DDL 中 **CREATE TABLE / ADD COLUMN 等加性操作即时放行**(陈旧 DN 对未知实体天然 fail-closed),只有 DROP/RENAME/SET 等**破坏/语义变更类**在确有 DN 不可达时等 `T_proceed = T_fence + margin ≈ 25s`(`margin≈5s` 内部派生)。分类与论证见通用方案 §2.6/§3。 
- -**Phase 2:DN 恢复 resync** -- 心跳响应增加 `schemaEpoch`(可选优化);DN 心跳恢复后落后才全量拉取。 -- 复用 `DataNode.java:523` 注册 resync;新增"心跳恢复"分支的主动拉取。 - -**Phase 3:ConfigNode DDL 放行逻辑(兑现可用性)** -- `SchemaUtils.preReleaseTable / commitReleaseTable / rollbackPreRelease`(≈L243-318):返回结果区分"真失败 / 失联可放行"。 -- `AbstractAlterOrDropTableProcedure.preRelease`(L89-110)、`CreateTableProcedure`(L153-169)、`DropTableProcedure`(L141-167)、`DropTableColumnProcedure`(L152-188):把"任一失败即 setFailure"改为"按 §3.5 表格分类处理"。 -- 引入基于 `hbAge` 的判定(参考并改造 `ConfigNodeProcedureEnv.invalidateCache` L164-221),落地"加性即时放行 / 破坏类等 `T_proceed = T_fence + margin`"。 -- 隔离 DN 恢复时由其**自驱 resync**,CN 无需记录 laggards(见通用方案 §2.3)。 - -**Phase 4(可选):备选 A 的写路径版本号兜底**——长期演进。 - -**测试** -- 复用 / 扩展现有 IT:在 1C3D 集群停 1 DN,验证 CREATE / ADD / DROP / SET / RENAME / DROP TABLE 均成功。 -- 注入"活着但分区"场景(阻断 DN↔CN 心跳但保留 DN↔client):验证 `T_fence` 后该 DN 对表读写 fail-closed,不产生脏数据;恢复后 resync 正确。 -- DROP 列并发写:验证隔离 DN 不会写出幽灵列。 -- 仅运行本次新增/改动的 IT,不跑全量。 - -### 3.10 风险与权衡 - -- **DDL 延迟**:仅当确有 DN 失联时,CN 需等待至 `T_proceed` 才放行;全员存活时无额外等待。可接受。 -- **少数派读不可用**:分区少数侧的 DN 在 `T_fence` 后对表读写 fail-closed,牺牲该侧可用性换正确性——符合 CP 取舍。若未来要给"可容忍轻微陈旧"的读开口子,可在备选 A 的版本号体系下单独放宽,但默认 fail-closed。 -- **`T_fence` 取值**:默认 20s(与失败检测阈值对齐,避免误 fence)。破坏类操作的下线等待 ~25s 主要由加性快速路径抵消;全员存活恒零等待。详见通用方案 §2.6。 -- **时钟假设**:论证用单调时钟与保守 margin,不依赖跨节点钟同步。 - ---- - -## 4. 附:关键文件索引 - -CN 侧: -- `confignode/.../procedure/impl/schema/SchemaUtils.java` —— `preReleaseTable`/`commitReleaseTable`/`rollbackPreRelease`(≈L243-318) -- `confignode/.../procedure/impl/schema/table/AbstractAlterOrDropTableProcedure.java` —— `preRelease`/`commitRelease`/`rollbackPreRelease`,"All dataNodes must clear..." 
注释 -- `confignode/.../procedure/impl/schema/table/{CreateTable,DropTable,AddTableColumn,DropTableColumn,SetTableProperties,RenameTableColumn}Procedure.java` -- `confignode/.../procedure/env/ConfigNodeProcedureEnv.java` —— 树模型 `invalidateCache`(L164-221,NodeStatus 重试先例) -- `confignode/.../manager/node/NodeManager.java` —— `getRegisteredDataNodeLocations`(L688-697) -- `confignode/.../manager/load/cache/node/DataNodeHeartbeatCache.java` —— Phi-Accrual 失败检测 → `Unknown` -- `node-commons/.../client/request/AsyncRequestManager.java` —— `MAX_RETRY_NUM = 6` - -DN 侧: -- `datanode/.../schemaengine/table/DataNodeTableCache.java` —— 表 schema 缓存 + 两阶段协议(`databaseTableMap` L64 / `preUpdateTableMap` L67 / `getTableInWrite` L316 / `getTable` L329) -- `datanode/.../queryengine/plan/relational/metadata/fetcher/cache/TableDeviceSchemaCache.java` —— 属性 + last 缓存(`getDeviceAttribute` L141 / `invalidate` L614,L676) -- `datanode/.../queryengine/plan/relational/metadata/fetcher/TableDeviceSchemaFetcher.java` —— 属性读缓存路径(`tryGetTableDeviceInCache` L413-452) -- `datanode/.../queryengine/plan/relational/metadata/fetcher/TableHeaderSchemaValidator.java` —— 写入校验入口(L102 / L343) -- `datanode/.../queryengine/plan/analyze/lock/SchemaLockType.java` —— `VALIDATE_VS_DELETION_TABLE`(L52-62) -- `datanode/.../protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java` —— RPC handler:`updateTable` L1813 / `invalidateTableCache` L1842 / `invalidateColumnCache` L2033 / `deleteColumnData` L2051 / `getDataNodeHeartBeat` L2226 -- `datanode/.../schemaengine/schemaregion/attribute/DeviceAttributeStore.java` —— 属性权威存储(snapshot 持久化) -- `datanode/.../service/DataNode.java` —— 启动 resync(`DataNodeTableCache.init` L523)