diff --git a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppCommonConfig.java b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppCommonConfig.java index 4852b9d116e25..403671ac3c64f 100644 --- a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppCommonConfig.java +++ b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppCommonConfig.java @@ -85,6 +85,12 @@ public CommonConfig setMemtableSizeThreshold(long memtableSizeThreshold) { return this; } + @Override + public CommonConfig setMetadataLeaseFenceMs(long metadataLeaseFenceMs) { + setProperty("metadata_lease_fence_ms", String.valueOf(metadataLeaseFenceMs)); + return this; + } + @Override public CommonConfig setPartitionInterval(long partitionInterval) { setProperty("time_partition_interval", String.valueOf(partitionInterval)); diff --git a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppSharedCommonConfig.java b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppSharedCommonConfig.java index 582c9a049e492..df54eb295be48 100644 --- a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppSharedCommonConfig.java +++ b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppSharedCommonConfig.java @@ -61,6 +61,13 @@ public CommonConfig setMemtableSizeThreshold(long memtableSizeThreshold) { return this; } + @Override + public CommonConfig setMetadataLeaseFenceMs(long metadataLeaseFenceMs) { + cnConfig.setMetadataLeaseFenceMs(metadataLeaseFenceMs); + dnConfig.setMetadataLeaseFenceMs(metadataLeaseFenceMs); + return this; + } + @Override public CommonConfig setPartitionInterval(long partitionInterval) { cnConfig.setPartitionInterval(partitionInterval); diff --git a/integration-test/src/main/java/org/apache/iotdb/it/env/remote/config/RemoteCommonConfig.java b/integration-test/src/main/java/org/apache/iotdb/it/env/remote/config/RemoteCommonConfig.java index 48c157e957be8..0df0b61ce002f 100644 --- a/integration-test/src/main/java/org/apache/iotdb/it/env/remote/config/RemoteCommonConfig.java +++ b/integration-test/src/main/java/org/apache/iotdb/it/env/remote/config/RemoteCommonConfig.java @@ -44,6 +44,11 @@ public CommonConfig setMemtableSizeThreshold(long memtableSizeThreshold) { return this; } + @Override + public CommonConfig setMetadataLeaseFenceMs(long metadataLeaseFenceMs) { + return this; + } + @Override public CommonConfig setPartitionInterval(long partitionInterval) { return this; diff --git a/integration-test/src/main/java/org/apache/iotdb/itbase/env/CommonConfig.java b/integration-test/src/main/java/org/apache/iotdb/itbase/env/CommonConfig.java index dc21234e2bad2..41263510f38d7 100644 --- a/integration-test/src/main/java/org/apache/iotdb/itbase/env/CommonConfig.java +++ b/integration-test/src/main/java/org/apache/iotdb/itbase/env/CommonConfig.java @@ -32,6 +32,8 @@ public interface CommonConfig { CommonConfig setMemtableSizeThreshold(long memtableSizeThreshold); + CommonConfig setMetadataLeaseFenceMs(long metadataLeaseFenceMs); + CommonConfig setPartitionInterval(long partitionInterval); CommonConfig setCompressor(String compressor); diff --git a/integration-test/src/test/java/org/apache/iotdb/relational/it/schema/IoTDBTableDDLHAIT.java b/integration-test/src/test/java/org/apache/iotdb/relational/it/schema/IoTDBTableDDLHAIT.java new file mode 100644 index 0000000000000..1df3241b43d72 --- /dev/null +++ b/integration-test/src/test/java/org/apache/iotdb/relational/it/schema/IoTDBTableDDLHAIT.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.relational.it.schema; + +import org.apache.iotdb.it.env.EnvFactory; +import org.apache.iotdb.it.env.cluster.node.DataNodeWrapper; +import org.apache.iotdb.it.framework.IoTDBTestRunner; +import org.apache.iotdb.itbase.category.TableClusterIT; +import org.apache.iotdb.itbase.env.BaseEnv; + +import org.awaitility.Awaitility; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.runner.RunWith; + +import java.sql.Connection; +import java.sql.ResultSet; +import java.sql.Statement; +import java.util.concurrent.TimeUnit; + +import static org.junit.Assert.assertTrue; + +/** + * HA behavior of table-model DDL: a table DDL must broadcast a cache-invalidation to every + * DataNode. Before the metadata-lease/fence change it hard-failed whenever any DataNode was + * unreachable (so a single down DataNode broke CREATE TABLE, contradicting multi-replica HA). With + * the change the ConfigNode proceeds once the unreachable DataNode is provably self-fenced (it + * fails closed on its stale caches and resyncs on recovery, so it cannot serve dirty schema). + * + *

This test stops one DataNode and asserts CREATE TABLE still succeeds. + */ +@RunWith(IoTDBTestRunner.class) +@Category({TableClusterIT.class}) +public class IoTDBTableDDLHAIT { + + @BeforeClass + public static void setUp() throws Exception { + // Small fence threshold so the ConfigNode can prove the stopped DataNode is self-fenced quickly + // (T_proceed = fence + ~5s internal margin), keeping the test fast. Live DataNodes keep + // heartbeating (~1s), so they do not spuriously fence. + EnvFactory.getEnv().getConfig().getCommonConfig().setMetadataLeaseFenceMs(2000); + EnvFactory.getEnv().initClusterEnvironment(1, 3); + } + + @AfterClass + public static void tearDown() throws Exception { + EnvFactory.getEnv().cleanClusterEnvironment(); + } + + @Test + public void createTableSucceedsWhileOneDataNodeIsDown() throws Exception { + final DataNodeWrapper liveDataNode = EnvFactory.getEnv().getDataNodeWrapper(0); + final DataNodeWrapper victimDataNode = EnvFactory.getEnv().getDataNodeWrapper(2); + + // Pin the connection to a DataNode we will keep alive, so stopping the victim cannot break it. + try (final Connection connection = + EnvFactory.getEnv() + .getConnection(liveDataNode, "root", "root", BaseEnv.TABLE_SQL_DIALECT); + final Statement statement = connection.createStatement()) { + statement.execute("CREATE DATABASE test_ha"); + statement.execute("USE test_ha"); + + // Sanity: with all DataNodes up the DDL broadcast acks everywhere and succeeds immediately. + statement.execute("CREATE TABLE t_all_up (region STRING TAG, temperature FLOAT FIELD)"); + + // Take one DataNode down. Its last successful ConfigNode contact is now frozen; after + // T_proceed the ConfigNode can treat it as self-fenced and stop waiting for its ack. + victimDataNode.stop(); + Assert.assertFalse("victim DataNode should be stopped", victimDataNode.isAlive()); + + // The DDL broadcast can no longer reach the stopped DataNode. Previously this hard-failed; + // now it must still succeed (after blocking ~T_proceed while the fence is proven). + statement.execute("CREATE TABLE t_after_down (region STRING TAG, temperature FLOAT FIELD)"); + + // Confirm the new table is really visible on the live DataNode. + Awaitility.await() + .atMost(30, TimeUnit.SECONDS) + .pollInterval(1, TimeUnit.SECONDS) + .until(() -> tableExists(statement, "t_after_down")); + assertTrue( + "CREATE TABLE must succeed with one DataNode down", + tableExists(statement, "t_after_down")); + } + } + + private static boolean tableExists(final Statement statement, final String tableName) + throws Exception { + try (final ResultSet resultSet = statement.executeQuery("SHOW TABLES")) { + while (resultSet.next()) { + if (tableName.equalsIgnoreCase(resultSet.getString(1))) { + return true; + } + } + } + return false; + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/heartbeat/DataNodeHeartbeatHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/heartbeat/DataNodeHeartbeatHandler.java index e7a31b1dc73eb..c08aee0d1bdc1 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/heartbeat/DataNodeHeartbeatHandler.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/heartbeat/DataNodeHeartbeatHandler.java @@ -26,6 +26,7 @@ import org.apache.iotdb.commons.cluster.RegionStatus; import org.apache.iotdb.confignode.conf.ConfigNodeConfig; import org.apache.iotdb.confignode.conf.ConfigNodeDescriptor; +import org.apache.iotdb.confignode.manager.lease.DataNodeContactTracker; import org.apache.iotdb.confignode.manager.load.LoadManager; import org.apache.iotdb.confignode.manager.load.cache.consensus.ConsensusGroupHeartbeatSample; import org.apache.iotdb.confignode.manager.load.cache.node.NodeHeartbeatSample; @@ -82,6 +83,16 @@ public DataNodeHeartbeatHandler( @Override public void onComplete(TDataNodeHeartbeatResp heartbeatResp) { + // A successful response confirms ConfigNode->DataNode contact; stamp it on the ConfigNode clock + // for the metadata-lease verdict. Kept separate from the load-cache samples (which record the + // echoed send-time) and deliberately not touched in onError, so failures never advance it. + final DataNodeContactTracker contactTracker = DataNodeContactTracker.getInstance(); + contactTracker.recordSuccessfulResponse(nodeId); + contactTracker.recordCapability( + nodeId, + heartbeatResp.isSetSupportsMetadataLeaseFencing() + && heartbeatResp.isSupportsMetadataLeaseFencing()); + // Update NodeCache loadManager .getLoadCache() diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/statemachine/ConfigRegionStateMachine.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/statemachine/ConfigRegionStateMachine.java index fe687d17556f7..cf61bbaea0445 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/statemachine/ConfigRegionStateMachine.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/statemachine/ConfigRegionStateMachine.java @@ -37,6 +37,7 @@ import org.apache.iotdb.confignode.i18n.ConfigNodeMessages; import org.apache.iotdb.confignode.manager.ConfigManager; import org.apache.iotdb.confignode.manager.consensus.ConsensusManager; +import org.apache.iotdb.confignode.manager.lease.DataNodeContactTracker; import org.apache.iotdb.confignode.manager.pipe.agent.PipeConfigNodeAgent; import org.apache.iotdb.confignode.persistence.executor.ConfigPlanExecutor; import org.apache.iotdb.confignode.persistence.schema.ConfigNodeSnapshotParser; @@ -291,6 +292,14 @@ public void notifyLeaderReady() { // Always start load services first configManager.getLoadManager().startLoadServices(); + // Reset every DataNode's last-contact time to now on (re)acquiring leadership: a stale + // timestamp + // left from a previous leadership term (while another ConfigNode was contacting the DataNodes) + // would otherwise let the metadata-broadcast verdict wrongly judge a live DataNode as fenced. + DataNodeContactTracker.getInstance() + .onLeadershipAcquired( + configManager.getNodeManager().getRegisteredDataNodeLocations().keySet()); + if (CONF.isEnableTopologyProbing()) { configManager.getLoadManager().startTopologyService(); } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/ClusterCachePropagator.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/ClusterCachePropagator.java new file mode 100644 index 0000000000000..74a49ca5c450d --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/ClusterCachePropagator.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.manager.lease; + +import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; +import org.apache.iotdb.common.rpc.thrift.TSStatus; +import org.apache.iotdb.commons.conf.CommonDescriptor; +import org.apache.iotdb.confignode.manager.IManager; +import org.apache.iotdb.confignode.manager.lease.MetadataBroadcastVerdict.DataNodeState; +import org.apache.iotdb.confignode.manager.lease.MetadataBroadcastVerdict.Verdict; +import org.apache.iotdb.rpc.TSStatusCode; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import java.util.function.IntPredicate; +import java.util.function.IntToLongFunction; +import java.util.function.LongSupplier; +import java.util.function.Supplier; + +/** + * Drives one Tier-A metadata cache-invalidation broadcast to the cluster and turns "which DataNodes + * acknowledged" into a {@link Verdict} via {@link MetadataBroadcastVerdict}. Instead of the legacy + * "any unreachable DataNode fails the operation", a DataNode that is provably self-fenced (out of + * ConfigNode contact for at least {@code T_proceed} and known to support fencing) is treated as + * safe to proceed past, delivering availability without risking dirty data (see the design doc). + * + *

The caller supplies a {@link CacheBroadcast} closure wrapping its specific RPC (table + * pre-release, schema-cache invalidation, permission-cache invalidation, ...); this class is + * agnostic to the request type and only interprets the per-DataNode {@link TSStatus} responses. + * + *

Stateless and cheap to construct per operation. Clock and sleep are injectable for testing. + */ +public class ClusterCachePropagator { + + /** + * {@code T_proceed = T_fence + margin}. The margin (default 5s) covers heartbeat-recording + * granularity and scheduling jitter; see design §2.6. Kept internal (not a user knob). + */ + private static final long DEFAULT_PROCEED_MARGIN_MS = 5_000L; + + /** How often to re-broadcast while waiting for unacked DataNodes to ack or to cross T_proceed. */ + private static final long RETRY_INTERVAL_MS = 1_000L; + + /** + * Extra slack on top of T_proceed before giving up, so a just-died DataNode can cross T_proceed. + */ + private static final long WAIT_BUDGET_BUFFER_MS = 5_000L; + + /** Broadcasts the cache invalidation to {@code targets} and returns the per-nodeId responses. */ + @FunctionalInterface + public interface CacheBroadcast { + Map sendTo(Map targets); + } + + /** Injectable sleep so the retry loop can be driven deterministically in tests. */ + @FunctionalInterface + interface Sleeper { + void sleepMs(long ms) throws InterruptedException; + } + + private final Supplier> registeredDataNodes; + private final IntPredicate supportsFencing; + private final IntToLongFunction hbAgeMs; + private final LongSupplier tProceedMs; + private final LongSupplier nanoClock; + private final Sleeper sleeper; + + public ClusterCachePropagator(final IManager configManager) { + this( + () -> configManager.getNodeManager().getRegisteredDataNodeLocations(), + nodeId -> DataNodeContactTracker.getInstance().supportsFencing(nodeId), + nodeId -> DataNodeContactTracker.getInstance().getMillisSinceLastSuccessfulResponse(nodeId), + () -> + CommonDescriptor.getInstance().getConfig().getMetadataLeaseFenceMs() + + DEFAULT_PROCEED_MARGIN_MS, + System::nanoTime, + Thread::sleep); + } + + ClusterCachePropagator( + final Supplier> registeredDataNodes, + final IntPredicate supportsFencing, + final IntToLongFunction hbAgeMs, + final LongSupplier tProceedMs, + final LongSupplier nanoClock, + final Sleeper sleeper) { + this.registeredDataNodes = registeredDataNodes; + this.supportsFencing = supportsFencing; + this.hbAgeMs = hbAgeMs; + this.tProceedMs = tProceedMs; + this.nanoClock = nanoClock; + this.sleeper = sleeper; + } + + /** + * Broadcast once and classify the result. {@code waitBudgetExhausted} turns a would-be {@link + * Verdict#WAIT} into {@link Verdict#FAIL} (the caller's retry budget ran out). + */ + public Verdict propagateOnce(final CacheBroadcast broadcast, final boolean waitBudgetExhausted) { + final Map targets = registeredDataNodes.get(); + final Map responses = broadcast.sendTo(targets); + final long tProceed = tProceedMs.getAsLong(); + final int successCode = TSStatusCode.SUCCESS_STATUS.getStatusCode(); + final List states = new ArrayList<>(targets.size()); + for (final Integer nodeId : targets.keySet()) { + final TSStatus status = responses.get(nodeId); + final boolean acked = status != null && status.getCode() == successCode; + // retiredOrFenceAcked is left false: there is no explicit fence-ack signal yet, and a + // Removing DataNode may still serve clients, so it must ack or be provably fenced like any + // other (see MetadataBroadcastVerdict's SAFE_GONE rule). + states.add( + new DataNodeState( + acked, false, supportsFencing.test(nodeId), hbAgeMs.applyAsLong(nodeId))); + } + return MetadataBroadcastVerdict.decide(states, tProceed, waitBudgetExhausted); + } + + /** + * Broadcast and retry until the verdict is {@link Verdict#PROCEED} (returns {@code true}) or the + * wait budget is exhausted with at least one DataNode still unsafe ({@link Verdict#FAIL}, returns + * {@code false}). Blocks the calling (procedure) thread for up to {@code T_proceed + buffer}. + */ + public boolean propagate(final CacheBroadcast broadcast) { + final long deadlineNanos = + nanoClock.getAsLong() + + TimeUnit.MILLISECONDS.toNanos(tProceedMs.getAsLong() + WAIT_BUDGET_BUFFER_MS); + while (true) { + final boolean waitBudgetExhausted = nanoClock.getAsLong() >= deadlineNanos; + final Verdict verdict = propagateOnce(broadcast, waitBudgetExhausted); + if (verdict == Verdict.PROCEED) { + return true; + } + if (verdict == Verdict.FAIL) { + return false; + } + try { + sleeper.sleepMs(RETRY_INTERVAL_MS); + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + return false; + } + } + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTracker.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTracker.java new file mode 100644 index 0000000000000..3bcba9e11bdd1 --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTracker.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.manager.lease; + +import java.util.Collection; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.function.LongSupplier; + +/** + * Tracks, per DataNode, the time the ConfigNode last received a successful heartbeat + * response from it, stamped with the ConfigNode's own monotonic clock at receipt. + * + *

This is the sound signal for deciding whether an unreachable DataNode has self-fenced (used by + * the metadata-lease verdict). It must be kept separate from the load-cache {@code + * NodeHeartbeatSample}s, which (a) record the heartbeat send time echoed back by the + * DataNode — not response receipt — and (b) are advanced to the current time by failure ({@code + * onError}) samples. Either property would break the verdict: send-time can make the ConfigNode + * believe a DataNode is fenced while it just renewed from a delayed heartbeat, and failure-advanced + * time would keep the age from ever growing. + * + *

By construction there is no method that advances the time on failure: only {@link + * #recordSuccessfulResponse(int)} updates it. A never-contacted DataNode reads as age 0 (treated as + * just-contacted) so the verdict never wrongly declares an unknown DataNode fenced. + */ +public class DataNodeContactTracker { + + private final LongSupplier nanoClock; + + private final Map lastSuccessfulResponseNanos = new ConcurrentHashMap<>(); + + // Whether each DataNode reports that it supports metadata-lease self-fencing. Defaults to false + // for not-yet-reported / not-yet-upgraded DataNodes, so the verdict treats them conservatively + // (never FENCED-SAFE) until they prove capability. + private final Map supportsFencing = new ConcurrentHashMap<>(); + + private DataNodeContactTracker() { + this(System::nanoTime); + } + + DataNodeContactTracker(final LongSupplier nanoClock) { + this.nanoClock = nanoClock; + } + + /** Record that a successful heartbeat response from the DataNode was just received. */ + public void recordSuccessfulResponse(final int dataNodeId) { + lastSuccessfulResponseNanos.put(dataNodeId, nanoClock.getAsLong()); + } + + /** + * Milliseconds since the ConfigNode last received a successful heartbeat response from the + * DataNode. Returns 0 (treated as just-contacted) if never recorded — conservative, so an unknown + * DataNode is never declared fenced. + */ + public long getMillisSinceLastSuccessfulResponse(final int dataNodeId) { + final Long lastNanos = lastSuccessfulResponseNanos.get(dataNodeId); + if (lastNanos == null) { + return 0L; + } + final long elapsedNanos = nanoClock.getAsLong() - lastNanos; + return elapsedNanos > 0 ? elapsedNanos / 1_000_000L : 0L; + } + + /** + * On acquiring leadership, treat all currently-registered DataNodes as just-contacted, so a new + * leader does not declare a DataNode fenced based on absent/stale history. + */ + public void onLeadershipAcquired(final Collection registeredDataNodeIds) { + final long now = nanoClock.getAsLong(); + for (final Integer dataNodeId : registeredDataNodeIds) { + lastSuccessfulResponseNanos.put(dataNodeId, now); + } + } + + /** Record whether a DataNode reports support for metadata-lease self-fencing. */ + public void recordCapability(final int dataNodeId, final boolean dnSupportsFencing) { + supportsFencing.put(dataNodeId, dnSupportsFencing); + } + + /** + * Whether the DataNode is known to support self-fencing. Defaults to false (conservative): an + * unknown/old DataNode is never treated as fenced by the verdict. + */ + public boolean supportsFencing(final int dataNodeId) { + return supportsFencing.getOrDefault(dataNodeId, false); + } + + public void removeDataNode(final int dataNodeId) { + lastSuccessfulResponseNanos.remove(dataNodeId); + supportsFencing.remove(dataNodeId); + } + + public static DataNodeContactTracker getInstance() { + return DataNodeContactTrackerHolder.INSTANCE; + } + + private static final class DataNodeContactTrackerHolder { + private static final DataNodeContactTracker INSTANCE = new DataNodeContactTracker(); + + private DataNodeContactTrackerHolder() {} + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/MetadataBroadcastVerdict.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/MetadataBroadcastVerdict.java new file mode 100644 index 0000000000000..07259b7bd6ef7 --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/lease/MetadataBroadcastVerdict.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.manager.lease; + +import java.util.Collection; + +/** + * Pure decision logic for the ConfigNode's metadata-broadcast verdict: after broadcasting a + * cache-invalidation/update to all DataNodes, decide whether it is safe to commit the metadata + * change given which DataNodes acknowledged and the state of those that did not. + * + *

Rules (design v5): + * + *

+ * + *

The overall verdict is {@code PROCEED} when no DataNode is {@code UNSAFE}; otherwise {@code + * WAIT} until the wait budget is exhausted, then {@code FAIL}. There is no "additive fast-path": + * every Tier-A operation follows the same rule (so a Running-but-unacked DataNode is never + * skipped). + */ +public final class MetadataBroadcastVerdict { + + public enum Verdict { + PROCEED, + WAIT, + FAIL + } + + public enum Disposition { + ACKED, + SAFE_GONE, + FENCED_SAFE, + UNSAFE + } + + private MetadataBroadcastVerdict() {} + + /** Per-DataNode inputs for one broadcast round. */ + public static final class DataNodeState { + private final boolean acked; + private final boolean retiredOrFenceAcked; + private final boolean supportsFencing; + private final long hbAgeMs; + + public DataNodeState( + final boolean acked, + final boolean retiredOrFenceAcked, + final boolean supportsFencing, + final long hbAgeMs) { + this.acked = acked; + this.retiredOrFenceAcked = retiredOrFenceAcked; + this.supportsFencing = supportsFencing; + this.hbAgeMs = hbAgeMs; + } + } + + public static Disposition classify(final DataNodeState state, final long tProceedMs) { + if (state.acked) { + return Disposition.ACKED; + } + if (state.retiredOrFenceAcked) { + return Disposition.SAFE_GONE; + } + if (!state.supportsFencing) { + // Capability is checked before any timing test: a DataNode that cannot self-fence can never + // be assumed fenced, no matter how long it has been silent. + return Disposition.UNSAFE; + } + if (state.hbAgeMs >= tProceedMs) { + return Disposition.FENCED_SAFE; + } + return Disposition.UNSAFE; + } + + public static Verdict decide( + final Collection states, + final long tProceedMs, + final boolean waitBudgetExhausted) { + for (final DataNodeState state : states) { + if (classify(state, tProceedMs) == Disposition.UNSAFE) { + return waitBudgetExhausted ? Verdict.FAIL : Verdict.WAIT; + } + } + return Verdict.PROCEED; + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java index 960d0a7977f51..6b3428cffeaa7 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java @@ -22,7 +22,6 @@ import org.apache.iotdb.common.rpc.thrift.TConfigNodeLocation; import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; import org.apache.iotdb.common.rpc.thrift.TConsensusGroupType; -import org.apache.iotdb.common.rpc.thrift.TDataNodeConfiguration; import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; import org.apache.iotdb.common.rpc.thrift.TRegionReplicaSet; import org.apache.iotdb.common.rpc.thrift.TSStatus; @@ -46,6 +45,7 @@ import org.apache.iotdb.confignode.exception.AddPeerException; import org.apache.iotdb.confignode.manager.ConfigManager; import org.apache.iotdb.confignode.manager.consensus.ConsensusManager; +import org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator; import org.apache.iotdb.confignode.manager.load.LoadManager; import org.apache.iotdb.confignode.manager.load.cache.region.RegionHeartbeatSample; import org.apache.iotdb.confignode.manager.node.NodeManager; @@ -95,7 +95,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.ReentrantLock; import java.util.stream.Collectors; @@ -162,62 +161,63 @@ public void preDeleteDatabase( * @throws TException Thrift IOE */ public boolean invalidateCache(final String databaseName) throws IOException, TException { - final List allDataNodes = getNodeManager().getRegisteredDataNodes(); final TInvalidateCacheReq invalidateCacheReq = new TInvalidateCacheReq(); invalidateCacheReq.setStorageGroup(true); invalidateCacheReq.setFullPath(databaseName); - for (final TDataNodeConfiguration dataNodeConfiguration : allDataNodes) { - final int dataNodeId = dataNodeConfiguration.getLocation().getDataNodeId(); - - // If the node is not alive, retry for up to 10 times - NodeStatus nodeStatus = getLoadManager().getNodeStatus(dataNodeId); - int retryNum = 10; - if (nodeStatus == NodeStatus.Unknown) { - for (int i = 0; i < retryNum && nodeStatus == NodeStatus.Unknown; i++) { - try { - TimeUnit.MILLISECONDS.sleep(500); - } catch (final InterruptedException e) { - LOG.error("Sleep failed in ConfigNodeProcedureEnv: ", e); - Thread.currentThread().interrupt(); - break; - } - nodeStatus = getLoadManager().getNodeStatus(dataNodeId); - } - } + // Proceed once every unreachable DataNode is provably self-fenced (it fails closed on its + // caches + // and resyncs on recovery), instead of hard-failing whenever any DataNode is Unknown. This runs + // before the physical database-schema delete in the state machine, so the "delete only after + // PROCEED" ordering holds. (throws kept for source compatibility with callers.) + return new ClusterCachePropagator(configManager) + .propagate(targets -> invalidateDatabaseCacheOnce(targets, invalidateCacheReq)); + } - if (nodeStatus == NodeStatus.Unknown) { - LOG.warn( - "Invalidate cache failed, because DataNode {} is Unknown", - dataNodeConfiguration.getLocation().getInternalEndPoint()); - return false; + /** + * One broadcast round of the database cache invalidation over {@code targets}: synchronously + * invalidate partition then schema cache on each DataNode and report SUCCESS for a DataNode only + * if both succeeded. Unknown DataNodes are not contacted (a sync send would block on connect + * timeouts) and are reported as not-acked, so the {@link ClusterCachePropagator} can decide + * whether they are provably fenced. + */ + private Map invalidateDatabaseCacheOnce( + final Map targets, final TInvalidateCacheReq invalidateCacheReq) { + final Map result = new HashMap<>(); + for (final Map.Entry entry : targets.entrySet()) { + final int dataNodeId = entry.getKey(); + final TSStatus notAcked = new TSStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR.getStatusCode()); + if (getLoadManager().getNodeStatus(dataNodeId) == NodeStatus.Unknown) { + result.put(dataNodeId, notAcked); + continue; } - - // Always invalidate PartitionCache first - final TSStatus invalidatePartitionStatus = - (TSStatus) - SyncDataNodeClientPool.getInstance() - .sendSyncRequestToDataNodeWithRetry( - dataNodeConfiguration.getLocation().getInternalEndPoint(), - invalidateCacheReq, - CnToDnSyncRequestType.INVALIDATE_PARTITION_CACHE); - - final TSStatus invalidateSchemaStatus = - (TSStatus) - SyncDataNodeClientPool.getInstance() - .sendSyncRequestToDataNodeWithRetry( - dataNodeConfiguration.getLocation().getInternalEndPoint(), - invalidateCacheReq, - CnToDnSyncRequestType.INVALIDATE_SCHEMA_CACHE); - - if (!verifySucceed(invalidatePartitionStatus, invalidateSchemaStatus)) { - LOG.error( - "Invalidate cache failed, invalidate partition cache status is {}, invalidate schemaengine cache status is {}", - invalidatePartitionStatus, - invalidateSchemaStatus); - return false; + try { + // Always invalidate PartitionCache first. + final TSStatus invalidatePartitionStatus = + (TSStatus) + SyncDataNodeClientPool.getInstance() + .sendSyncRequestToDataNodeWithRetry( + entry.getValue().getInternalEndPoint(), + invalidateCacheReq, + CnToDnSyncRequestType.INVALIDATE_PARTITION_CACHE); + final TSStatus invalidateSchemaStatus = + (TSStatus) + SyncDataNodeClientPool.getInstance() + .sendSyncRequestToDataNodeWithRetry( + entry.getValue().getInternalEndPoint(), + invalidateCacheReq, + CnToDnSyncRequestType.INVALIDATE_SCHEMA_CACHE); + result.put( + dataNodeId, + verifySucceed(invalidatePartitionStatus, invalidateSchemaStatus) + ? invalidatePartitionStatus + : notAcked); + } catch (final Exception e) { + LOG.warn( + "Invalidate cache failed for DataNode {}", entry.getValue().getInternalEndPoint(), e); + result.put(dataNodeId, notAcked); } } - return true; + return result; } public boolean verifySucceed(TSStatus... status) { @@ -876,10 +876,6 @@ private ConsensusManager getConsensusManager() { return configManager.getConsensusManager(); } - private NodeManager getNodeManager() { - return configManager.getNodeManager(); - } - private ClusterSchemaManager getClusterSchemaManager() { return configManager.getClusterSchemaManager(); } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java index 5b505ec001bff..6d9fa18bab89d 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java @@ -40,6 +40,7 @@ import org.apache.iotdb.confignode.i18n.ConfigNodeMessages; import org.apache.iotdb.confignode.i18n.ProcedureMessages; import org.apache.iotdb.confignode.manager.ConfigManager; +import org.apache.iotdb.confignode.manager.lease.DataNodeContactTracker; import org.apache.iotdb.confignode.manager.load.balancer.region.GreedyCopySetRegionGroupAllocator; import org.apache.iotdb.confignode.manager.load.balancer.region.IRegionGroupAllocator; import org.apache.iotdb.confignode.manager.load.cache.node.NodeHeartbeatSample; @@ -455,6 +456,9 @@ public void removeDataNodePersistence(List removedDataNodes) PartitionMetrics.unbindDataNodePartitionMetricsWhenUpdate( MetricService.getInstance(), NodeUrlUtils.convertTEndPointUrl(dataNodeLocation.getClientRpcEndPoint())); + // Drop the removed DataNode's metadata-lease contact/capability state so it is not retained, + // and a future DataNode reusing the id cannot inherit stale fencing history. + DataNodeContactTracker.getInstance().removeDataNode(dataNodeLocation.getDataNodeId()); } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterLogicalViewProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterLogicalViewProcedure.java index 8c8d2019f4de8..b5783e908a944 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterLogicalViewProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterLogicalViewProcedure.java @@ -31,8 +31,6 @@ import org.apache.iotdb.commons.path.PathPatternTree; import org.apache.iotdb.commons.schema.view.viewExpression.ViewExpression; import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; -import org.apache.iotdb.confignode.client.async.CnToDnInternalServiceAsyncRequestManager; -import org.apache.iotdb.confignode.client.async.handlers.DataNodeAsyncRequestContext; import org.apache.iotdb.confignode.i18n.ProcedureMessages; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.confignode.procedure.exception.ProcedureException; @@ -42,7 +40,6 @@ import org.apache.iotdb.db.exception.BatchProcessException; import org.apache.iotdb.db.exception.metadata.view.ViewNotExistException; import org.apache.iotdb.mpp.rpc.thrift.TAlterViewReq; -import org.apache.iotdb.mpp.rpc.thrift.TInvalidateMatchedSchemaCacheReq; import org.apache.iotdb.rpc.TSStatusCode; import org.apache.tsfile.utils.ReadWriteIOUtils; @@ -123,27 +120,15 @@ protected Flow executeFromState( } private void invalidateCache(final ConfigNodeProcedureEnv env) { - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.INVALIDATE_MATCHED_SCHEMA_CACHE, - new TInvalidateMatchedSchemaCacheReq(patternTreeBytes), - dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - final Map statusMap = clientHandler.getResponseMap(); - for (final TSStatus status : statusMap.values()) { + if (!SchemaUtils.invalidateMatchedSchemaCache( + env.getConfigManager(), patternTreeBytes, false)) { // all dataNodes must clear the related schemaengine cache - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error( - ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_VIEW, - viewPathToSourceMap.keySet()); - setFailure( - new ProcedureException( - new MetadataException( - ProcedureMessages.INVALIDATE_VIEW_SCHEMAENGINE_CACHE_FAILED))); - return; - } + LOGGER.error( + ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_VIEW, + viewPathToSourceMap.keySet()); + setFailure( + new ProcedureException( + new MetadataException(ProcedureMessages.INVALIDATE_VIEW_SCHEMAENGINE_CACHE_FAILED))); } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterTimeSeriesDataTypeProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterTimeSeriesDataTypeProcedure.java index 26ea988f98e72..d223263e4935b 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterTimeSeriesDataTypeProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/AlterTimeSeriesDataTypeProcedure.java @@ -28,8 +28,6 @@ import org.apache.iotdb.commons.path.PathDeserializeUtil; import org.apache.iotdb.commons.path.PathPatternTree; import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; -import org.apache.iotdb.confignode.client.async.CnToDnInternalServiceAsyncRequestManager; -import org.apache.iotdb.confignode.client.async.handlers.DataNodeAsyncRequestContext; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeAlterTimeSeriesPlan; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeEnrichedPlan; import org.apache.iotdb.confignode.i18n.ProcedureMessages; @@ -42,7 +40,6 @@ import org.apache.iotdb.consensus.exception.ConsensusException; import org.apache.iotdb.db.exception.metadata.PathNotExistException; import org.apache.iotdb.mpp.rpc.thrift.TAlterTimeSeriesReq; -import org.apache.iotdb.mpp.rpc.thrift.TInvalidateMatchedSchemaCacheReq; import org.apache.iotdb.pipe.api.exception.PipeException; import org.apache.iotdb.rpc.RpcUtils; import org.apache.iotdb.rpc.TSStatusCode; @@ -249,26 +246,17 @@ public static void invalidateCache( final String requestMessage, final Consumer setFailure, final boolean needLock) { - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.INVALIDATE_MATCHED_SCHEMA_CACHE, - new TInvalidateMatchedSchemaCacheReq(measurementPathBytes).setNeedLock(needLock), - dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - final Map statusMap = clientHandler.getResponseMap(); - for (final TSStatus status : statusMap.values()) { + // Proceed past provably-fenced DataNodes instead of hard-failing on the first unreachable one + // (see SchemaUtils.invalidateMatchedSchemaCache). Runs before the physical datatype change, so + // the "alter only after PROCEED" ordering holds. + if (!SchemaUtils.invalidateMatchedSchemaCache( + env.getConfigManager(), measurementPathBytes, needLock)) { // All dataNodes must clear the related schemaEngine cache - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error( - ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_TIMESERIES, - requestMessage); - setFailure.accept( - new ProcedureException( - new MetadataException(ProcedureMessages.INVALIDATE_SCHEMAENGINE_CACHE_FAILED))); - return; - } + LOGGER.error( + ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_TIMESERIES, requestMessage); + setFailure.accept( + new ProcedureException( + new MetadataException(ProcedureMessages.INVALIDATE_SCHEMAENGINE_CACHE_FAILED))); } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeactivateTemplateProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeactivateTemplateProcedure.java index ab4913da04d81..b94f6d76642a2 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeactivateTemplateProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeactivateTemplateProcedure.java @@ -30,8 +30,6 @@ import org.apache.iotdb.commons.path.PathPatternTree; import org.apache.iotdb.commons.schema.template.Template; import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; -import org.apache.iotdb.confignode.client.async.CnToDnInternalServiceAsyncRequestManager; -import org.apache.iotdb.confignode.client.async.handlers.DataNodeAsyncRequestContext; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeDeactivateTemplatePlan; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeEnrichedPlan; import org.apache.iotdb.confignode.i18n.ProcedureMessages; @@ -44,7 +42,6 @@ import org.apache.iotdb.mpp.rpc.thrift.TConstructSchemaBlackListWithTemplateReq; import org.apache.iotdb.mpp.rpc.thrift.TDeactivateTemplateReq; import org.apache.iotdb.mpp.rpc.thrift.TDeleteDataForDeleteSchemaReq; -import org.apache.iotdb.mpp.rpc.thrift.TInvalidateMatchedSchemaCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TRollbackSchemaBlackListWithTemplateReq; import org.apache.iotdb.pipe.api.exception.PipeException; import org.apache.iotdb.rpc.TSStatusCode; @@ -181,29 +178,17 @@ protected List processResponseOfOneDataNode( private void invalidateCache(final ConfigNodeProcedureEnv env) { // if no target timeseries, return directly - if (!timeSeriesPatternTree.isEmpty()) { - Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.INVALIDATE_MATCHED_SCHEMA_CACHE, - new TInvalidateMatchedSchemaCacheReq(timeSeriesPatternTreeBytes), - dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance() - .sendAsyncRequestWithRetry(clientHandler); - Map statusMap = clientHandler.getResponseMap(); - for (TSStatus status : statusMap.values()) { - // all dataNodes must clear the related schema cache - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error( - ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMA_CACHE_OF_TEMPLATE_TIMESERIES, - requestMessage); - setFailure( - new ProcedureException( - new MetadataException(ProcedureMessages.INVALIDATE_SCHEMA_CACHE_FAILED))); - return; - } - } + if (!timeSeriesPatternTree.isEmpty() + && !SchemaUtils.invalidateMatchedSchemaCache( + env.getConfigManager(), timeSeriesPatternTreeBytes, false)) { + // all dataNodes must clear the related schema cache + LOGGER.error( + ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMA_CACHE_OF_TEMPLATE_TIMESERIES, + requestMessage); + setFailure( + new ProcedureException( + new MetadataException(ProcedureMessages.INVALIDATE_SCHEMA_CACHE_FAILED))); + return; } setNextState(DeactivateTemplateState.DELETE_DATA); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteLogicalViewProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteLogicalViewProcedure.java index 4f63e96840c20..b1996e8ee92e7 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteLogicalViewProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteLogicalViewProcedure.java @@ -27,8 +27,6 @@ import org.apache.iotdb.commons.path.PartialPath; import org.apache.iotdb.commons.path.PathPatternTree; import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; -import org.apache.iotdb.confignode.client.async.CnToDnInternalServiceAsyncRequestManager; -import org.apache.iotdb.confignode.client.async.handlers.DataNodeAsyncRequestContext; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeDeleteLogicalViewPlan; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeEnrichedPlan; import org.apache.iotdb.confignode.i18n.ProcedureMessages; @@ -41,7 +39,6 @@ import org.apache.iotdb.db.exception.metadata.view.ViewNotExistException; import org.apache.iotdb.mpp.rpc.thrift.TConstructViewSchemaBlackListReq; import org.apache.iotdb.mpp.rpc.thrift.TDeleteViewSchemaReq; -import org.apache.iotdb.mpp.rpc.thrift.TInvalidateMatchedSchemaCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TRollbackViewSchemaBlackListReq; import org.apache.iotdb.pipe.api.exception.PipeException; import org.apache.iotdb.rpc.TSStatusCode; @@ -167,26 +164,15 @@ protected List processResponseOfOneDataNode( } private void invalidateCache(final ConfigNodeProcedureEnv env) { - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.INVALIDATE_MATCHED_SCHEMA_CACHE, - new TInvalidateMatchedSchemaCacheReq(patternTreeBytes), - dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - final Map statusMap = clientHandler.getResponseMap(); - for (final TSStatus status : statusMap.values()) { + if (!SchemaUtils.invalidateMatchedSchemaCache( + env.getConfigManager(), patternTreeBytes, false)) { // all dataNodes must clear the related schemaengine cache - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error( - ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_VIEW, requestMessage); - setFailure( - new ProcedureException( - new MetadataException( - ProcedureMessages.INVALIDATE_VIEW_SCHEMAENGINE_CACHE_FAILED))); - return; - } + LOGGER.error( + ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_VIEW, requestMessage); + setFailure( + new ProcedureException( + new MetadataException(ProcedureMessages.INVALIDATE_VIEW_SCHEMAENGINE_CACHE_FAILED))); + return; } setNextState(DeleteLogicalViewState.DELETE_VIEW_SCHEMA); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteTimeSeriesProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteTimeSeriesProcedure.java index 0b5e45b5ca1f5..2d3d99e5fd94b 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteTimeSeriesProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/DeleteTimeSeriesProcedure.java @@ -27,8 +27,6 @@ import org.apache.iotdb.commons.path.PartialPath; import org.apache.iotdb.commons.path.PathPatternTree; import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; -import org.apache.iotdb.confignode.client.async.CnToDnInternalServiceAsyncRequestManager; -import org.apache.iotdb.confignode.client.async.handlers.DataNodeAsyncRequestContext; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeDeleteTimeSeriesPlan; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeEnrichedPlan; import org.apache.iotdb.confignode.i18n.ProcedureMessages; @@ -42,7 +40,6 @@ import org.apache.iotdb.mpp.rpc.thrift.TConstructSchemaBlackListReq; import org.apache.iotdb.mpp.rpc.thrift.TDeleteDataForDeleteSchemaReq; import org.apache.iotdb.mpp.rpc.thrift.TDeleteTimeSeriesReq; -import org.apache.iotdb.mpp.rpc.thrift.TInvalidateMatchedSchemaCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TRollbackSchemaBlackListReq; import org.apache.iotdb.pipe.api.exception.PipeException; import org.apache.iotdb.rpc.TSStatusCode; @@ -197,26 +194,18 @@ public static void invalidateCache( final String requestMessage, final Consumer setFailure, final boolean needLock) { - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.INVALIDATE_MATCHED_SCHEMA_CACHE, - new TInvalidateMatchedSchemaCacheReq(patternTreeBytes).setNeedLock(needLock), - dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - final Map statusMap = clientHandler.getResponseMap(); - for (final TSStatus status : statusMap.values()) { + // Proceed once every unreachable DataNode is provably self-fenced (it fails closed on its + // schema cache and resyncs on recovery, so it cannot serve the to-be-deleted/altered series), + // instead of hard-failing on the first unreachable DataNode. This runs before the physical + // delete in the state machine, so the "delete only after PROCEED" ordering holds. + if (!SchemaUtils.invalidateMatchedSchemaCache( + env.getConfigManager(), patternTreeBytes, needLock)) { // All dataNodes must clear the related schemaEngine cache - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error( - ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_TIMESERIES, - requestMessage); - setFailure.accept( - new ProcedureException( - new MetadataException(ProcedureMessages.INVALIDATE_SCHEMAENGINE_CACHE_FAILED))); - return; - } + LOGGER.error( + ProcedureMessages.FAILED_TO_INVALIDATE_SCHEMAENGINE_CACHE_OF_TIMESERIES, requestMessage); + setFailure.accept( + new ProcedureException( + new MetadataException(ProcedureMessages.INVALIDATE_SCHEMAENGINE_CACHE_FAILED))); } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java index 4b8d0a533afe3..4e7b43669a587 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SchemaUtils.java @@ -36,6 +36,7 @@ import org.apache.iotdb.confignode.consensus.request.ConfigPhysicalPlan; import org.apache.iotdb.confignode.i18n.ProcedureMessages; import org.apache.iotdb.confignode.manager.ConfigManager; +import org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.consensus.exception.ConsensusException; import org.apache.iotdb.db.exception.metadata.PathNotExistException; @@ -43,7 +44,9 @@ import org.apache.iotdb.mpp.rpc.thrift.TCheckSchemaRegionUsingTemplateResp; import org.apache.iotdb.mpp.rpc.thrift.TCountPathsUsingTemplateReq; import org.apache.iotdb.mpp.rpc.thrift.TCountPathsUsingTemplateResp; +import org.apache.iotdb.mpp.rpc.thrift.TInvalidateMatchedSchemaCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TUpdateTableReq; +import org.apache.iotdb.mpp.rpc.thrift.TUpdateTemplateReq; import org.apache.iotdb.rpc.RpcUtils; import org.apache.iotdb.rpc.TSStatusCode; @@ -60,6 +63,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.function.Supplier; import java.util.stream.Collectors; public class SchemaUtils { @@ -240,27 +244,47 @@ protected void onAllReplicasetFailure( } } - public static Map preReleaseTable( - final String database, - final TsTable table, - final ConfigManager configManager, - final String oldName) { + /** Build the PRE_UPDATE_TABLE request used to pre-release a table change to DataNodes. */ + public static TUpdateTableReq preUpdateTableReq( + final String database, final TsTable table, final String oldName) { final TUpdateTableReq req = new TUpdateTableReq(); req.setType(TsTableInternalRPCType.PRE_UPDATE_TABLE.getOperationType()); req.setTableInfo(TsTableInternalRPCUtil.serializeSingleTsTableWithDatabase(database, table)); req.setOldName(oldName); + return req; + } - final Map dataNodeLocationMap = - configManager.getNodeManager().getRegisteredDataNodeLocations(); + /** + * Broadcast a table update to exactly {@code targets} and return the full per-nodeId response map + * (both successes and failures). Used by {@link + * org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator}, which needs to know which + * DataNodes acknowledged in order to decide whether it is safe to proceed past the rest. + */ + public static Map broadcastTableUpdate( + final TUpdateTableReq req, final Map targets) { final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.UPDATE_TABLE, req, dataNodeLocationMap); + new DataNodeAsyncRequestContext<>(CnToDnAsyncRequestType.UPDATE_TABLE, req, targets); CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - return clientHandler.getResponseMap().entrySet().stream() + return clientHandler.getResponseMap(); + } + + private static Map failedOnly(final Map responses) { + return responses.entrySet().stream() .filter(entry -> entry.getValue().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); } + public static Map preReleaseTable( + final String database, + final TsTable table, + final ConfigManager configManager, + final String oldName) { + return failedOnly( + broadcastTableUpdate( + preUpdateTableReq(database, table, oldName), + configManager.getNodeManager().getRegisteredDataNodeLocations())); + } + public static Map commitReleaseTable( final String database, final String tableName, @@ -289,11 +313,9 @@ public static Map commitReleaseTable( .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); } - public static Map rollbackPreRelease( - final String database, - final String tableName, - final ConfigManager configManager, - final @Nullable String oldName) { + /** Build the ROLLBACK_UPDATE_TABLE request used to roll back a pre-released table change. */ + public static TUpdateTableReq rollbackUpdateTableReq( + final String database, final String tableName, final String oldName) { final TUpdateTableReq req = new TUpdateTableReq(); req.setType(TsTableInternalRPCType.ROLLBACK_UPDATE_TABLE.getOperationType()); final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); @@ -305,16 +327,73 @@ public static Map rollbackPreRelease( } req.setTableInfo(outputStream.toByteArray()); req.setOldName(oldName); + return req; + } - final Map dataNodeLocationMap = - configManager.getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.UPDATE_TABLE, req, dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - return clientHandler.getResponseMap().entrySet().stream() - .filter(entry -> entry.getValue().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) - .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + public static Map rollbackPreRelease( + final String database, + final String tableName, + final ConfigManager configManager, + final @Nullable String oldName) { + return failedOnly( + broadcastTableUpdate( + rollbackUpdateTableReq(database, tableName, oldName), + configManager.getNodeManager().getRegisteredDataNodeLocations())); + } + + /** + * Broadcast an INVALIDATE_MATCHED_SCHEMA_CACHE to all DataNodes through {@link + * ClusterCachePropagator}: proceed once every unreachable DataNode is provably self-fenced (it + * fails closed on its schema cache and resyncs on recovery, so it cannot serve the + * deleted/altered series), instead of hard-failing on the first unreachable DataNode. Returns + * whether it is safe to proceed; the caller maps {@code false} to its own failure. + * + *

The propagator may re-broadcast while waiting for unacked DataNodes, so a fresh request with + * a duplicated buffer is built on each attempt — a consumed buffer can never be re-sent as an + * empty (and silently-successful) invalidation. + */ + public static boolean invalidateMatchedSchemaCache( + final ConfigManager configManager, + final ByteBuffer patternTreeBytes, + final boolean needLock) { + return new ClusterCachePropagator(configManager) + .propagate( + targets -> { + final DataNodeAsyncRequestContext + clientHandler = + new DataNodeAsyncRequestContext<>( + CnToDnAsyncRequestType.INVALIDATE_MATCHED_SCHEMA_CACHE, + new TInvalidateMatchedSchemaCacheReq(patternTreeBytes.duplicate()) + .setNeedLock(needLock), + targets); + CnToDnInternalServiceAsyncRequestManager.getInstance() + .sendAsyncRequestWithRetry(clientHandler); + return clientHandler.getResponseMap(); + }); + } + + /** + * Broadcast an UPDATE_TEMPLATE to all DataNodes through {@link ClusterCachePropagator}: proceed + * once every unreachable DataNode is provably self-fenced (it fails closed on its template cache + * and resyncs on recovery), instead of hard-failing on the first unreachable DataNode. Returns + * whether it is safe to proceed. + * + *

The request is rebuilt from {@code requestSupplier} on every attempt: the propagator may + * re-broadcast while waiting, and {@code TUpdateTemplateReq}'s binary field is backed by a {@link + * ByteBuffer}, so reusing one request could re-send a consumed (empty) payload. + */ + public static boolean broadcastTemplateUpdate( + final ConfigManager configManager, final Supplier requestSupplier) { + return new ClusterCachePropagator(configManager) + .propagate( + targets -> { + final DataNodeAsyncRequestContext clientHandler = + new DataNodeAsyncRequestContext<>( + CnToDnAsyncRequestType.UPDATE_TEMPLATE, requestSupplier.get(), targets); + CnToDnInternalServiceAsyncRequestManager.getInstance() + .sendAsyncRequestWithRetry(clientHandler); + return clientHandler.getResponseMap(); + }); } public static TSStatus executeInConsensusLayer( diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedure.java index dca79a02366f6..31641528269af 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedure.java @@ -34,6 +34,7 @@ import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeEnrichedPlan; import org.apache.iotdb.confignode.i18n.ConfigNodeMessages; import org.apache.iotdb.confignode.i18n.ProcedureMessages; +import org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.confignode.procedure.exception.ProcedureException; import org.apache.iotdb.confignode.procedure.impl.StateMachineProcedure; @@ -115,13 +116,8 @@ void setConfigNodeTTL(final ConfigNodeProcedureEnv env) { } void updateDataNodeTTL(final ConfigNodeProcedureEnv env) { - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - sendTTLRequest( - dataNodeLocationMap, - buildSetTTLReq(plan.getPathPattern(), plan.getTTL(), plan.isDataBase())); - if (hasFailedDataNode(clientHandler)) { + if (!broadcastTTLAndDecide( + env, buildSetTTLReq(plan.getPathPattern(), plan.getTTL(), plan.isDataBase()))) { LOGGER.error(ProcedureMessages.FAILED_TO_UPDATE_TTL_CACHE_OF_DATANODE); setFailure( new ProcedureException( @@ -129,6 +125,17 @@ void updateDataNodeTTL(final ConfigNodeProcedureEnv env) { } } + /** + * Broadcast the TTL update to all DataNodes and decide whether it is safe to proceed: proceed + * once every unreachable DataNode is provably self-fenced (it fails closed on TTL in compaction + * and resyncs on recovery) instead of hard-failing on the first unreachable DataNode. + * Package-private and overridable for tests. + */ + boolean broadcastTTLAndDecide(final ConfigNodeProcedureEnv env, final TSetTTLReq req) { + return new ClusterCachePropagator(env.getConfigManager()) + .propagate(targets -> sendTTLRequest(targets, req).getResponseMap()); + } + private void capturePreviousTTLState(final ConfigNodeProcedureEnv env) { if (previousTTLStateCaptured) { return; @@ -168,19 +175,6 @@ private TSetTTLReq buildSetTTLReq( Collections.singletonList(String.join(".", pathPattern)), ttl, isDataBase); } - private boolean hasFailedDataNode( - final DataNodeAsyncRequestContext clientHandler) { - if (!clientHandler.getRequestIndices().isEmpty()) { - return true; - } - for (TSStatus status : clientHandler.getResponseMap().values()) { - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - return true; - } - } - return false; - } - private long getTTLOrDefault(final ConfigNodeProcedureEnv env, final String[] pathPattern) { final long ttl = env.getConfigManager().getTTLManager().getTTL(pathPattern); return ttl == TTLCache.NULL_TTL ? TTL_NOT_EXIST : ttl; @@ -220,30 +214,20 @@ private void restoreTTLOnConfigNode( } private void rollbackDataNodeTTL(final ConfigNodeProcedureEnv env) throws ProcedureException { - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - restoreTTLOnDataNodes(dataNodeLocationMap, plan.getPathPattern(), previousTTL); + restoreTTLOnDataNodes(env, plan.getPathPattern(), previousTTL); if (plan.isDataBase()) { restoreTTLOnDataNodes( - dataNodeLocationMap, - getDatabaseWildcardPathPattern(plan.getPathPattern()), - previousDatabaseWildcardTTL); + env, getDatabaseWildcardPathPattern(plan.getPathPattern()), previousDatabaseWildcardTTL); } } private void restoreTTLOnDataNodes( - final Map dataNodeLocationMap, - final String[] pathPattern, - final long ttl) + final ConfigNodeProcedureEnv env, final String[] pathPattern, final long ttl) throws ProcedureException { - if (dataNodeLocationMap.isEmpty()) { - return; - } - final DataNodeAsyncRequestContext clientHandler = - sendTTLRequest( - dataNodeLocationMap, - buildSetTTLReq(pathPattern, ttl == TTL_NOT_EXIST ? TTLCache.NULL_TTL : ttl, false)); - if (hasFailedDataNode(clientHandler)) { + // Same proceed-past-fenced semantics as the forward update: a down DataNode must not block + // rollback (it resyncs TTL on recovery); only a live unacked DataNode fails it. + if (!broadcastTTLAndDecide( + env, buildSetTTLReq(pathPattern, ttl == TTL_NOT_EXIST ? TTLCache.NULL_TTL : ttl, false))) { throw new ProcedureException( new MetadataException( "Rollback dataNode ttl cache failed for " + String.join(".", pathPattern))); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTemplateProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTemplateProcedure.java index 55fffedad6145..8f24fe92eefe7 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTemplateProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTemplateProcedure.java @@ -215,30 +215,25 @@ private void preReleaseTemplate(final ConfigNodeProcedureEnv env) { return; } - final TUpdateTemplateReq req = new TUpdateTemplateReq(); - req.setType(TemplateInternalRPCUpdateType.ADD_TEMPLATE_PRE_SET_INFO.toByte()); - req.setTemplateInfo( - TemplateInternalRPCUtil.generateAddTemplateSetInfoBytes(template, templateSetPath)); - - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.UPDATE_TEMPLATE, req, dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - final Map statusMap = clientHandler.getResponseMap(); - for (final Map.Entry entry : statusMap.entrySet()) { - if (entry.getValue().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.warn( - ProcedureMessages.FAILED_TO_SYNC_TEMPLATE_PRE_SET_INFO_ON_PATH_TO, - templateName, - templateSetPath, - dataNodeLocationMap.get(entry.getKey())); - setFailure( - new ProcedureException( - new MetadataException(ProcedureMessages.PRE_SET_TEMPLATE_FAILED))); - return; - } + // Proceed once every unreachable DataNode is provably self-fenced (it fails closed on its + // template cache and resyncs on recovery) instead of hard-failing on the first unreachable one. + if (!SchemaUtils.broadcastTemplateUpdate( + env.getConfigManager(), + () -> { + final TUpdateTemplateReq req = new TUpdateTemplateReq(); + req.setType(TemplateInternalRPCUpdateType.ADD_TEMPLATE_PRE_SET_INFO.toByte()); + req.setTemplateInfo( + TemplateInternalRPCUtil.generateAddTemplateSetInfoBytes(template, templateSetPath)); + return req; + })) { + LOGGER.warn( + ProcedureMessages.FAILED_TO_SYNC_TEMPLATE_PRE_SET_INFO_ON_PATH_TO, + templateName, + templateSetPath, + "an unreachable DataNode is not provably fenced"); + setFailure( + new ProcedureException(new MetadataException(ProcedureMessages.PRE_SET_TEMPLATE_FAILED))); + return; } setNextState(SetTemplateState.VALIDATE_TIMESERIES_EXISTENCE); } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/UnsetTemplateProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/UnsetTemplateProcedure.java index 1fd7aefb33065..d7bb9d0894660 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/UnsetTemplateProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/UnsetTemplateProcedure.java @@ -155,29 +155,24 @@ private void invalidateCache(final ConfigNodeProcedureEnv env) { } private void executeInvalidateCache(final ConfigNodeProcedureEnv env) throws ProcedureException { - final Map dataNodeLocationMap = - env.getConfigManager().getNodeManager().getRegisteredDataNodeLocations(); - final TUpdateTemplateReq invalidateTemplateSetInfoReq = new TUpdateTemplateReq(); - invalidateTemplateSetInfoReq.setType( - TemplateInternalRPCUpdateType.INVALIDATE_TEMPLATE_SET_INFO.toByte()); - invalidateTemplateSetInfoReq.setTemplateInfo(getInvalidateTemplateSetInfo()); - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.UPDATE_TEMPLATE, - invalidateTemplateSetInfoReq, - dataNodeLocationMap); - CnToDnInternalServiceAsyncRequestManager.getInstance().sendAsyncRequestWithRetry(clientHandler); - final Map statusMap = clientHandler.getResponseMap(); - for (final TSStatus status : statusMap.values()) { + // Proceed once every unreachable DataNode is provably self-fenced (it fails closed on its + // template cache and resyncs on recovery) instead of hard-failing on the first unreachable one. + if (!SchemaUtils.broadcastTemplateUpdate( + env.getConfigManager(), + () -> { + final TUpdateTemplateReq invalidateTemplateSetInfoReq = new TUpdateTemplateReq(); + invalidateTemplateSetInfoReq.setType( + TemplateInternalRPCUpdateType.INVALIDATE_TEMPLATE_SET_INFO.toByte()); + invalidateTemplateSetInfoReq.setTemplateInfo(getInvalidateTemplateSetInfo()); + return invalidateTemplateSetInfoReq; + })) { // all dataNodes must clear the related template cache - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error( - ProcedureMessages.FAILED_TO_INVALIDATE_TEMPLATE_CACHE_OF_TEMPLATE_SET_ON, - template.getName(), - path); - throw new ProcedureException( - new MetadataException(ProcedureMessages.INVALIDATE_TEMPLATE_CACHE_FAILED)); - } + LOGGER.error( + ProcedureMessages.FAILED_TO_INVALIDATE_TEMPLATE_CACHE_OF_TEMPLATE_SET_ON, + template.getName(), + path); + throw new ProcedureException( + new MetadataException(ProcedureMessages.INVALIDATE_TEMPLATE_CACHE_FAILED)); } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/AbstractAlterOrDropTableProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/AbstractAlterOrDropTableProcedure.java index 7cf1ff1c24f83..f1cf52e74caa1 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/AbstractAlterOrDropTableProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/AbstractAlterOrDropTableProcedure.java @@ -27,11 +27,13 @@ import org.apache.iotdb.commons.schema.table.TsTable; import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; import org.apache.iotdb.confignode.i18n.ProcedureMessages; +import org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.confignode.procedure.exception.ProcedureException; import org.apache.iotdb.confignode.procedure.impl.StateMachineProcedure; import org.apache.iotdb.confignode.procedure.impl.schema.DataNodeTSStatusTaskExecutor; import org.apache.iotdb.confignode.procedure.impl.schema.SchemaUtils; +import org.apache.iotdb.mpp.rpc.thrift.TUpdateTableReq; import org.apache.tsfile.utils.ReadWriteIOUtils; import org.slf4j.Logger; @@ -91,17 +93,22 @@ protected void preRelease(final ConfigNodeProcedureEnv env) { } protected void preRelease(final ConfigNodeProcedureEnv env, final @Nullable String oldName) { - final Map failedResults = - SchemaUtils.preReleaseTable(database, table, env.getConfigManager(), oldName); - - if (!failedResults.isEmpty()) { - // All dataNodes must clear the related schema cache + // Proceed once every unreachable DataNode is provably self-fenced instead of hard-failing the + // DDL: a fenced DataNode fails closed on its now-stale table cache and resyncs on lease + // recovery, so it cannot serve dirty schema. Only fail if an unacked DataNode is not provably + // fenced (it may still be serving clients). + final TUpdateTableReq req = SchemaUtils.preUpdateTableReq(database, table, oldName); + final boolean proceeded = + new ClusterCachePropagator(env.getConfigManager()) + .propagate(targets -> SchemaUtils.broadcastTableUpdate(req, targets)); + + if (!proceeded) { LOGGER.warn( ProcedureMessages.FAILED_TO_PRE_RELEASE_FOR_TABLE_TO_DATANODE_FAILURE_RESULTS, getActionMessage(), database, table.getTableName(), - failedResults); + "an unreachable DataNode is not provably fenced"); setFailure( new ProcedureException( new MetadataException( @@ -138,18 +145,21 @@ protected void rollbackPreRelease(final ConfigNodeProcedureEnv env) { protected void rollbackPreRelease( final ConfigNodeProcedureEnv env, final @Nullable String tableName) { - final Map failedResults = - SchemaUtils.rollbackPreRelease( - database, table.getTableName(), env.getConfigManager(), tableName); - - if (!failedResults.isEmpty()) { - // All dataNodes must clear the related schema cache + // A down DataNode must not block rollback either: proceed past provably-fenced DataNodes (which + // resync on recovery) and only fail on an unacked DataNode that is not provably fenced. + final TUpdateTableReq req = + SchemaUtils.rollbackUpdateTableReq(database, table.getTableName(), tableName); + final boolean proceeded = + new ClusterCachePropagator(env.getConfigManager()) + .propagate(targets -> SchemaUtils.broadcastTableUpdate(req, targets)); + + if (!proceeded) { LOGGER.warn( ProcedureMessages.FAILED_TO_ROLLBACK_PRE_RELEASE_FOR_TABLE_INFO_TO_DATANODE, getActionMessage(), database, table.getTableName(), - failedResults); + "an unreachable DataNode is not provably fenced"); setFailure( new ProcedureException( new MetadataException( diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/CreateTableProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/CreateTableProcedure.java index 05e0facb3018e..b4350fbcc5040 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/CreateTableProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/schema/table/CreateTableProcedure.java @@ -29,6 +29,7 @@ import org.apache.iotdb.confignode.consensus.request.write.table.RollbackCreateTablePlan; import org.apache.iotdb.confignode.exception.DatabaseNotExistsException; import org.apache.iotdb.confignode.i18n.ProcedureMessages; +import org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.confignode.procedure.exception.ProcedureException; import org.apache.iotdb.confignode.procedure.impl.StateMachineProcedure; @@ -36,6 +37,7 @@ import org.apache.iotdb.confignode.procedure.state.schema.CreateTableState; import org.apache.iotdb.confignode.procedure.store.ProcedureType; import org.apache.iotdb.confignode.rpc.thrift.TDatabaseSchema; +import org.apache.iotdb.mpp.rpc.thrift.TUpdateTableReq; import org.apache.iotdb.rpc.TSStatusCode; import org.apache.tsfile.utils.ReadWriteIOUtils; @@ -151,16 +153,22 @@ protected void preCreateTable(final ConfigNodeProcedureEnv env) { } private void preReleaseTable(final ConfigNodeProcedureEnv env) { - final Map failedResults = - SchemaUtils.preReleaseTable(database, table, env.getConfigManager(), null); - - if (!failedResults.isEmpty()) { - // All dataNodes must clear the related schema cache + // Broadcast the pre-update to all DataNodes. Instead of failing whenever any DataNode is + // unreachable, proceed once every unacked DataNode is provably self-fenced: such a DataNode + // fails closed on its (now-stale) table cache and resyncs on lease recovery, so it cannot serve + // dirty schema. Only fail if an unacked DataNode is not provably fenced (it may still be + // serving clients). + final TUpdateTableReq req = SchemaUtils.preUpdateTableReq(database, table, null); + final boolean proceeded = + new ClusterCachePropagator(env.getConfigManager()) + .propagate(targets -> SchemaUtils.broadcastTableUpdate(req, targets)); + + if (!proceeded) { LOGGER.warn( ProcedureMessages.FAILED_TO_SYNC_TABLE_PRE_CREATE_INFO_TO_DATANODE_FAILURE, database, table.getTableName(), - failedResults); + "an unreachable DataNode is not provably fenced"); setFailure( new ProcedureException(new MetadataException(ProcedureMessages.PRE_CREATE_TABLE_FAILED))); return; diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/sync/AuthOperationProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/sync/AuthOperationProcedure.java index 9011267525361..8eca6e9ec44ab 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/sync/AuthOperationProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/sync/AuthOperationProcedure.java @@ -20,7 +20,9 @@ package org.apache.iotdb.confignode.procedure.impl.sync; import org.apache.iotdb.common.rpc.thrift.TDataNodeConfiguration; +import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; import org.apache.iotdb.common.rpc.thrift.TSStatus; +import org.apache.iotdb.commons.cluster.NodeStatus; import org.apache.iotdb.commons.conf.CommonConfig; import org.apache.iotdb.commons.conf.CommonDescriptor; import org.apache.iotdb.commons.exception.IoTDBException; @@ -31,6 +33,7 @@ import org.apache.iotdb.confignode.consensus.request.write.auth.AuthorPlan; import org.apache.iotdb.confignode.consensus.request.write.pipe.payload.PipeEnrichedPlan; import org.apache.iotdb.confignode.i18n.ProcedureMessages; +import org.apache.iotdb.confignode.manager.lease.ClusterCachePropagator; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.confignode.procedure.exception.ProcedureException; import org.apache.iotdb.confignode.procedure.impl.node.AbstractNodeProcedure; @@ -49,8 +52,9 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; -import java.util.Iterator; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Objects; import static org.apache.iotdb.confignode.procedure.state.auth.AuthOperationProcedureState.DATANODE_AUTHCACHE_INVALIDING; @@ -96,34 +100,25 @@ protected Flow executeFromState(ConfigNodeProcedureEnv env, AuthOperationProcedu writePlan(env); return Flow.HAS_MORE_STATE; case DATANODE_AUTHCACHE_INVALIDING: - TInvalidatePermissionCacheReq req = new TInvalidatePermissionCacheReq(); - TSStatus status; + final TInvalidatePermissionCacheReq req = new TInvalidatePermissionCacheReq(); req.setUsername(user); req.setRoleName(role); - Iterator> it = dataNodesToInvalid.iterator(); - while (it.hasNext()) { - Pair pair = it.next(); - if (pair.getRight() + this.timeoutMS < System.currentTimeMillis()) { - it.remove(); - continue; - } - status = - (TSStatus) - SyncDataNodeClientPool.getInstance() - .sendSyncRequestToDataNodeWithRetry( - pair.getLeft().getLocation().getInternalEndPoint(), - req, - CnToDnSyncRequestType.INVALIDATE_PERMISSION_CACHE); - if (status.getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - it.remove(); - } - } - if (dataNodesToInvalid.isEmpty()) { + // Proceed once every unreachable DataNode is provably self-fenced (it fails closed on + // auth + // and resyncs on recovery), instead of silently dropping a DataNode after a timeout, + // which + // left a live but un-acked DataNode serving the just-revoked permission. Fail only when a + // live DataNode stays un-acked. (dataNodesToInvalid is retained for serialization + // compatibility but no longer drives this step.) + if (new ClusterCachePropagator(env.getConfigManager()) + .propagate(targets -> invalidatePermissionCacheOnce(env, targets, req))) { LOGGER.info(ProcedureMessages.AUTH_PROCEDURE_CLEAN_DATANODE_CACHE_SUCCESSFULLY); return Flow.NO_MORE_STATE; - } else { - setNextState(AuthOperationProcedureState.DATANODE_AUTHCACHE_INVALIDING); } + setFailure( + new ProcedureException( + String.format( + ProcedureMessages.FAIL_TO_EXECUTE_PLAN_AT_STATE, plan.toString(), state))); break; } } catch (Exception e) { @@ -171,6 +166,44 @@ private void writePlan(ConfigNodeProcedureEnv env) { } } + /** + * One broadcast round of the permission-cache invalidation over {@code targets}: synchronously + * invalidate each reachable DataNode and report its status. Unknown DataNodes are not contacted + * (a sync send would block on connect timeouts) and are reported as not-acked, so the {@link + * ClusterCachePropagator} can decide whether they are provably fenced. + */ + private Map invalidatePermissionCacheOnce( + final ConfigNodeProcedureEnv env, + final Map targets, + final TInvalidatePermissionCacheReq req) { + final Map result = new HashMap<>(); + for (final Map.Entry entry : targets.entrySet()) { + final int dataNodeId = entry.getKey(); + final TSStatus notAcked = new TSStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR.getStatusCode()); + if (env.getConfigManager().getLoadManager().getNodeStatus(dataNodeId) == NodeStatus.Unknown) { + result.put(dataNodeId, notAcked); + continue; + } + try { + result.put( + dataNodeId, + (TSStatus) + SyncDataNodeClientPool.getInstance() + .sendSyncRequestToDataNodeWithRetry( + entry.getValue().getInternalEndPoint(), + req, + CnToDnSyncRequestType.INVALIDATE_PERMISSION_CACHE)); + } catch (final Exception e) { + LOGGER.warn( + "Invalidate permission cache failed for DataNode {}", + entry.getValue().getInternalEndPoint(), + e); + result.put(dataNodeId, notAcked); + } + } + return result; + } + @Override protected boolean isRollbackSupported(AuthOperationProcedureState state) { return state == AuthOperationProcedureState.INIT; diff --git a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/ClusterCachePropagatorTest.java b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/ClusterCachePropagatorTest.java new file mode 100644 index 0000000000000..18910607a2fa9 --- /dev/null +++ b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/ClusterCachePropagatorTest.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.manager.lease; + +import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; +import org.apache.iotdb.common.rpc.thrift.TSStatus; +import org.apache.iotdb.confignode.manager.lease.MetadataBroadcastVerdict.Verdict; +import org.apache.iotdb.rpc.TSStatusCode; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.function.IntPredicate; +import java.util.function.IntToLongFunction; + +public class ClusterCachePropagatorTest { + + private static final long T_PROCEED_MS = 25_000L; + + private static TSStatus success() { + return new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode()); + } + + private static TSStatus error() { + return new TSStatus(TSStatusCode.INTERNAL_SERVER_ERROR.getStatusCode()); + } + + private static Map twoDataNodes() { + final Map map = new HashMap<>(); + map.put(1, new TDataNodeLocation().setDataNodeId(1)); + map.put(2, new TDataNodeLocation().setDataNodeId(2)); + return map; + } + + /** Build a propagator whose loop seams are inert (only propagateOnce is exercised). */ + private static ClusterCachePropagator propagator( + final IntPredicate supportsFencing, final IntToLongFunction hbAgeMs) { + return new ClusterCachePropagator( + ClusterCachePropagatorTest::twoDataNodes, + supportsFencing, + hbAgeMs, + () -> T_PROCEED_MS, + () -> 0L, + ms -> {}); + } + + @Test + public void allAckedProceeds() { + final ClusterCachePropagator p = propagator(id -> true, id -> 0L); + final Verdict v = + p.propagateOnce( + targets -> { + final Map r = new HashMap<>(); + r.put(1, success()); + r.put(2, success()); + return r; + }, + false); + Assert.assertEquals(Verdict.PROCEED, v); + } + + @Test + public void unreachableButProvablyFencedProceeds() { + // DN2 did not respond, supports fencing, and has been silent past T_proceed -> provably fenced. + final ClusterCachePropagator p = propagator(id -> true, id -> id == 2 ? T_PROCEED_MS + 1 : 0L); + final Verdict v = p.propagateOnce(targets -> ackOnly(1), false); + Assert.assertEquals(Verdict.PROCEED, v); + } + + @Test + public void unreachableNotYetFencedWaits() { + // DN2 silent but not yet past T_proceed -> cannot assume fenced -> WAIT (budget not exhausted). + final ClusterCachePropagator p = propagator(id -> true, id -> id == 2 ? 10_000L : 0L); + Assert.assertEquals(Verdict.WAIT, p.propagateOnce(targets -> ackOnly(1), false)); + } + + @Test + public void unreachableNotYetFencedFailsWhenBudgetExhausted() { + final ClusterCachePropagator p = propagator(id -> true, id -> id == 2 ? 10_000L : 0L); + Assert.assertEquals(Verdict.FAIL, p.propagateOnce(targets -> ackOnly(1), true)); + } + + @Test + public void incapableDataNodeNeverFencedWaits() { + // DN2 does not support fencing: even silent "forever" it can never be assumed fenced + // (rolling-upgrade safety) -> strict semantics -> WAIT. + final ClusterCachePropagator p = propagator(id -> id != 2, id -> id == 2 ? 999_999L : 0L); + Assert.assertEquals(Verdict.WAIT, p.propagateOnce(targets -> ackOnly(1), false)); + } + + @Test + public void nonSuccessResponseIsNotAck() { + // DN2 responded but with a non-SUCCESS status: it did NOT apply the invalidation, and being + // reachable its hbAge is small, so it is UNSAFE (must retry), not silently accepted. + final ClusterCachePropagator p = propagator(id -> true, id -> id == 2 ? 1_000L : 0L); + final Verdict v = + p.propagateOnce( + targets -> { + final Map r = new HashMap<>(); + r.put(1, success()); + r.put(2, error()); + return r; + }, + false); + Assert.assertEquals(Verdict.WAIT, v); + } + + @Test + public void loopReturnsTrueWhenItEventuallyProceeds() { + final AtomicInteger calls = new AtomicInteger(); + final AtomicLong nanos = new AtomicLong(); + final ClusterCachePropagator p = + new ClusterCachePropagator( + ClusterCachePropagatorTest::twoDataNodes, + id -> true, + id -> id == 2 ? 10_000L : 0L, // DN2 not fenced, so round 1 must WAIT + () -> T_PROCEED_MS, + nanos::get, + ms -> nanos.addAndGet(ms * 1_000_000L)); + // Round 1: DN2 unreachable -> WAIT. Round 2: DN2 acks -> PROCEED. + final boolean proceeded = + p.propagate(targets -> calls.incrementAndGet() == 1 ? ackOnly(1) : ackBoth()); + Assert.assertTrue(proceeded); + Assert.assertEquals(2, calls.get()); + } + + @Test + public void loopReturnsFalseWhenBudgetExhausted() { + final AtomicLong nanos = new AtomicLong(); + final ClusterCachePropagator p = + new ClusterCachePropagator( + ClusterCachePropagatorTest::twoDataNodes, + id -> true, + id -> id == 2 ? 10_000L : 0L, // DN2 never fenced (alive but not acking) -> WAIT forever + () -> T_PROCEED_MS, + nanos::get, + ms -> nanos.addAndGet(ms * 1_000_000L)); + // DN2 keeps failing to ack; the fake clock advances on each sleep until the wait budget runs + // out, at which point the loop must give up with FAIL. + Assert.assertFalse(p.propagate(targets -> ackOnly(1))); + } + + private static Map ackOnly(final int nodeId) { + final Map r = new HashMap<>(); + r.put(nodeId, success()); + return r; + } + + private static Map ackBoth() { + final Map r = new HashMap<>(); + r.put(1, success()); + r.put(2, success()); + return r; + } +} diff --git a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTrackerTest.java b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTrackerTest.java new file mode 100644 index 0000000000000..207b257ac9f2c --- /dev/null +++ b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/DataNodeContactTrackerTest.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.manager.lease; + +import org.junit.Test; + +import java.util.Arrays; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +public class DataNodeContactTrackerTest { + + private static final int DN = 3; + + @Test + public void reportsMillisSinceLastSuccessfulResponse() { + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final DataNodeContactTracker tracker = new DataNodeContactTracker(nowNanos::get); + tracker.recordSuccessfulResponse(DN); + nowNanos.addAndGet(TimeUnit.MILLISECONDS.toNanos(1234)); + assertEquals(1234L, tracker.getMillisSinceLastSuccessfulResponse(DN)); + } + + @Test + public void ageKeepsGrowingWithoutSuccessfulResponse() { + // Failures must NOT refresh the contact time. This is enforced structurally: only + // recordSuccessfulResponse updates it, so with no further success the age keeps growing. + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final DataNodeContactTracker tracker = new DataNodeContactTracker(nowNanos::get); + tracker.recordSuccessfulResponse(DN); + nowNanos.addAndGet(TimeUnit.SECONDS.toNanos(30)); + assertEquals(30_000L, tracker.getMillisSinceLastSuccessfulResponse(DN)); + } + + @Test + public void leadershipAcquisitionResetsContactToNow() { + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final DataNodeContactTracker tracker = new DataNodeContactTracker(nowNanos::get); + tracker.recordSuccessfulResponse(DN); + nowNanos.addAndGet(TimeUnit.SECONDS.toNanos(30)); // would otherwise look stale + tracker.onLeadershipAcquired(Arrays.asList(DN, 4)); + assertEquals(0L, tracker.getMillisSinceLastSuccessfulResponse(DN)); + assertEquals(0L, tracker.getMillisSinceLastSuccessfulResponse(4)); + } + + @Test + public void neverContactedReadsAsZeroSoVerdictTreatsAsRecent() { + // Conservative: an unknown DataNode must NOT look fenced (else the verdict would wrongly + // proceed past it), so its age reads as 0 until a real success/expiry is observed. + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final DataNodeContactTracker tracker = new DataNodeContactTracker(nowNanos::get); + assertEquals(0L, tracker.getMillisSinceLastSuccessfulResponse(999)); + } + + @Test + public void unknownDataNodeIsNotFencingCapable() { + final DataNodeContactTracker tracker = new DataNodeContactTracker(new AtomicLong()::get); + assertFalse(tracker.supportsFencing(DN)); + } + + @Test + public void recordsAndUpdatesFencingCapability() { + final DataNodeContactTracker tracker = new DataNodeContactTracker(new AtomicLong()::get); + tracker.recordCapability(DN, true); + assertTrue(tracker.supportsFencing(DN)); + tracker.recordCapability(DN, false); + assertFalse(tracker.supportsFencing(DN)); + } + + @Test + public void removeDataNodeClearsCapability() { + final DataNodeContactTracker tracker = new DataNodeContactTracker(new AtomicLong()::get); + tracker.recordCapability(DN, true); + tracker.removeDataNode(DN); + assertFalse(tracker.supportsFencing(DN)); + } +} diff --git a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/MetadataBroadcastVerdictTest.java b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/MetadataBroadcastVerdictTest.java new file mode 100644 index 0000000000000..32248d56c9886 --- /dev/null +++ b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/lease/MetadataBroadcastVerdictTest.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.manager.lease; + +import org.apache.iotdb.confignode.manager.lease.MetadataBroadcastVerdict.DataNodeState; +import org.apache.iotdb.confignode.manager.lease.MetadataBroadcastVerdict.Disposition; +import org.apache.iotdb.confignode.manager.lease.MetadataBroadcastVerdict.Verdict; + +import org.junit.Test; + +import java.util.Arrays; +import java.util.Collections; + +import static org.junit.Assert.assertEquals; + +public class MetadataBroadcastVerdictTest { + + private static final long T_PROCEED_MS = 25_000L; + + // acked + private static DataNodeState acked() { + return new DataNodeState(true, false, true, 0L); + } + + // capable, unacked, out of contact >= T_proceed -> provably fenced + private static DataNodeState fencedSafe() { + return new DataNodeState(false, false, true, T_PROCEED_MS + 1); + } + + // capable, unacked, heartbeat still fresh -> still possibly serving + private static DataNodeState freshUnacked() { + return new DataNodeState(false, false, true, 1_000L); + } + + // ---- classify ---- + + @Test + public void incapableDataNodeIsNeverFencedSafe() { + // Review point 4: capability is checked before any timing test; an old DN that cannot + // self-fence must be UNSAFE even if it has been silent far longer than T_proceed. + final DataNodeState oldDnLongSilent = new DataNodeState(false, false, false, T_PROCEED_MS * 10); + assertEquals( + Disposition.UNSAFE, MetadataBroadcastVerdict.classify(oldDnLongSilent, T_PROCEED_MS)); + } + + @Test + public void retiredFromRoutingIsSafeGone() { + // Review point 5: only "removed from routing / explicit fence-shutdown ack" is safe-gone, + // regardless of capability or how recently it was seen. + final DataNodeState retired = new DataNodeState(false, true, false, 0L); + assertEquals(Disposition.SAFE_GONE, MetadataBroadcastVerdict.classify(retired, T_PROCEED_MS)); + } + + @Test + public void removingButStillRoutableIsUnsafe() { + // Review point 5: a node that is merely Removing (still routable, not retired) with a fresh + // heartbeat must NOT be treated as safe. + assertEquals( + Disposition.UNSAFE, MetadataBroadcastVerdict.classify(freshUnacked(), T_PROCEED_MS)); + } + + @Test + public void capableAndLongSilentIsFencedSafe() { + assertEquals( + Disposition.FENCED_SAFE, MetadataBroadcastVerdict.classify(fencedSafe(), T_PROCEED_MS)); + } + + // ---- decide ---- + + @Test + public void allAckedProceeds() { + assertEquals( + Verdict.PROCEED, + MetadataBroadcastVerdict.decide(Arrays.asList(acked(), acked()), T_PROCEED_MS, false)); + } + + @Test + public void unackedButAllFencedSafeProceeds() { + assertEquals( + Verdict.PROCEED, + MetadataBroadcastVerdict.decide(Arrays.asList(acked(), fencedSafe()), T_PROCEED_MS, false)); + } + + @Test + public void freshUnackedWaitsWhileBudgetRemains() { + assertEquals( + Verdict.WAIT, + MetadataBroadcastVerdict.decide( + Collections.singletonList(freshUnacked()), T_PROCEED_MS, false)); + } + + @Test + public void freshUnackedFailsWhenWaitBudgetExhausted() { + assertEquals( + Verdict.FAIL, + MetadataBroadcastVerdict.decide( + Collections.singletonList(freshUnacked()), T_PROCEED_MS, true)); + } +} diff --git a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedureTest.java b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedureTest.java index cb09c23659c39..a1813c1642cca 100644 --- a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedureTest.java +++ b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/schema/SetTTLProcedureTest.java @@ -25,8 +25,6 @@ import org.apache.iotdb.commons.exception.IllegalPathException; import org.apache.iotdb.commons.path.PartialPath; import org.apache.iotdb.commons.schema.ttl.TTLCache; -import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; -import org.apache.iotdb.confignode.client.async.handlers.DataNodeAsyncRequestContext; import org.apache.iotdb.confignode.consensus.request.write.database.SetTTLPlan; import org.apache.iotdb.confignode.manager.ConfigManager; import org.apache.iotdb.confignode.manager.TTLManager; @@ -355,30 +353,11 @@ TSStatus writeConfigNodePlan(final ConfigNodeProcedureEnv env, final SetTTLPlan } @Override - DataNodeAsyncRequestContext sendTTLRequest( - final Map dataNodeLocationMap, final TSetTTLReq req) { + boolean broadcastTTLAndDecide(final ConfigNodeProcedureEnv env, final TSetTTLReq req) { requests.add(copyRequest(req)); - - final DataNodeAsyncRequestContext clientHandler = - new DataNodeAsyncRequestContext<>( - CnToDnAsyncRequestType.SET_TTL, copyRequest(req), dataNodeLocationMap); - final List requestIds = new ArrayList<>(clientHandler.getNodeLocationMap().keySet()); - final boolean shouldFail = failFirstDataNodeUpdate && requestCount++ == 0; - - for (Integer requestId : requestIds) { - clientHandler - .getResponseMap() - .put( - requestId, - new TSStatus( - shouldFail - ? TSStatusCode.EXECUTE_STATEMENT_ERROR.getStatusCode() - : TSStatusCode.SUCCESS_STATUS.getStatusCode())); - if (!shouldFail) { - clientHandler.getNodeLocationMap().remove(requestId); - } - } - return clientHandler; + // Simulate a live, un-acked DataNode on the first broadcast: the propagator verdict is FAIL + // (which triggers rollback). Later broadcasts (the rollback restore) proceed. + return !(failFirstDataNodeUpdate && requestCount++ == 0); } private SetTTLPlan copyPlan(final SetTTLPlan plan) { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcher.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcher.java index 9ee232c921b83..2236dffffbfa7 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcher.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcher.java @@ -57,6 +57,7 @@ import org.apache.iotdb.db.queryengine.plan.relational.type.AuthorRType; import org.apache.iotdb.db.queryengine.plan.statement.StatementType; import org.apache.iotdb.db.queryengine.plan.statement.sys.AuthorStatement; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; import org.apache.iotdb.rpc.RpcUtils; import org.apache.iotdb.rpc.TSStatusCode; @@ -531,8 +532,15 @@ public void refreshToken() { heartBeatTimeStamp = currentTime; } - private void checkCacheAvailable() { - if (cacheOutDate) { + // Package-private for testing (ClusterAuthorityFetcherLeaseTest). + void checkCacheAvailable() { + // cacheOutDate is set by refreshToken() only when a heartbeat finally arrives after a long gap, + // so it cannot catch an *ongoing* ConfigNode partition (no heartbeat arrives, refreshToken() is + // never called). isFenced() is evaluated on this DataNode's own clock and fires without any + // heartbeat: while fenced we drop the permission cache and force a re-fetch from the + // ConfigNode, + // which fails closed while partitioned, so a missed REVOKE cannot keep authorizing a privilege. + if (cacheOutDate || MetadataLeaseManager.getInstance().isFenced()) { iAuthorCache.invalidAllCache(); } cacheOutDate = false; diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index e5c996405649d..aec41b3fd7388 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -189,6 +189,7 @@ import org.apache.iotdb.db.queryengine.plan.statement.crud.InsertRowStatement; import org.apache.iotdb.db.queryengine.plan.statement.crud.QueryStatement; import org.apache.iotdb.db.schemaengine.SchemaEngine; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; import org.apache.iotdb.db.schemaengine.schemaregion.ISchemaRegion; import org.apache.iotdb.db.schemaengine.schemaregion.read.resp.info.ITimeSeriesSchemaInfo; import org.apache.iotdb.db.schemaengine.schemaregion.read.resp.reader.ISchemaReader; @@ -2226,6 +2227,10 @@ private PathPatternTree filterPathPatternTree(PathPatternTree patternTree, Strin public TDataNodeHeartbeatResp getDataNodeHeartBeat(TDataNodeHeartbeatReq req) throws TException { TDataNodeHeartbeatResp resp = new TDataNodeHeartbeatResp(); + // Renew the metadata lease: receiving a ConfigNode heartbeat means this DataNode is still in + // contact with the cluster and may keep trusting its ConfigNode-pushed metadata caches. + MetadataLeaseManager.getInstance().recordConfigNodeHeartbeat(); + // Judging leader if necessary if (req.isNeedJudgeLeader()) { // Always get logical clock before judging leader @@ -2273,6 +2278,10 @@ public TDataNodeHeartbeatResp getDataNodeHeartBeat(TDataNodeHeartbeatReq req) th AuthorityChecker.getAuthorityFetcher().refreshToken(); resp.setHeartbeatTimestamp(req.getHeartbeatTimestamp()); resp.setStatus(commonConfig.getNodeStatus().getStatus()); + // Advertise that this DataNode supports metadata-lease self-fencing, so the ConfigNode may + // treat + // it as safely fenced when unreachable (older DataNodes that omit this are handled strictly). + resp.setSupportsMetadataLeaseFencing(true); if (commonConfig.getStatusReason() != null) { resp.setStatusReason(commonConfig.getStatusReason()); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/fetcher/cache/TreeDeviceSchemaCacheManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/fetcher/cache/TreeDeviceSchemaCacheManager.java index 93d75aeff2d8c..697a6a931c9d7 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/fetcher/cache/TreeDeviceSchemaCacheManager.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/metadata/fetcher/cache/TreeDeviceSchemaCacheManager.java @@ -29,6 +29,7 @@ import org.apache.iotdb.db.queryengine.common.schematree.ClusterSchemaTree; import org.apache.iotdb.db.queryengine.common.schematree.IMeasurementSchemaInfo; import org.apache.iotdb.db.queryengine.plan.analyze.schema.ISchemaComputation; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; import org.apache.iotdb.db.schemaengine.template.ClusterTemplateManager; import org.apache.iotdb.db.schemaengine.template.ITemplateManager; @@ -65,6 +66,10 @@ public class TreeDeviceSchemaCacheManager { private TreeDeviceSchemaCacheManager() { tableDeviceSchemaCache = TableDeviceSchemaCache.getInstance(); + // On lease recovery (a ConfigNode heartbeat after this DataNode was fenced), drop everything: + // entries cached before the partition may have missed a ConfigNode invalidation, and forcing a + // miss while fenced does not cover entries that were never re-read during the partition. + MetadataLeaseManager.getInstance().addLeaseRecoveryListener(this::cleanUp); } public static TreeDeviceSchemaCacheManager getInstance() { @@ -92,6 +97,23 @@ public void releaseWriteLock() { readWriteLock.writeLock().unlock(); } + /** + * Look up a device's cached schema, but report a miss (return {@code null}) while the metadata + * lease is fenced. A fenced DataNode may hold a stale entry (it could have missed a ConfigNode + * cache-invalidation such as a DELETE TIMESERIES or datatype change while partitioned); forcing a + * miss makes the read-through callers re-fetch from the authoritative, quorum-backed SchemaRegion + * rather than validate writes/queries against possibly-stale schema. This is more available than + * hard-failing: the operation still succeeds whenever the SchemaRegion quorum is reachable, and + * only fails (fail-closed) when it is not. On lease recovery {@link #cleanUp()} additionally + * drops entries cached before the partition that were never re-read while fenced. + */ + private IDeviceSchema getDeviceSchemaOrMissWhenFenced(final String[] deviceIdNodes) { + if (MetadataLeaseManager.getInstance().isFenced()) { + return null; + } + return tableDeviceSchemaCache.getDeviceSchema(deviceIdNodes); + } + /** * Get SchemaEntity info without auto create schema * @@ -101,7 +123,7 @@ public void releaseWriteLock() { */ public ClusterSchemaTree get(final PartialPath devicePath, final String[] measurements) { final ClusterSchemaTree tree = new ClusterSchemaTree(); - final IDeviceSchema schema = tableDeviceSchemaCache.getDeviceSchema(devicePath.getNodes()); + final IDeviceSchema schema = getDeviceSchemaOrMissWhenFenced(devicePath.getNodes()); if (!(schema instanceof TreeDeviceNormalSchema)) { return tree; } @@ -130,7 +152,7 @@ public ClusterSchemaTree get(final PartialPath devicePath, final String[] measur */ public ClusterSchemaTree getMatchedTemplateSchema(final PartialPath devicePath) { final ClusterSchemaTree tree = new ClusterSchemaTree(); - final IDeviceSchema schema = tableDeviceSchemaCache.getDeviceSchema(devicePath.getNodes()); + final IDeviceSchema schema = getDeviceSchemaOrMissWhenFenced(devicePath.getNodes()); if (!(schema instanceof TreeDeviceTemplateSchema)) { return tree; } @@ -150,7 +172,7 @@ public ClusterSchemaTree getMatchedTemplateSchema(final PartialPath devicePath) public ClusterSchemaTree getMatchedNormalSchema(final PartialPath fullPath) { final ClusterSchemaTree tree = new ClusterSchemaTree(); final IDeviceSchema schema = - tableDeviceSchemaCache.getDeviceSchema( + getDeviceSchemaOrMissWhenFenced( Arrays.copyOf(fullPath.getNodes(), fullPath.getNodeLength() - 1)); if (!(schema instanceof TreeDeviceNormalSchema)) { return tree; @@ -171,7 +193,7 @@ public List computeWithoutTemplate(final ISchemaComputation schemaCompu final String[] measurements = schemaComputation.getMeasurements(); final IDeviceSchema schema = - tableDeviceSchemaCache.getDeviceSchema(schemaComputation.getDevicePath().getNodes()); + getDeviceSchemaOrMissWhenFenced(schemaComputation.getDevicePath().getNodes()); if (!(schema instanceof TreeDeviceNormalSchema)) { return IntStream.range(0, schemaComputation.getMeasurements().length) .boxed() @@ -229,7 +251,7 @@ public Pair, List> computeSourceOfLogicalView( final PartialPath fullPath = logicalViewSchema.getSourcePathIfWritable(); final IDeviceSchema schema = - tableDeviceSchemaCache.getDeviceSchema(fullPath.getDevicePath().getNodes()); + getDeviceSchemaOrMissWhenFenced(fullPath.getDevicePath().getNodes()); if (!(schema instanceof TreeDeviceNormalSchema)) { indexOfMissingMeasurements.add(i); continue; @@ -265,7 +287,7 @@ public List computeWithTemplate(final ISchemaComputation computation) { final List indexOfMissingMeasurements = new ArrayList<>(); final String[] measurements = computation.getMeasurements(); final IDeviceSchema deviceSchema = - tableDeviceSchemaCache.getDeviceSchema(computation.getDevicePath().getNodes()); + getDeviceSchemaOrMissWhenFenced(computation.getDevicePath().getNodes()); if (!(deviceSchema instanceof TreeDeviceTemplateSchema)) { return IntStream.range(0, measurements.length).boxed().collect(Collectors.toList()); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseManager.java new file mode 100644 index 0000000000000..c5a1feabf8805 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseManager.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.schemaengine.lease; + +import org.apache.iotdb.commons.conf.CommonDescriptor; +import org.apache.iotdb.commons.utils.TestOnly; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.function.LongSupplier; + +/** + * Tracks the DataNode's "metadata lease" with the ConfigNode. The ConfigNode periodically sends + * heartbeats to the DataNode; while these arrive the DataNode may trust its ConfigNode-pushed + * metadata caches (table/tree schema, device attributes, templates, TTL, permissions, ...). If no + * heartbeat is received within {@code metadata_lease_fence_ms} ({@code T_fence}), the lease has + * expired and the DataNode must self-fence: stop trusting those caches so a partitioned DataNode + * cannot serve stale schema and generate dirty data. + * + *

This class only tracks the lease state; wiring fail-closed behavior into the read/write/auth + * paths and resync-on-recovery is done by the respective subsystems. + * + *

A monotonic clock ({@link System#nanoTime()}) is used so the lease is immune to wall-clock + * adjustments. The clock and fence threshold are injectable for testing. + */ +public class MetadataLeaseManager { + + private static final Logger LOGGER = LoggerFactory.getLogger(MetadataLeaseManager.class); + + private final List leaseRecoveryListeners = new CopyOnWriteArrayList<>(); + + private final LongSupplier nanoClock; + private final LongSupplier fenceThresholdMsSupplier; + + private volatile long lastConfigNodeHeartbeatNanos; + + private MetadataLeaseManager() { + this( + System::nanoTime, + () -> CommonDescriptor.getInstance().getConfig().getMetadataLeaseFenceMs()); + } + + MetadataLeaseManager(final LongSupplier nanoClock, final LongSupplier fenceThresholdMsSupplier) { + this.nanoClock = nanoClock; + this.fenceThresholdMsSupplier = fenceThresholdMsSupplier; + // Startup registration performs a full resync, so treat construction time as a fresh contact. + this.lastConfigNodeHeartbeatNanos = nanoClock.getAsLong(); + } + + /** + * Register a listener to run when the lease recovers, i.e. a ConfigNode heartbeat arrives after + * the lease had expired. Push-maintained caches (e.g. {@code DataNodeTableCache}) register here + * to invalidate themselves on recovery, since they may have missed ConfigNode pushes while + * fenced; subsequent lookups then re-fetch fresh state instead of trusting stale entries. + */ + public void addLeaseRecoveryListener(final Runnable listener) { + leaseRecoveryListeners.add(listener); + } + + /** + * Renew the lease: record that a ConfigNode heartbeat has just been received. If the lease had + * expired (the DataNode was fenced), this heartbeat is a recovery, so the registered recovery + * listeners run to drop possibly-stale ConfigNode-pushed caches before they are trusted again. + */ + public void recordConfigNodeHeartbeat() { + final boolean wasFenced = isFenced(); + this.lastConfigNodeHeartbeatNanos = nanoClock.getAsLong(); + if (wasFenced) { + for (final Runnable listener : leaseRecoveryListeners) { + try { + listener.run(); + } catch (final Exception e) { + // A misbehaving listener must not break heartbeat processing / lease renewal. + LOGGER.warn("Metadata lease recovery listener failed", e); + } + } + } + } + + /** Milliseconds elapsed since the last ConfigNode heartbeat was received (never negative). */ + public long getMillisSinceLastConfigNodeHeartbeat() { + final long elapsedNanos = nanoClock.getAsLong() - lastConfigNodeHeartbeatNanos; + return elapsedNanos > 0 ? elapsedNanos / 1_000_000L : 0L; + } + + /** Whether the metadata lease has expired (no ConfigNode heartbeat within {@code T_fence}). */ + public boolean isFenced() { + return getMillisSinceLastConfigNodeHeartbeat() > fenceThresholdMsSupplier.getAsLong(); + } + + /** Force the lease to appear expired, for tests that exercise fail-closed behavior. */ + @TestOnly + public void expireLeaseForTest() { + this.lastConfigNodeHeartbeatNanos = + nanoClock.getAsLong() - (fenceThresholdMsSupplier.getAsLong() + 1_000L) * 1_000_000L; + } + + public static MetadataLeaseManager getInstance() { + return MetadataLeaseManagerHolder.INSTANCE; + } + + private static final class MetadataLeaseManagerHolder { + private static final MetadataLeaseManager INSTANCE = new MetadataLeaseManager(); + + private MetadataLeaseManagerHolder() {} + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCache.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCache.java index f545a9bda237d..f109aa5ef84d6 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCache.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCache.java @@ -20,6 +20,7 @@ package org.apache.iotdb.db.schemaengine.table; import org.apache.iotdb.calc.plan.relational.metadata.CommonMetadataUtils; +import org.apache.iotdb.commons.exception.IoTDBRuntimeException; import org.apache.iotdb.commons.schema.table.NonCommittableTsTable; import org.apache.iotdb.commons.schema.table.TsTable; import org.apache.iotdb.commons.schema.table.TsTableInternalRPCUtil; @@ -29,6 +30,7 @@ import org.apache.iotdb.db.conf.IoTDBDescriptor; import org.apache.iotdb.db.i18n.DataNodeSchemaMessages; import org.apache.iotdb.db.queryengine.plan.execution.config.executor.ClusterConfigTaskExecutor; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; import org.apache.iotdb.rpc.TSStatusCode; import org.apache.tsfile.utils.Pair; @@ -73,7 +75,10 @@ public class DataNodeTableCache implements ITableCache { IoTDBDescriptor.getInstance().getConfig().getDataNodeTableCacheSemaphorePermitNum()); private DataNodeTableCache() { - // Do nothing + // On lease recovery (a ConfigNode heartbeat after this DataNode was fenced), this cache may + // have + // missed ConfigNode pushes, so drop everything and let subsequent lookups re-fetch fresh state. + MetadataLeaseManager.getInstance().addLeaseRecoveryListener(this::invalidateAll); } private static final class DataNodeTableCacheHolder { @@ -263,6 +268,22 @@ public void invalid(String database) { } } + /** + * Drop the entire cache. Used on metadata-lease recovery: after the DataNode was fenced it may + * have missed ConfigNode pushes, so the cached schema is no longer trustworthy and must be + * re-fetched lazily on the next lookup. + */ + public void invalidateAll() { + readWriteLock.writeLock().lock(); + try { + databaseTableMap.clear(); + preUpdateTableMap.clear(); + instanceVersion.incrementAndGet(); + } finally { + readWriteLock.writeLock().unlock(); + } + } + @GuardedBy("TableDeviceSchemaCache#writeLock") @Override public void invalid(String database, final String tableName) { @@ -313,7 +334,27 @@ public long getInstanceVersion() { return instanceVersion.get(); } + /** + * Fail closed when the metadata lease has expired: a fenced DataNode may hold a stale + * table-schema cache (it could have missed a ConfigNode invalidation while partitioned), so + * refuse to serve it rather than risk validating writes/queries against stale schema and + * producing dirty data. The error is retryable; the operation succeeds again once the lease + * recovers and the cache resyncs. + */ + private void failIfMetadataLeaseFenced() { + final MetadataLeaseManager lease = MetadataLeaseManager.getInstance(); + if (lease.isFenced()) { + throw new IoTDBRuntimeException( + String.format( + "DataNode metadata lease expired (%d ms since last ConfigNode heartbeat); refusing to " + + "serve table schema from a possibly-stale cache, please retry.", + lease.getMillisSinceLastConfigNodeHeartbeat()), + TSStatusCode.INTERNAL_REQUEST_RETRY_ERROR.getStatusCode()); + } + } + public TsTable getTableInWrite(final String database, final String tableName) { + failIfMetadataLeaseFenced(); final TsTable result = getTableInCache(database, tableName); return Objects.nonNull(result) ? result : getTable(database, tableName, false); } @@ -327,6 +368,7 @@ public TsTable getTable(final String database, final String tableName) { * #preUpdateTableMap}, due to the failure of "commit" or rollback of "pre-update". */ public TsTable getTable(String database, final String tableName, final boolean force) { + failIfMetadataLeaseFenced(); database = PathUtils.unQualifyDatabaseName(database); final Map> preUpdateTables = mayGetTableInPreUpdateMap(database, tableName); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java index 2183d6ac8812b..e2204e8cf0b57 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java @@ -108,6 +108,9 @@ public static void bind() { // bind memory related metrics metricService.addMetricSet(GlobalMemoryMetrics.getInstance()); + + // bind metadata lease (ConfigNode heartbeat freshness) metrics + metricService.addMetricSet(new MetadataLeaseMetrics()); } private static void initSystemMetrics(MetricService metricService) { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/MetadataLeaseMetrics.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/MetadataLeaseMetrics.java new file mode 100644 index 0000000000000..99b33befc9acf --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/MetadataLeaseMetrics.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.service.metrics; + +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; +import org.apache.iotdb.metrics.AbstractMetricService; +import org.apache.iotdb.metrics.metricsets.IMetricSet; +import org.apache.iotdb.metrics.utils.MetricLevel; +import org.apache.iotdb.metrics.utils.MetricType; + +/** + * Exposes the DataNode's metadata-lease state for observability: how long it has been since the + * last ConfigNode heartbeat was received. A value approaching {@code metadata_lease_fence_ms} + * indicates the DataNode is about to (or has) self-fenced its ConfigNode-pushed metadata caches. + */ +public class MetadataLeaseMetrics implements IMetricSet { + + private static final String METADATA_LEASE_HEARTBEAT_AGE_MS = "metadata_lease_heartbeat_age_ms"; + + @Override + public void bindTo(final AbstractMetricService metricService) { + metricService.createAutoGauge( + METADATA_LEASE_HEARTBEAT_AGE_MS, + MetricLevel.IMPORTANT, + MetadataLeaseManager.getInstance(), + MetadataLeaseManager::getMillisSinceLastConfigNodeHeartbeat); + } + + @Override + public void unbindFrom(final AbstractMetricService metricService) { + metricService.remove(MetricType.AUTO_GAUGE, METADATA_LEASE_HEARTBEAT_AGE_MS); + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/compaction/execute/utils/MultiTsFileDeviceIterator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/compaction/execute/utils/MultiTsFileDeviceIterator.java index a639fba299cb9..3ce86271968f5 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/compaction/execute/utils/MultiTsFileDeviceIterator.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/compaction/execute/utils/MultiTsFileDeviceIterator.java @@ -28,6 +28,7 @@ import org.apache.iotdb.commons.schema.table.column.TsTableColumnSchema; import org.apache.iotdb.commons.utils.CommonDateTimeUtils; import org.apache.iotdb.db.queryengine.plan.analyze.cache.schema.DataNodeTTLCache; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; import org.apache.iotdb.db.schemaengine.table.DataNodeTableCache; import org.apache.iotdb.db.storageengine.dataregion.compaction.io.CompactionTsFileReader; import org.apache.iotdb.db.storageengine.dataregion.compaction.schedule.constant.CompactionType; @@ -236,7 +237,15 @@ public Pair nextDevice() throws IllegalPathException, IOExce IDeviceID deviceID = currentDevice.left; boolean isAligned = currentDevice.right; ignoreAllNullRows = !isAligned || deviceID.getTableName().startsWith("root."); - if (!ignoreAllNullRows) { + if (MetadataLeaseManager.getInstance().isFenced()) { + // Metadata lease fenced: this DataNode may hold a stale TTL (it could have missed a + // ConfigNode + // TTL update while partitioned). A too-short stale TTL would make compaction permanently + // delete data that a missed TTL-increase says to keep, so use an infinite TTL: compaction + // deletes nothing by TTL while fenced, and real TTL deletion resumes once the lease recovers + // and the cache resyncs. (Checked first so the table path also avoids the fenced cache.) + ttlForCurrentDevice = Long.MAX_VALUE; + } else if (!ignoreAllNullRows) { ttlForCurrentDevice = DataNodeTTLCache.getInstance().getTTLForTable(databaseName, deviceID.getTableName()); } else { diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcherLeaseTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcherLeaseTest.java new file mode 100644 index 0000000000000..fec9a63c4a2fe --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcherLeaseTest.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.auth; + +import org.apache.iotdb.commons.auth.entity.User; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Test; + +/** + * When the DataNode metadata lease has expired (no ConfigNode heartbeat within {@code T_fence}), + * the permission cache must not be trusted: a partitioned DataNode could have missed a REVOKE + * broadcast and would otherwise keep authorizing a privilege that was already revoked. {@link + * ClusterAuthorityFetcher#checkCacheAvailable()} therefore drops the cache while fenced, forcing a + * re-fetch from the ConfigNode (which fails closed while the DataNode is partitioned). + * + *

This closes a window the pre-existing {@code refreshToken()} timeout did not: that timeout + * only marks the cache stale when a heartbeat finally arrives after a long gap, so during an + * ongoing partition (no heartbeat at all) the stale cache kept being served. {@code + * isFenced()} is evaluated on the DataNode's own clock and needs no heartbeat to fire. + */ +public class ClusterAuthorityFetcherLeaseTest { + + @After + public void tearDown() { + // Restore the process-wide lease singleton so other tests in this JVM are unaffected. + MetadataLeaseManager.getInstance().recordConfigNodeHeartbeat(); + } + + @Test + public void fencedLeaseDropsPermissionCache() { + final ClusterAuthorityFetcher fetcher = new ClusterAuthorityFetcher(new BasicAuthorityCache()); + final User user = new User("user_fenced", "password"); + fetcher.getAuthorCache().putUserCache(user.getName(), user); + Assert.assertNotNull(fetcher.getAuthorCache().getUserCache(user.getName())); + + MetadataLeaseManager.getInstance().expireLeaseForTest(); + fetcher.checkCacheAvailable(); + + Assert.assertNull( + "a fenced DataNode must drop its permission cache so a missed REVOKE cannot keep authorizing", + fetcher.getAuthorCache().getUserCache(user.getName())); + } + + @Test + public void activeLeaseKeepsPermissionCache() { + final ClusterAuthorityFetcher fetcher = new ClusterAuthorityFetcher(new BasicAuthorityCache()); + final User user = new User("user_active", "password"); + fetcher.getAuthorCache().putUserCache(user.getName(), user); + + // An active lease (a ConfigNode heartbeat was just received) must not needlessly drop the + // cache. + MetadataLeaseManager.getInstance().recordConfigNodeHeartbeat(); + fetcher.checkCacheAvailable(); + + Assert.assertNotNull( + "an active lease must not needlessly drop the permission cache", + fetcher.getAuthorCache().getUserCache(user.getName())); + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/metadata/cache/TreeDeviceSchemaCacheManagerLeaseTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/metadata/cache/TreeDeviceSchemaCacheManagerLeaseTest.java new file mode 100644 index 0000000000000..6f91ec4504235 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/metadata/cache/TreeDeviceSchemaCacheManagerLeaseTest.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.metadata.cache; + +import org.apache.iotdb.commons.exception.IllegalPathException; +import org.apache.iotdb.commons.path.MeasurementPath; +import org.apache.iotdb.commons.path.PartialPath; +import org.apache.iotdb.db.queryengine.common.schematree.ClusterSchemaTree; +import org.apache.iotdb.db.queryengine.plan.relational.metadata.fetcher.cache.TreeDeviceSchemaCacheManager; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; + +import org.apache.tsfile.enums.TSDataType; +import org.apache.tsfile.write.schema.MeasurementSchema; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.util.Collections; + +/** + * While the metadata lease is fenced, the tree-model schema cache must not be trusted: this + * DataNode could have missed a ConfigNode cache-invalidation (e.g. a DELETE TIMESERIES / datatype + * change) while partitioned, so a stale cached entry could validate a write or resolve a query + * against schema that no longer exists. Because the cache is read-through, the fix is to report a + * cache miss while fenced so the caller re-fetches from the authoritative, quorum-backed + * SchemaRegion (more available than hard-failing: the op still succeeds whenever that quorum is + * reachable). + */ +public class TreeDeviceSchemaCacheManagerLeaseTest { + + private TreeDeviceSchemaCacheManager manager; + + @Before + public void setUp() throws IllegalPathException { + manager = TreeDeviceSchemaCacheManager.getInstance(); + manager.cleanUp(); + final ClusterSchemaTree tree = new ClusterSchemaTree(); + tree.appendSingleMeasurement( + new PartialPath("root.sg1.d1.s1"), + new MeasurementSchema("s1", TSDataType.INT32), + null, + null, + null, + false); + tree.setDatabases(Collections.singleton("root.sg1")); + manager.put(tree); + } + + @After + public void tearDown() { + manager.cleanUp(); + // Restore the process-wide lease singleton so other tests in this JVM are unaffected. + MetadataLeaseManager.getInstance().recordConfigNodeHeartbeat(); + } + + @Test + public void fencedLeaseForcesTreeSchemaCacheMiss() throws IllegalPathException { + final PartialPath device1 = new PartialPath("root.sg1.d1"); + final String[] measurements = new String[] {"s1"}; + + // Sanity: with an active lease the cached entry is served (a cache hit). + MetadataLeaseManager.getInstance().recordConfigNodeHeartbeat(); + Assert.assertFalse( + "an active lease should serve the cached tree schema", + manager.get(device1, measurements).getAllDevices().isEmpty()); + Assert.assertFalse( + manager + .getMatchedNormalSchema(new MeasurementPath("root.sg1.d1.s1")) + .getAllDevices() + .isEmpty()); + + // Fenced: every tree-schema lookup must report a miss so the caller re-fetches from the + // authoritative SchemaRegion instead of trusting a possibly-stale cached entry. + MetadataLeaseManager.getInstance().expireLeaseForTest(); + Assert.assertTrue( + "a fenced lease must report a tree-schema cache miss (force re-fetch)", + manager.get(device1, measurements).getAllDevices().isEmpty()); + Assert.assertTrue( + manager + .getMatchedNormalSchema(new MeasurementPath("root.sg1.d1.s1")) + .getAllDevices() + .isEmpty()); + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseManagerTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseManagerTest.java new file mode 100644 index 0000000000000..70d0f0ece1a51 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/lease/MetadataLeaseManagerTest.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.schemaengine.lease; + +import org.junit.Test; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +public class MetadataLeaseManagerTest { + + private static final long T_FENCE_MS = 20_000L; + + private MetadataLeaseManager newManager(final AtomicLong nowNanos) { + return new MetadataLeaseManager(nowNanos::get, () -> T_FENCE_MS); + } + + @Test + public void notFencedWithinThresholdAfterHeartbeat() { + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final MetadataLeaseManager manager = newManager(nowNanos); + manager.recordConfigNodeHeartbeat(); + nowNanos.addAndGet(TimeUnit.MILLISECONDS.toNanos(T_FENCE_MS - 1)); + assertFalse(manager.isFenced()); + } + + @Test + public void fencedAfterThresholdElapsedWithoutHeartbeat() { + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final MetadataLeaseManager manager = newManager(nowNanos); + manager.recordConfigNodeHeartbeat(); + nowNanos.addAndGet(TimeUnit.MILLISECONDS.toNanos(T_FENCE_MS + 1)); + assertTrue(manager.isFenced()); + } + + @Test + public void recoversFromFencedAfterNewHeartbeat() { + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final MetadataLeaseManager manager = newManager(nowNanos); + manager.recordConfigNodeHeartbeat(); + nowNanos.addAndGet(TimeUnit.MILLISECONDS.toNanos(T_FENCE_MS + 1)); + assertTrue(manager.isFenced()); + + manager.recordConfigNodeHeartbeat(); + assertFalse(manager.isFenced()); + } + + @Test + public void reportsMillisSinceLastHeartbeat() { + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final MetadataLeaseManager manager = newManager(nowNanos); + manager.recordConfigNodeHeartbeat(); + nowNanos.addAndGet(TimeUnit.MILLISECONDS.toNanos(1234)); + assertEquals(1234L, manager.getMillisSinceLastConfigNodeHeartbeat()); + } + + @Test + public void runsRecoveryListenerWhenHeartbeatArrivesAfterFence() { + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final MetadataLeaseManager manager = newManager(nowNanos); + final AtomicInteger recoveries = new AtomicInteger(); + manager.addLeaseRecoveryListener(recoveries::incrementAndGet); + + manager.recordConfigNodeHeartbeat(); + assertEquals(0, recoveries.get()); + + nowNanos.addAndGet(TimeUnit.MILLISECONDS.toNanos(T_FENCE_MS + 1)); + assertTrue(manager.isFenced()); + + manager.recordConfigNodeHeartbeat(); + assertEquals(1, recoveries.get()); + assertFalse(manager.isFenced()); + } + + @Test + public void doesNotRunRecoveryListenerWhenLeaseNeverExpired() { + final AtomicLong nowNanos = new AtomicLong(TimeUnit.SECONDS.toNanos(100)); + final MetadataLeaseManager manager = newManager(nowNanos); + final AtomicInteger recoveries = new AtomicInteger(); + manager.addLeaseRecoveryListener(recoveries::incrementAndGet); + + for (int i = 0; i < 3; i++) { + nowNanos.addAndGet(TimeUnit.MILLISECONDS.toNanos(1_000)); + manager.recordConfigNodeHeartbeat(); + } + assertEquals(0, recoveries.get()); + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCacheLeaseTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCacheLeaseTest.java new file mode 100644 index 0000000000000..709679e86b362 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/schemaengine/table/DataNodeTableCacheLeaseTest.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.schemaengine.table; + +import org.apache.iotdb.commons.exception.IoTDBRuntimeException; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; +import org.apache.iotdb.rpc.TSStatusCode; + +import org.junit.After; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +public class DataNodeTableCacheLeaseTest { + + @After + public void renewLease() { + // Renew so a forced-fenced lease does not leak into other tests sharing this JVM fork. + MetadataLeaseManager.getInstance().recordConfigNodeHeartbeat(); + } + + @Test + public void getTableInWriteFailsClosedWhenLeaseFenced() { + MetadataLeaseManager.getInstance().expireLeaseForTest(); + try { + DataNodeTableCache.getInstance().getTableInWrite("root.db", "t"); + fail("expected fail-closed retry exception while the metadata lease is fenced"); + } catch (final IoTDBRuntimeException e) { + // A fenced DataNode must refuse to validate writes against a possibly-stale cache, and the + // error must be the retryable one (not, say, table-not-exists from the stale cache). + assertEquals(TSStatusCode.INTERNAL_REQUEST_RETRY_ERROR.getStatusCode(), e.getErrorCode()); + } + } + + @Test + public void getTableFailsClosedWhenLeaseFenced() { + MetadataLeaseManager.getInstance().expireLeaseForTest(); + try { + DataNodeTableCache.getInstance().getTable("root.db", "t"); + fail("expected fail-closed retry exception while the metadata lease is fenced"); + } catch (final IoTDBRuntimeException e) { + assertEquals(TSStatusCode.INTERNAL_REQUEST_RETRY_ERROR.getStatusCode(), e.getErrorCode()); + } + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/compaction/utils/MultiTsFileDeviceIteratorLeaseTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/compaction/utils/MultiTsFileDeviceIteratorLeaseTest.java new file mode 100644 index 0000000000000..2868ed04d86db --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/compaction/utils/MultiTsFileDeviceIteratorLeaseTest.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.storageengine.dataregion.compaction.utils; + +import org.apache.iotdb.commons.exception.IllegalPathException; +import org.apache.iotdb.commons.exception.MetadataException; +import org.apache.iotdb.db.exception.StorageEngineException; +import org.apache.iotdb.db.queryengine.plan.analyze.cache.schema.DataNodeTTLCache; +import org.apache.iotdb.db.schemaengine.lease.MetadataLeaseManager; +import org.apache.iotdb.db.storageengine.dataregion.compaction.AbstractCompactionTest; +import org.apache.iotdb.db.storageengine.dataregion.compaction.execute.utils.MultiTsFileDeviceIterator; +import org.apache.iotdb.db.storageengine.dataregion.read.control.FileReaderManager; +import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; + +import org.apache.tsfile.exception.write.WriteProcessException; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; + +/** + * Compaction physically deletes data older than its TTL window. A DataNode partitioned from the + * ConfigNode may hold a stale TTL (it could have missed a ConfigNode TTL update while partitioned), + * and a too-short stale TTL would make compaction permanently delete data that a missed + * TTL-increase says to keep. While the metadata lease is fenced, compaction must therefore fall + * back to an infinite TTL (delete nothing by TTL); real TTL deletion resumes after the lease + * recovers and the cache resyncs. + */ +public class MultiTsFileDeviceIteratorLeaseTest extends AbstractCompactionTest { + + private final String oldThreadName = Thread.currentThread().getName(); + + @Before + public void setUp() + throws IOException, WriteProcessException, MetadataException, InterruptedException { + super.setUp(); + Thread.currentThread().setName("pool-1-IoTDB-Compaction-Worker-1"); + } + + @After + public void tearDown() throws IOException, StorageEngineException { + DataNodeTTLCache.getInstance().clearAllTTLForTree(); + // Restore the process-wide lease singleton so other tests in this JVM are unaffected. + MetadataLeaseManager.getInstance().recordConfigNodeHeartbeat(); + super.tearDown(); + for (final TsFileResource tsFileResource : seqResources) { + FileReaderManager.getInstance().closeFileAndRemoveReader(tsFileResource.getTsFileID()); + } + Thread.currentThread().setName(oldThreadName); + } + + @Test + public void activeLeaseAppliesRealTtlInCompaction() + throws MetadataException, IOException, WriteProcessException { + registerTimeseriesInMManger(1, 1, false); + createFiles(1, 1, 1, 100, 0, 0, 50, 50, false, true); + DataNodeTTLCache.getInstance().setTTLForTree(COMPACTION_TEST_SG + ".**", 100_000L); + + MetadataLeaseManager.getInstance().recordConfigNodeHeartbeat(); + try (final MultiTsFileDeviceIterator it = new MultiTsFileDeviceIterator(seqResources)) { + Assert.assertTrue(it.hasNextDevice()); + it.nextDevice(); + Assert.assertNotEquals(Long.MAX_VALUE, it.getTTLForCurrentDevice()); + Assert.assertNotEquals(Long.MIN_VALUE, it.getTimeLowerBoundForCurrentDevice()); + } + } + + @Test + public void fencedLeaseUsesInfiniteTtlInCompaction() + throws MetadataException, IOException, WriteProcessException, IllegalPathException { + registerTimeseriesInMManger(1, 1, false); + createFiles(1, 1, 1, 100, 0, 0, 50, 50, false, true); + // A finite TTL is configured, so without the fence fallback compaction would delete by it. + DataNodeTTLCache.getInstance().setTTLForTree(COMPACTION_TEST_SG + ".**", 100_000L); + + MetadataLeaseManager.getInstance().expireLeaseForTest(); + try (final MultiTsFileDeviceIterator it = new MultiTsFileDeviceIterator(seqResources)) { + Assert.assertTrue(it.hasNextDevice()); + it.nextDevice(); + Assert.assertEquals( + "a fenced DataNode must use an infinite TTL in compaction so a stale TTL cannot delete data", + Long.MAX_VALUE, + it.getTTLForCurrentDevice()); + Assert.assertEquals(Long.MIN_VALUE, it.getTimeLowerBoundForCurrentDevice()); + } + } +} diff --git a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template index 4d7b5bcea87eb..52c44f973e291 100644 --- a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template +++ b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template @@ -744,6 +744,16 @@ failure_detector_phi_threshold=30 # Datatype: long failure_detector_phi_acceptable_pause_in_ms=10000 +# A DataNode self-fences its ConfigNode-pushed metadata caches (table/tree schema, templates, TTL, +# permissions, ...) if it has not received a ConfigNode heartbeat within this duration, so a +# partitioned DataNode stops trusting stale caches. Kept aligned with +# failure_detector_fixed_threshold_in_ms so a DataNode fences itself around the same time the +# cluster would consider it down. The ConfigNode also uses this to decide how long it must wait +# before treating an unreachable DataNode as safely fenced. +# effectiveMode: restart +# Datatype: long +metadata_lease_fence_ms=20000 + # Whether to enable topology probing between DataNodes # effectiveMode: hot_reload # Datatype: Boolean diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonConfig.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonConfig.java index 6a8956e423b48..aa81003b1b2e6 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonConfig.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonConfig.java @@ -439,6 +439,13 @@ public class CommonConfig { private volatile long remoteWriteMaxRetryDurationInMs = 60000; + // The DataNode self-fences its ConfigNode-pushed metadata caches (table/tree schema, template, + // TTL, permission, ...) if it has not received a ConfigNode heartbeat within this duration. Kept + // aligned with the failure detector threshold so a partitioned DataNode stops trusting stale + // caches around the same time the cluster would consider it dead. Also used by the ConfigNode to + // derive how long it must wait before treating an unreachable DataNode as safely fenced. + private volatile long metadataLeaseFenceMs = 20_000; + private final RateLimiter querySamplingRateLimiter = RateLimiter.create(160); // if querySamplingRateLimiter < 0, means that there is no rate limit, we need to full sample all // the queries @@ -2679,6 +2686,14 @@ public void setRemoteWriteMaxRetryDurationInMs(long remoteWriteMaxRetryDurationI this.remoteWriteMaxRetryDurationInMs = remoteWriteMaxRetryDurationInMs; } + public long getMetadataLeaseFenceMs() { + return metadataLeaseFenceMs; + } + + public void setMetadataLeaseFenceMs(long metadataLeaseFenceMs) { + this.metadataLeaseFenceMs = metadataLeaseFenceMs; + } + public int getArenaNum() { return arenaNum; } diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java index 5cd954a09f7b8..014ad3e879785 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java @@ -306,6 +306,11 @@ public void loadCommonProps(TrimProperties properties) throws IOException { properties.getProperty( "path_log_max_size", String.valueOf(config.getPathLogMaxSize())))); + config.setMetadataLeaseFenceMs( + Long.parseLong( + properties.getProperty( + "metadata_lease_fence_ms", String.valueOf(config.getMetadataLeaseFenceMs())))); + loadRetryProperties(properties); loadBinaryAllocatorProps(properties); } diff --git a/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift b/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift index 92a7602b34dee..74e9054592247 100644 --- a/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift +++ b/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift @@ -316,6 +316,9 @@ struct TDataNodeHeartbeatResp { 15: optional list pipeRemainingEventCountList 16: optional list pipeRemainingTimeList 17: optional map dataRegionRawDataSize + // Whether this DataNode supports metadata-lease self-fencing. Used by the ConfigNode during + // rolling upgrade: an unreachable DataNode that does not report this cannot be assumed fenced. + 18: optional bool supportsMetadataLeaseFencing } struct TPipeHeartbeatReq {