diff --git a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/LeaderElectionManager.java b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/LeaderElectionManager.java index 4788aff385..7b0a446e81 100644 --- a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/LeaderElectionManager.java +++ b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/LeaderElectionManager.java @@ -15,10 +15,11 @@ */ package io.javaoperatorsdk.operator; -import java.util.Arrays; import java.util.Collection; +import java.util.List; import java.util.UUID; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.Predicate; import java.util.stream.Collectors; @@ -36,9 +37,39 @@ import io.javaoperatorsdk.operator.api.config.ConfigurationService; import io.javaoperatorsdk.operator.api.config.LeaderElectionConfiguration; +/** + * Manages the leader-election lifecycle for an {@link Operator} instance. Leader election ensures + * that, in a high-availability setup with multiple replicas of the same operator, only one replica + * at a time actively reconciles resources. The replica currently holding the lease is referred to + * as the leader, and the others stand by until the lease becomes available. + * + *
Leader election is opt-in. It is enabled when a {@link LeaderElectionConfiguration} is + * supplied via {@link + * io.javaoperatorsdk.operator.api.config.ConfigurationServiceOverrider#withLeaderElectionConfiguration(LeaderElectionConfiguration) + * ConfigurationServiceOverrider#withLeaderElectionConfiguration(LeaderElectionConfiguration)}. The + * configuration controls the lease name, namespace, durations, and optional user-supplied {@link + * LeaderCallbacks}. + * + *
{@link #stopLeading()} behaves differently depending on how it was triggered: + * + *
The lifecycle methods {@link #start()} and {@link #stop()} are called by {@link Operator} as
+ * part of {@link Operator#start()} and {@link Operator#stop()} respectively. Users typically do not
+ * interact with this class directly.
+ */
public class LeaderElectionManager {
private static final Logger log = LoggerFactory.getLogger(LeaderElectionManager.class);
+ private static final List The hook is registered regardless of whether leader election is enabled. A leader pod
+ * receiving {@code SIGTERM} will therefore release its lease cleanly so that a standby replica
+ * can take over without waiting for lease expiry.
+ *
+ * <p>NOTE: You may also want to tune the Pod's {@code terminationGracePeriodSeconds} to be
+ * at least as long as the configured {@code reconciliationTerminationTimeout}, plus a small
+ * buffer for the rest of the shutdown sequence (releasing the leader-election lease and closing
+ * the Kubernetes client). If the grace period elapses before {@link #stop()} returns, the kubelet
+ * sends {@code SIGKILL}, in-flight reconciliations are abandoned, and any held leader-election
+ * lease is not released cleanly.
+ */
+ public void installShutdownHook() {
+ if (shutdownHookInstalled.compareAndSet(false, true)) { // only the call that flips the flag from false to true registers the hook, so repeated calls add it at most once
+ Runtime.getRuntime().addShutdownHook(new Thread(this::stop)); // the hook thread invokes stop() when the JVM begins shutting down
+ }
+ }
+
/**
* Adds a shutdown hook that automatically calls {@link #stop()} when the app shuts down. Note
* that graceful shutdown is usually not needed, but some {@link Reconciler} implementations might
@@ -137,16 +165,14 @@ protected ConfigurationService initConfigurationService(
* Note that you might want to tune "terminationGracePeriodSeconds" for the Pod running the
* controller.
*
- * @param gracefulShutdownTimeout timeout to wait for executor threads to complete actual
- * reconciliations
+ * @param gracefulShutdownTimeout ignored; configure {@link
+ * ConfigurationService#reconciliationTerminationTimeout()} instead
+ * @deprecated Use {@link #installShutdownHook()} instead
*/
+ @Deprecated(forRemoval = true)
@SuppressWarnings("unused")
public void installShutdownHook(Duration gracefulShutdownTimeout) {
- if (!leaderElectionManager.isLeaderElectionEnabled()) {
- Runtime.getRuntime().addShutdownHook(new Thread(this::stop));
- } else {
- log.warn("Leader election is on, shutdown hook will not be installed.");
- }
+ installShutdownHook(); // gracefulShutdownTimeout is not read; this overload only delegates to the no-arg variant
}
public KubernetesClient getKubernetesClient() {
@@ -188,6 +214,30 @@ public synchronized void start() {
}
}
+ /**
+ * Stops the operator and releases its resources. The shutdown sequence is:
+ *
+ * <p>It is safe to call this method from a JVM shutdown hook (see {@link #installShutdownHook()})
+ * because the graceful-shutdown path coordinates with the leader-election callbacks so that
+ * {@code System.exit} is not invoked while the JVM is already shutting down.
+ *
+ * <p>If the operator was never successfully started, this method only stops the executor service
+ * manager so that no thread pools are leaked.
+ *
+ * @throws OperatorException if an error occurs during shutdown
+ */
@Override
public void stop() throws OperatorException {
Duration reconciliationTerminationTimeout =
diff --git a/operator-framework-core/src/test/java/io/javaoperatorsdk/operator/LeaderElectionManagerTest.java b/operator-framework-core/src/test/java/io/javaoperatorsdk/operator/LeaderElectionManagerTest.java
index 510890e56e..a885d7604c 100644
--- a/operator-framework-core/src/test/java/io/javaoperatorsdk/operator/LeaderElectionManagerTest.java
+++ b/operator-framework-core/src/test/java/io/javaoperatorsdk/operator/LeaderElectionManagerTest.java
@@ -109,6 +109,18 @@ void testInitPermissionsMultipleRulesWithResourceName(@TempDir Path tempDir) thr
assertTrue(leaderElectionManager.isLeaderElectionEnabled());
}
+ @Test
+ void stopLeadingDoesNotInvokeSystemExitWhenStopWasCalledFirst() {
+ // When stop() is called before the onStopLeading callback fires (which is what happens when
+ // stop()'s future cancellation triggers the callback), stopLeading() must skip
+ // System.exit(1). Otherwise calling stop() from inside a JVM shutdown hook deadlocks against
+ // the java.lang.Shutdown class lock. If this regression is ever reintroduced, this test
+ // method would terminate the JUnit JVM via System.exit(1) instead of failing cleanly.
+ final var leaderElectionManager = leaderElectionManager(null); // NOTE(review): factory helper is defined elsewhere in this test class; meaning of the null argument is not visible here - confirm
+ leaderElectionManager.stop(); // stop first, mirroring the order seen when shutdown is initiated by stop()
+ leaderElectionManager.stopLeading(); // no explicit assertion: returning normally (no System.exit) is the success condition
+ }
+
@Test
void testFailedToInitMissingPermission(@TempDir Path tempDir) throws IOException {
var namespace = "foo";
+ *
+ *
+ *