Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ public CompletableFuture<Status> stop() {
public CompletableFuture<Status> connect() {
SessionState local = state.get();
// create new stream to connect
final Stream stream = new Stream(rpc);
final Stream stream = new Stream(rpc, connectTimeout);
if (!updateState(local, makeConnectionState(local, stream))) {
logger.warn("{} cannot be connected with state {}", this, local.getState());
stream.cancelStream();
Expand Down Expand Up @@ -214,7 +214,7 @@ private void restoreSession(long disconnectedAt, int retryCount, List<StreamMsg<
return;
}

Stream stream = new Stream(rpc);
Stream stream = new Stream(rpc, connectTimeout);
if (!updateState(local, makeConnectionState(local, stream))) {
logger.warn("{} cannot be reconnected with state {}", this, state.get().getState());
completeMessagesWithBadSession(messagesToRetry);
Expand Down
17 changes: 17 additions & 0 deletions coordination/src/main/java/tech/ydb/coordination/impl/Stream.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ class Stream {
private final Map<Long, StreamMsg<?>> messages = new ConcurrentHashMap<>();

Stream(Rpc rpc) {
this(rpc, Duration.ofSeconds(5));
}

Stream(Rpc rpc, Duration connectTimeout) {
this.scheduler = rpc.getScheduler();
this.stream = rpc.createSession(GrpcRequestSettings.newBuilder()
.disableDeadline()
Expand All @@ -53,6 +57,19 @@ class Stream {
stopFuture.complete(status);
}
});

// Guard against a half-open TCP connection where the gRPC stream never delivers
// SessionStarted. Without this timeout the reconnect loop stalls indefinitely,
// keeping CompletableFutures for pending operations (e.g. acquireEphemeralSemaphore)
// unresolved even after the application-level acquire timeout has expired.
scheduler.schedule(() -> {
if (startFuture.isDone()) {
return;
}
logger.warn("stream {} connect timeout after {} ms, cancelling", hashCode(), connectTimeout.toMillis());
stream.cancel();
startFuture.complete(Result.fail(Status.of(StatusCode.TIMEOUT)));
}, connectTimeout.toMillis(), TimeUnit.MILLISECONDS);
Comment on lines +61 to +72
}

public CompletableFuture<Status> getFinishedFuture() {
Expand Down