From 45c562d39354e164939be32d4fa91026b1ae65ed Mon Sep 17 00:00:00 2001
From: Vikrant Puppala <vikrant.puppala@databricks.com>
Date: Mon, 1 Jun 2026 06:31:51 +0000
Subject: [PATCH] Fix flaky TestConn_ExecContext: don't cancel a finished
 operation

The sentinel Watch loop had a race: once StatusFn reported Done, it
spawned the OnDoneFn processor and looped back into its select with two
potentially-ready cases at once -- the processor result (resCh) and
ctx.Done(). Go's select picks randomly among ready cases, so a context
cancellation arriving at the same instant as completion could win and
trigger OnCancelFn, cancelling an operation that had already finished.

TestConn_ExecContext/"ExecContext uses new context to close operation"
cancels the context inside GetOperationStatus while returning FINISHED,
then asserted cancelOperationCount == 1 -- i.e. it asserted on the losing
side of that random select. The Go scheduler version only shifts the
probability; the race is in the design, not the language version. The
racy pattern dates to #41 (Nov 2022) and the test to #86 (Dec 2022); it
was not introduced by #364.

Fix: once Done is observed, the operation has logically completed and its
outcome is authoritative. Drain the OnDoneFn result via a new waitForDone
helper that selects only on resCh/errCh, no longer on ctx.Done()/timeout.
A finished operation is now never cancelled, deterministically.

Update the connection test to assert the correct deterministic behaviour
(success, cancelOperationCount == 0, CloseOperation still runs on a fresh
context) and add a sentinel regression test that pre-cancels the context
in the same StatusFn call that reports Done.

Verified: TestConn_ExecContext -count=300 green; sentinel suite green
under -race -count=5.

Co-authored-by: Isaac
Signed-off-by: Vikrant Puppala <vikrant.puppala@databricks.com>
---
 connection_test.go                 | 15 ++++++++---
 internal/sentinel/sentinel.go      | 23 ++++++++++++++--
 internal/sentinel/sentinel_test.go | 43 ++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 5 deletions(-)

diff --git a/connection_test.go b/connection_test.go
index 92f08371..7fd48ce9 100644
--- a/connection_test.go
+++ b/connection_test.go
@@ -1576,11 +1576,20 @@ func TestConn_ExecContext(t *testing.T) {
 		ctx, cancel = context.WithCancel(ctx)
 		defer cancel()
 		res, err := testConn.ExecContext(ctx, "insert 10", []driver.NamedValue{})
-		assert.Error(t, err)
-		assert.Nil(t, res)
+		// GetOperationStatus reports FINISHED in the same call that cancels the
+		// context. Because the operation completed, the sentinel must report
+		// success and must NOT cancel a finished operation, regardless of
+		// scheduler timing — this assertion previously raced on cancelOperationCount.
+		assert.NoError(t, err)
+		assert.NotNil(t, res)
+		rowsAffected, _ := res.RowsAffected()
+		assert.Equal(t, int64(10), rowsAffected)
 		assert.Equal(t, 1, executeStatementCount)
-		assert.Equal(t, 1, cancelOperationCount)
+		assert.Equal(t, 0, cancelOperationCount)
 		assert.Equal(t, 1, getOperationStatusCount)
+		// CloseOperation must still run, on a fresh (non-cancelled) context, even
+		// though the context passed to ExecContext was cancelled mid-poll. Its
+		// FnCloseOperation asserts ctx.Err() == nil.
 		assert.Equal(t, 1, closeOperationCount)
 	})
 }
diff --git a/internal/sentinel/sentinel.go b/internal/sentinel/sentinel.go
index 529f2299..fe594018 100644
--- a/internal/sentinel/sentinel.go
+++ b/internal/sentinel/sentinel.go
@@ -114,10 +114,16 @@ func (s Sentinel) Watch(ctx context.Context, interval, timeout time.Duration) (W
 			if done() {
 				intervalTimer.Stop()
 				if s.OnDoneFn != nil {
+					// The operation has completed, so the OnDoneFn result is the
+					// authoritative outcome. Run the processor and drain its
+					// result without selecting on ctx.Done()/timeout: once we've
+					// observed completion, a concurrent cancellation or timeout
+					// must not race the success path and trigger an unnecessary
+					// cancel of a finished operation.
 					go processor(statusResp)
-				} else {
-					return WatchSuccess, statusResp, nil
+					return s.waitForDone(resCh, errCh)
 				}
+				return WatchSuccess, statusResp, nil
 			}
 		case err := <-errCh:
 			return WatchErr, nil, err
@@ -136,3 +142,16 @@ func (s Sentinel) Watch(ctx context.Context, interval, timeout time.Duration) (W
 		}
 	}
 }
+
+// waitForDone blocks until the asynchronous OnDoneFn processor reports its
+// result. The operation has already been observed as complete by the time this
+// is called, so the outcome is determined solely by OnDoneFn and is no longer
+// subject to cancellation or timeout.
+func (s Sentinel) waitForDone(resCh chan any, errCh chan error) (WatchStatus, any, error) {
+	select {
+	case err := <-errCh:
+		return WatchErr, nil, err
+	case res := <-resCh:
+		return WatchSuccess, res, nil
+	}
+}
diff --git a/internal/sentinel/sentinel_test.go b/internal/sentinel/sentinel_test.go
index 20758ec7..67f588e4 100644
--- a/internal/sentinel/sentinel_test.go
+++ b/internal/sentinel/sentinel_test.go
@@ -257,6 +257,49 @@ func TestWatch(t *testing.T) {
 		assert.Nil(t, res)
 		assert.ErrorContains(t, err, "failed")
 	})
+	t.Run("it should not cancel when context is canceled concurrently with Done", func(t *testing.T) {
+		// Regression test for a race where, once StatusFn reports Done, a
+		// context cancellation that arrives at the same time could win the
+		// select against the OnDoneFn result and trigger OnCancelFn for an
+		// operation that has already completed. A finished operation must
+		// never be canceled, regardless of scheduler timing.
+		//
+		// We pre-cancel the context and report Done in the same StatusFn call,
+		// so both the success path and ctx.Done() are ready when Watch
+		// re-enters its select. The outcome must deterministically be success
+		// with no cancellation, on every run.
+		//
+		// A tiny non-zero interval keeps each iteration to microseconds (a 0
+		// interval would default to 100ms and make the loop take many seconds).
+		// The fix makes the outcome deterministic, so a modest iteration count
+		// is enough to guard against regression cheaply.
+		for i := 0; i < 100; i++ {
+			ctx, cancel := context.WithCancel(context.Background())
+			cancelFnCalls := 0
+			s := Sentinel{
+				StatusFn: func() (Done, any, error) {
+					// Cancel the context right as we report completion.
+					cancel()
+					return func() bool {
+						return true
+					}, "completed", nil
+				},
+				OnCancelFn: func() (any, error) {
+					cancelFnCalls++
+					return nil, nil
+				},
+				OnDoneFn: func(statusResp any) (any, error) {
+					return statusResp, nil
+				},
+			}
+			status, res, err := s.Watch(ctx, time.Microsecond, 0)
+			assert.Equal(t, WatchSuccess, status)
+			assert.Equal(t, "completed", res)
+			assert.Equal(t, 0, cancelFnCalls, "OnCancelFn must not be called for a completed operation (iteration %d)", i)
+			assert.NoError(t, err)
+			cancel()
+		}
+	})
 	t.Run("it should return statusFn error", func(t *testing.T) {
 		statusFnCalls := 0
 		cancelFnCalls := 0