From 45c562d39354e164939be32d4fa91026b1ae65ed Mon Sep 17 00:00:00 2001 From: Vikrant Puppala Date: Mon, 1 Jun 2026 06:31:51 +0000 Subject: [PATCH] Fix flaky TestConn_ExecContext: don't cancel a finished operation The sentinel Watch loop had a race: once StatusFn reported Done, it spawned the OnDoneFn processor and looped back into its select with two potentially-ready cases at once -- the processor result (resCh) and ctx.Done(). Go's select picks randomly among ready cases, so a context cancellation arriving at the same instant as completion could win and trigger OnCancelFn, cancelling an operation that had already finished. TestConn_ExecContext/"ExecContext uses new context to close operation" cancels the context inside GetOperationStatus while returning FINISHED, then asserted cancelOperationCount == 1 -- i.e. it asserted on the losing side of that random select. The Go scheduler version only shifts the probability; the race is in the design, not the language version. The racy pattern dates to #41 (Nov 2022) and the test to #86 (Dec 2022); it was not introduced by #364. Fix: once Done is observed, the operation has logically completed and its outcome is authoritative. Drain the OnDoneFn result via a new waitForDone helper that selects only on resCh/errCh, no longer on ctx.Done()/timeout. A finished operation is now never cancelled, deterministically. Update the connection test to assert the correct deterministic behaviour (success, cancelOperationCount == 0, CloseOperation still runs on a fresh context) and add a sentinel regression test that pre-cancels the context in the same StatusFn call that reports Done. Verified: TestConn_ExecContext -count=300 green; sentinel suite green under -race -count=5. Co-authored-by: Isaac Signed-off-by: Vikrant Puppala --- connection_test.go | 15 ++++++++--- internal/sentinel/sentinel.go | 23 ++++++++++++++-- internal/sentinel/sentinel_test.go | 43 ++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+), 5 deletions(-) diff --git a/connection_test.go b/connection_test.go index 92f08371..7fd48ce9 100644 --- a/connection_test.go +++ b/connection_test.go @@ -1576,11 +1576,20 @@ func TestConn_ExecContext(t *testing.T) { ctx, cancel = context.WithCancel(ctx) defer cancel() res, err := testConn.ExecContext(ctx, "insert 10", []driver.NamedValue{}) - assert.Error(t, err) - assert.Nil(t, res) + // GetOperationStatus reports FINISHED in the same call that cancels the + // context. Because the operation completed, the sentinel must report + // success and must NOT cancel a finished operation, regardless of + // scheduler timing — this assertion previously raced on cancelOperationCount. + assert.NoError(t, err) + assert.NotNil(t, res) + rowsAffected, _ := res.RowsAffected() + assert.Equal(t, int64(10), rowsAffected) assert.Equal(t, 1, executeStatementCount) - assert.Equal(t, 1, cancelOperationCount) + assert.Equal(t, 0, cancelOperationCount) assert.Equal(t, 1, getOperationStatusCount) + // CloseOperation must still run, on a fresh (non-cancelled) context, even + // though the context passed to ExecContext was cancelled mid-poll. Its + // FnCloseOperation asserts ctx.Err() == nil. assert.Equal(t, 1, closeOperationCount) }) } diff --git a/internal/sentinel/sentinel.go b/internal/sentinel/sentinel.go index 529f2299..fe594018 100644 --- a/internal/sentinel/sentinel.go +++ b/internal/sentinel/sentinel.go @@ -114,10 +114,16 @@ func (s Sentinel) Watch(ctx context.Context, interval, timeout time.Duration) (W if done() { intervalTimer.Stop() if s.OnDoneFn != nil { + // The operation has completed, so the OnDoneFn result is the + // authoritative outcome. Run the processor and drain its + // result without selecting on ctx.Done()/timeout: once we've + // observed completion, a concurrent cancellation or timeout + // must not race the success path and trigger an unnecessary + // cancel of a finished operation. go processor(statusResp) - } else { - return WatchSuccess, statusResp, nil + return s.waitForDone(resCh, errCh) } + return WatchSuccess, statusResp, nil } case err := <-errCh: return WatchErr, nil, err @@ -136,3 +142,16 @@ func (s Sentinel) Watch(ctx context.Context, interval, timeout time.Duration) (W } } } + +// waitForDone blocks until the asynchronous OnDoneFn processor reports its +// result. The operation has already been observed as complete by the time this +// is called, so the outcome is determined solely by OnDoneFn and is no longer +// subject to cancellation or timeout. +func (s Sentinel) waitForDone(resCh chan any, errCh chan error) (WatchStatus, any, error) { + select { + case err := <-errCh: + return WatchErr, nil, err + case res := <-resCh: + return WatchSuccess, res, nil + } +} diff --git a/internal/sentinel/sentinel_test.go b/internal/sentinel/sentinel_test.go index 20758ec7..67f588e4 100644 --- a/internal/sentinel/sentinel_test.go +++ b/internal/sentinel/sentinel_test.go @@ -257,6 +257,49 @@ func TestWatch(t *testing.T) { assert.Nil(t, res) assert.ErrorContains(t, err, "failed") }) + t.Run("it should not cancel when context is canceled concurrently with Done", func(t *testing.T) { + // Regression test for a race where, once StatusFn reports Done, a + // context cancellation that arrives at the same time could win the + // select against the OnDoneFn result and trigger OnCancelFn for an + // operation that has already completed. A finished operation must + // never be canceled, regardless of scheduler timing. + // + // We pre-cancel the context and report Done in the same StatusFn call, + // so both the success path and ctx.Done() are ready when Watch + // re-enters its select. The outcome must deterministically be success + // with no cancellation, on every run. + // + // A tiny non-zero interval keeps each iteration to microseconds (a 0 + // interval would default to 100ms and make the loop take many seconds). + // The fix makes the outcome deterministic, so a modest iteration count + // is enough to guard against regression cheaply. + for i := 0; i < 100; i++ { + ctx, cancel := context.WithCancel(context.Background()) + cancelFnCalls := 0 + s := Sentinel{ + StatusFn: func() (Done, any, error) { + // Cancel the context right as we report completion. + cancel() + return func() bool { + return true + }, "completed", nil + }, + OnCancelFn: func() (any, error) { + cancelFnCalls++ + return nil, nil + }, + OnDoneFn: func(statusResp any) (any, error) { + return statusResp, nil + }, + } + status, res, err := s.Watch(ctx, time.Microsecond, 0) + assert.Equal(t, WatchSuccess, status) + assert.Equal(t, "completed", res) + assert.Equal(t, 0, cancelFnCalls, "OnCancelFn must not be called for a completed operation (iteration %d)", i) + assert.NoError(t, err) + cancel() + } + }) t.Run("it should return statusFn error", func(t *testing.T) { statusFnCalls := 0 cancelFnCalls := 0