diff --git a/proxy/dualwrite.go b/proxy/dualwrite.go index 9937c880d..9ec0b1888 100644 --- a/proxy/dualwrite.go +++ b/proxy/dualwrite.go @@ -319,6 +319,8 @@ func (d *DualWriter) writeSecondary(cmd string, iArgs []any) { // structured warning with enough context to diagnose EVALSHA timeouts. func (d *DualWriter) recordSecondaryWriteFailure(cmd string, iArgs []any, elapsed time.Duration, attempts int, usedNOSCRIPTFallback bool, sErr error) { d.metrics.SecondaryWriteErrors.Inc() + reason := classifySecondaryWriteError(sErr) + d.metrics.SecondaryWriteErrorsByReason.WithLabelValues(cmd, reason).Inc() d.metrics.CommandTotal.WithLabelValues(cmd, d.secondary.Name(), "error").Inc() fingerprint := fmt.Sprintf("secondary_write_%s", cmd) if d.sentry.ShouldReport(fingerprint) { @@ -488,3 +490,42 @@ func argsToBytes(iArgs []any) [][]byte { } return out } + +// secondaryWriteErrorPatterns maps substrings found in secondary-write error +// messages to their Prometheus reason label. The list is scanned in order so +// more-specific patterns (e.g. "retry limit exceeded" which embeds "write +// conflict") must precede the generic ones to win the classification. +var secondaryWriteErrorPatterns = []struct { + substr string + reason string +}{ + {"retry limit exceeded", "retry_limit"}, + {"write conflict", "write_conflict"}, + {"deadline exceeded", "deadline_exceeded"}, + {"not leader", "not_leader"}, + {"leader not found", "not_leader"}, + {"txn already committed", "txn_already_finalized"}, + {"txn already aborted", "txn_already_finalized"}, + {"txn locked", "txn_locked"}, +} + +// classifySecondaryWriteError maps a secondary-write error to a small fixed set +// of reason labels suitable for a Prometheus counter. The elastickv secondary +// backend is in-house, so matching on substrings of the error message is safe. +// +// Order in secondaryWriteErrorPatterns matters (see its doc). +func classifySecondaryWriteError(err error) string { + if err == nil { + return "other" + } + if errors.Is(err, context.DeadlineExceeded) { + return "deadline_exceeded" + } + msg := err.Error() + for _, p := range secondaryWriteErrorPatterns { + if strings.Contains(msg, p.substr) { + return p.reason + } + } + return "other" +} diff --git a/proxy/metrics.go b/proxy/metrics.go index f2b4d406d..cd8b3837e 100644 --- a/proxy/metrics.go +++ b/proxy/metrics.go @@ -7,12 +7,13 @@ type ProxyMetrics struct { CommandTotal *prometheus.CounterVec CommandDuration *prometheus.HistogramVec - PrimaryWriteErrors prometheus.Counter - SecondaryWriteErrors prometheus.Counter - PrimaryReadErrors prometheus.Counter - ShadowReadErrors prometheus.Counter - Divergences *prometheus.CounterVec - MigrationGaps *prometheus.CounterVec + PrimaryWriteErrors prometheus.Counter + SecondaryWriteErrors prometheus.Counter + SecondaryWriteErrorsByReason *prometheus.CounterVec + PrimaryReadErrors prometheus.Counter + ShadowReadErrors prometheus.Counter + Divergences *prometheus.CounterVec + MigrationGaps *prometheus.CounterVec ActiveConnections prometheus.Gauge @@ -48,6 +49,11 @@ func NewProxyMetrics(reg prometheus.Registerer) *ProxyMetrics { Name: "secondary_write_errors_total", Help: "Total write errors from the secondary backend.", }), + SecondaryWriteErrorsByReason: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "proxy", + Name: "secondary_write_errors_by_reason_total", + Help: "secondary write failures broken out by redis command and error classification (write_conflict / retry_limit / not_leader / deadline_exceeded / txn_already_finalized / txn_locked / other)", + }, []string{"cmd", "reason"}), PrimaryReadErrors: prometheus.NewCounter(prometheus.CounterOpts{ Namespace: "proxy", Name: "primary_read_errors_total", @@ -98,6 +104,7 @@ func NewProxyMetrics(reg prometheus.Registerer) *ProxyMetrics { m.CommandDuration, m.PrimaryWriteErrors, m.SecondaryWriteErrors, + m.SecondaryWriteErrorsByReason, m.PrimaryReadErrors, m.ShadowReadErrors, m.Divergences, diff --git a/proxy/metrics_test.go b/proxy/metrics_test.go new file mode 100644 index 000000000..4f75d0866 --- /dev/null +++ b/proxy/metrics_test.go @@ -0,0 +1,77 @@ +package proxy + +import ( + "context" + "errors" + "fmt" + "testing" + "time" + + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/assert" +) + +func TestClassifySecondaryWriteError(t *testing.T) { + tests := []struct { + name string + err error + want string + }{ + {"nil error falls back to other", nil, "other"}, + {"plain write conflict", errors.New("write conflict on key foo"), "write_conflict"}, + { + "retry-limit message wins over embedded write-conflict substring", + errors.New("redis txn retry limit exceeded: write conflict"), + "retry_limit", + }, + {"not leader", errors.New("not leader"), "not_leader"}, + {"linearizable read not leader", errors.New("linearizable read: not leader"), "not_leader"}, + {"leader not found (ErrLeaderNotFound message)", errors.New("leader not found"), "not_leader"}, + {"context deadline exceeded via errors.Is", context.DeadlineExceeded, "deadline_exceeded"}, + {"wrapped deadline exceeded", fmt.Errorf("dispatch failed: %w", context.DeadlineExceeded), "deadline_exceeded"}, + {"deadline exceeded substring only", errors.New("rpc: deadline exceeded"), "deadline_exceeded"}, + {"txn already committed", errors.New("txn already committed"), "txn_already_finalized"}, + {"txn already aborted", errors.New("txn already aborted"), "txn_already_finalized"}, + {"txn locked", errors.New("key: foo: txn locked"), "txn_locked"}, + {"unknown", errors.New("some random failure"), "other"}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := classifySecondaryWriteError(tc.err) + assert.Equal(t, tc.want, got) + }) + } +} + +func TestRecordSecondaryWriteFailureEmitsBothCounters(t *testing.T) { + metrics := newTestMetrics() + primary := newMockBackend("primary") + secondary := newMockBackend("secondary") + d := NewDualWriter( + primary, secondary, + ProxyConfig{Mode: ModeDualWrite, SecondaryTimeout: time.Second}, + metrics, newTestSentry(), testLogger, + ) + + err := errors.New("write conflict on !txn|rb|foo") + d.recordSecondaryWriteFailure("SET", []any{"SET", "k", "v"}, 5*time.Millisecond, 1, false, err) + + assert.InDelta(t, 1, testutil.ToFloat64(metrics.SecondaryWriteErrors), 0.001, + "unlabelled counter should still tick for dashboard backwards compatibility") + assert.InDelta(t, 1, testutil.ToFloat64( + metrics.SecondaryWriteErrorsByReason.WithLabelValues("SET", "write_conflict"), + ), 0.001, "labelled counter should record the write_conflict reason") + + // Second failure: different reason + command should populate a distinct label pair. + d.recordSecondaryWriteFailure("EVALSHA", []any{"EVALSHA", "deadbeef"}, time.Millisecond, 3, false, + errors.New("redis txn retry limit exceeded: write conflict")) + + assert.InDelta(t, 2, testutil.ToFloat64(metrics.SecondaryWriteErrors), 0.001) + assert.InDelta(t, 1, testutil.ToFloat64( + metrics.SecondaryWriteErrorsByReason.WithLabelValues("EVALSHA", "retry_limit"), + ), 0.001) + assert.InDelta(t, 1, testutil.ToFloat64( + metrics.SecondaryWriteErrorsByReason.WithLabelValues("SET", "write_conflict"), + ), 0.001, "previous label pair should be unchanged") +}