diff --git a/README.md b/README.md index abd394582..2ac0a7f16 100644 --- a/README.md +++ b/README.md @@ -52,16 +52,20 @@ Provisioned monitoring assets live under: - `monitoring/prometheus/prometheus.yml` - `monitoring/grafana/dashboards/elastickv-cluster-overview.json` -- `monitoring/grafana/dashboards/elastickv-cluster-summary.json` +- `monitoring/grafana/dashboards/elastickv-dynamodb.json` - `monitoring/grafana/dashboards/elastickv-raft-status.json` +- `monitoring/grafana/dashboards/elastickv-redis-summary.json` +- `monitoring/grafana/dashboards/elastickv-pebble-internals.json` - `monitoring/grafana/provisioning/` - `monitoring/docker-compose.yml` The provisioned dashboards are organized by operator task: -- `Elastickv Cluster Overview` is the landing page for leader identity, cluster-wide latency/error posture, and per-node Raft health -- `Elastickv Request Health` is the DynamoDB/API drilldown for slow operations, noisy nodes, and hot/erroring tables +- `Elastickv Cluster` is the landing page for leader identity, cluster-wide latency/error posture, and per-node Raft health +- `Elastickv DynamoDB` is the DynamoDB-compatible API drilldown for slow operations, noisy nodes, and hot/erroring tables - `Elastickv Raft Status` is the control-plane drilldown for membership, leader changes, failed proposals, node state, index drift, backlog, and leader contact +- `Elastickv Redis` is the Redis-compatible API drilldown for per-command throughput/latency/errors, with a collapsible `Hot Path` row for GET fast-path (PR #560) verification +- `Elastickv Pebble Internals` is the storage-engine drilldown for block cache, L0 pressure, compactions, memtables, and store write conflicts If you bind `--metricsAddress` to a non-loopback address, `--metricsToken` is required. 
Prometheus must send the same bearer token, for example: diff --git a/docs/redis_hotpath_dashboard.md b/docs/redis_hotpath_dashboard.md index 38f84544f..ae6bea49b 100644 --- a/docs/redis_hotpath_dashboard.md +++ b/docs/redis_hotpath_dashboard.md @@ -1,9 +1,11 @@ # Redis Hot Path Dashboard (PR #560 verification) -`monitoring/grafana/dashboards/elastickv-redis-hotpath.json` is the +The "Hot Path (legacy PR #560)" collapsed row at the bottom of +`monitoring/grafana/dashboards/elastickv-redis-summary.json` is the operator view for the Redis GET hot path. It was added to confirm that PR #560 (`a45ca291` "perf(redis): fast-path GET to avoid ~17-seek type -probe") landed cleanly in production. +probe") landed cleanly in production. Expand the row to see the +panels described below. ## How to confirm #560 worked diff --git a/internal/raftengine/etcd/engine.go b/internal/raftengine/etcd/engine.go index 2a8ccc4d0..3e4a10f19 100644 --- a/internal/raftengine/etcd/engine.go +++ b/internal/raftengine/etcd/engine.go @@ -750,7 +750,7 @@ func (e *Engine) LastQuorumAck() time.Time { // heartbeat channel was full. Monotonic across the life of the engine. // Surfaced to Prometheus via the monitoring package so the hot-path // dashboard can graph stepCh saturation alongside LinearizableRead -// rate (see monitoring/grafana/dashboards/elastickv-redis-hotpath.json). +// rate (see the "Hot Path" row in monitoring/grafana/dashboards/elastickv-redis-summary.json). func (e *Engine) DispatchDropCount() uint64 { if e == nil { return 0 diff --git a/kv/coordinator.go b/kv/coordinator.go index de4264c4e..466ecf1a2 100644 --- a/kv/coordinator.go +++ b/kv/coordinator.go @@ -60,7 +60,7 @@ type LeaseReadObserver interface { // WithLeaseReadObserver wires a LeaseReadObserver onto a Coordinate. // This is the mechanism monitoring uses to surface the lease-hit ratio // panel on the Redis hot-path dashboard (see -// monitoring/grafana/dashboards/elastickv-redis-hotpath.json). 
+// the "Hot Path" row in monitoring/grafana/dashboards/elastickv-redis-summary.json). // // Typed-nil guard: a caller passing a typed-nil pointer // (e.g. `var o *myObserver; WithLeaseReadObserver(o)`) produces an diff --git a/monitoring/grafana/dashboards/elastickv-cluster-overview.json b/monitoring/grafana/dashboards/elastickv-cluster-overview.json index 5806cf5c6..3dda96dfd 100644 --- a/monitoring/grafana/dashboards/elastickv-cluster-overview.json +++ b/monitoring/grafana/dashboards/elastickv-cluster-overview.json @@ -1280,7 +1280,7 @@ }, "timepicker": {}, "timezone": "browser", - "title": "Elastickv Cluster Overview", + "title": "Elastickv Cluster", "uid": "elastickv-cluster", "version": 2 } diff --git a/monitoring/grafana/dashboards/elastickv-cluster-summary.json b/monitoring/grafana/dashboards/elastickv-dynamodb.json similarity index 99% rename from monitoring/grafana/dashboards/elastickv-cluster-summary.json rename to monitoring/grafana/dashboards/elastickv-dynamodb.json index 0fd41a61b..df485c029 100644 --- a/monitoring/grafana/dashboards/elastickv-cluster-summary.json +++ b/monitoring/grafana/dashboards/elastickv-dynamodb.json @@ -15,7 +15,7 @@ } ] }, - "description": "DynamoDB-compatible request health for elastickv: throughput, latency, errors, tables, and noisy nodes.", + "description": "DynamoDB-compatible API health for elastickv: per-operation throughput and latency, per-table request/error breakdown, and item-volume panels.", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, @@ -918,7 +918,7 @@ "elastickv", "dynamodb", "requests", - "summary" + "api" ], "templating": { "list": [ @@ -1019,7 +1019,7 @@ }, "timepicker": {}, "timezone": "browser", - "title": "Elastickv Request Health", - "uid": "elastickv-cluster-summary", + "title": "Elastickv DynamoDB", + "uid": "elastickv-dynamodb", "version": 2 } diff --git a/monitoring/grafana/dashboards/elastickv-pebble-internals.json b/monitoring/grafana/dashboards/elastickv-pebble-internals.json 
new file mode 100644 index 000000000..31d642bfe --- /dev/null +++ b/monitoring/grafana/dashboards/elastickv-pebble-internals.json @@ -0,0 +1,633 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Pebble storage engine internals for elastickv: block cache, L0 pressure, compactions, memtables, FSM apply sync mode, and store write conflicts.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "datasource": "$datasource", + "description": "Block cache hit rate = hits / (hits + misses). Greater than 95% is healthy; below 80% means the working set no longer fits in the configured block cache and every read falls through to the filesystem, which usually shows up as a step change in GET p99 and disk read IOPS.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": 0}, + {"color": "yellow", "value": 0.80}, + {"color": "green", "value": 0.95} + ] + }, + "min": 0, + "max": 1, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": {"h": 6, "w": 8, "x": 0, "y": 0}, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + } + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "sum(rate(elastickv_pebble_block_cache_hits_total{job=\"elastickv\",node_id=~\"$node_id\"}[$__rate_interval])) / clamp_min(sum(rate(elastickv_pebble_block_cache_hits_total{job=\"elastickv\",node_id=~\"$node_id\"}[$__rate_interval])) + sum(rate(elastickv_pebble_block_cache_misses_total{job=\"elastickv\",node_id=~\"$node_id\"}[$__rate_interval])), 1e-9)", + 
"legendFormat": "hit rate", + "refId": "A", + "instant": true + } + ], + "title": "Block Cache Hit Rate", + "type": "stat" + }, + { + "datasource": "$datasource", + "description": "Block cache hit rate over time. Watch for sudden drops after a deploy, a compaction burst, or a workload shift: these indicate the hot set no longer fits.", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisPlacement": "auto", + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "auto" + }, + "min": 0, + "max": 1, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": {"h": 6, "w": 16, "x": 8, "y": 0}, + "id": 2, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "sum(rate(elastickv_pebble_block_cache_hits_total{job=\"elastickv\",node_id=~\"$node_id\"}[$__rate_interval])) / clamp_min(sum(rate(elastickv_pebble_block_cache_hits_total{job=\"elastickv\",node_id=~\"$node_id\"}[$__rate_interval])) + sum(rate(elastickv_pebble_block_cache_misses_total{job=\"elastickv\",node_id=~\"$node_id\"}[$__rate_interval])), 1e-9)", + "legendFormat": "hit rate", + "range": true, + "refId": "A" + } + ], + "title": "Block Cache Hit Rate Over Time", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "description": "Current block cache size per node versus the configured capacity. 
If 'size' sits pegged at 'capacity' and hit rate is falling, the cache is full and evicting useful blocks; consider raising the capacity or shrinking the working set.", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisPlacement": "auto", + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "auto" + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": {"h": 6, "w": 12, "x": 0, "y": 6}, + "id": 3, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "elastickv_pebble_block_cache_size_bytes{job=\"elastickv\",node_id=~\"$node_id\"}", + "legendFormat": "size {{node_id}}", + "range": true, + "refId": "A" + }, + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "elastickv_pebble_block_cache_capacity_bytes{job=\"elastickv\",node_id=~\"$node_id\"}", + "legendFormat": "capacity {{node_id}}", + "range": true, + "refId": "B" + } + ], + "title": "Block Cache Size vs Capacity", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "description": "Block cache hit and miss rates side-by-side. 
A climbing miss rate is the earliest warning that the cache is under-sized; the hit-rate panel lags it because hit rate is a ratio.", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisPlacement": "auto", + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "auto" + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": {"h": 6, "w": 12, "x": 12, "y": 6}, + "id": 4, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "sum by (node_id) (rate(elastickv_pebble_block_cache_hits_total{job=\"elastickv\",node_id=~\"$node_id\"}[$__rate_interval]))", + "legendFormat": "hits {{node_id}}", + "range": true, + "refId": "A" + }, + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "sum by (node_id) (rate(elastickv_pebble_block_cache_misses_total{job=\"elastickv\",node_id=~\"$node_id\"}[$__rate_interval]))", + "legendFormat": "misses {{node_id}}", + "range": true, + "refId": "B" + } + ], + "title": "Block Cache Hits / Misses Rate", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "description": "Number of L0 sublevels per node. 
Pebble begins throttling writes around 20 sublevels and will hard-stall the FSM if compaction cannot keep up; sustained red here usually means ingest outran compaction.", + "fieldConfig": { + "defaults": { + "color": {"mode": "thresholds"}, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": 0}, + {"color": "orange", "value": 10}, + {"color": "red", "value": 20} + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": {"h": 6, "w": 8, "x": 0, "y": 12}, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + } + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "elastickv_pebble_l0_sublevels{job=\"elastickv\",node_id=~\"$node_id\"}", + "legendFormat": "{{node_id}}", + "refId": "A", + "instant": true + } + ], + "title": "L0 Sublevels", + "type": "stat" + }, + { + "datasource": "$datasource", + "description": "Number of files currently in L0 per node. More than ~20 files and Pebble's write-stall heuristic will start holding up new writes. 
Use alongside the sublevel panel to distinguish 'many small files' from 'many overlapping files'.", + "fieldConfig": { + "defaults": { + "color": {"mode": "thresholds"}, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": 0}, + {"color": "orange", "value": 10}, + {"color": "red", "value": 20} + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": {"h": 6, "w": 8, "x": 8, "y": 12}, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + } + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "elastickv_pebble_l0_num_files{job=\"elastickv\",node_id=~\"$node_id\"}", + "legendFormat": "{{node_id}}", + "refId": "A", + "instant": true + } + ], + "title": "L0 Num Files", + "type": "stat" + }, + { + "datasource": "$datasource", + "description": "Pebble's own estimate of outstanding compaction work, in bytes. Growing unboundedly means compaction is losing the race; compare against the compaction rate below.", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisPlacement": "auto", + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "auto" + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": {"h": 6, "w": 8, "x": 16, "y": 12}, + "id": 7, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "elastickv_pebble_compact_estimated_debt_bytes{job=\"elastickv\",node_id=~\"$node_id\"}", + "legendFormat": "{{node_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Compaction Estimated Debt", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "description": "Number of compactions currently running per node. 
Should move in lockstep with the configured max concurrent compactions; a flat-zero line while debt climbs indicates compactions are being blocked (e.g. by a flush pipeline stall).", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisPlacement": "auto", + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "auto" + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": {"h": 6, "w": 8, "x": 0, "y": 18}, + "id": 8, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "elastickv_pebble_compact_in_progress{job=\"elastickv\",node_id=~\"$node_id\"}", + "legendFormat": "{{node_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Compactions In Progress", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "description": "Rate of completed compactions per node. 
Correlate with the debt panel: a healthy engine shows debt oscillating while this rate is non-zero.", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisPlacement": "auto", + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "auto" + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": {"h": 6, "w": 8, "x": 8, "y": 18}, + "id": 9, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "rate(elastickv_pebble_compact_count_total{job=\"elastickv\",node_id=~\"$node_id\"}[$__rate_interval])", + "legendFormat": "{{node_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Compaction Rate", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "description": "FSM apply sync mode gauge: the label `mode` is `sync` or `nosync`, and the value is 1 for the active mode. Operators care because nosync gives the fastest Raft apply but loses unflushed writes on a crash; sync pays fsync on every batch. Green = nosync in this deploy's convention. **Requires `elastickv_pebble_fsm_apply_sync_mode` (PR #592). 
Empty until that PR merges — this is not a scrape failure.**", + "fieldConfig": { + "defaults": { + "color": {"mode": "thresholds"}, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": 0}, + {"color": "green", "value": 1} + ] + }, + "mappings": [ + {"type": "value", "options": {"0": {"text": "sync"}, "1": {"text": "nosync"}}} + ], + "unit": "short" + }, + "overrides": [] + }, + "gridPos": {"h": 6, "w": 8, "x": 16, "y": 18}, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "none", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + } + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "elastickv_pebble_fsm_apply_sync_mode{job=\"elastickv\",node_id=~\"$node_id\",mode=\"nosync\"}", + "legendFormat": "{{node_id}} nosync", + "refId": "A", + "instant": true + } + ], + "title": "FSM Apply Sync Mode (nosync = 1 is desired)", + "type": "stat" + }, + { + "datasource": "$datasource", + "description": "Active memtable count per node. 
Normally 1-2; sustained growth here means flushes are not keeping up with writes, which cascades into L0 file growth and eventually a write stall.", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisPlacement": "auto", + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "auto" + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": {"h": 6, "w": 8, "x": 0, "y": 24}, + "id": 11, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "elastickv_pebble_memtable_count{job=\"elastickv\",node_id=~\"$node_id\"}", + "legendFormat": "{{node_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Memtable Count", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "description": "Total bytes held in active memtables per node. Caps at ~MemTableSize * MaxMemtableCount; steady readings near that cap mean flush is the bottleneck.", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisPlacement": "auto", + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "auto" + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": {"h": 6, "w": 8, "x": 8, "y": 24}, + "id": 12, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "elastickv_pebble_memtable_size_bytes{job=\"elastickv\",node_id=~\"$node_id\"}", + "legendFormat": "{{node_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Memtable Size", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "description": "Memtables that have been flushed but cannot yet be freed because an iterator or snapshot still references them. 
Climbing zombie counts indicate leaked iterators or long-lived snapshots pinning memory.", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisPlacement": "auto", + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "auto" + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": {"h": 6, "w": 8, "x": 16, "y": 24}, + "id": 13, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "elastickv_pebble_memtable_zombie_count{job=\"elastickv\",node_id=~\"$node_id\"}", + "legendFormat": "{{node_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Memtable Zombies", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "description": "Rate of OCC-style store write conflicts, grouped by key prefix. Hot prefixes light up here before they show up as Lua retry pressure or client timeouts; use this to find the hotspot key family.", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisPlacement": "auto", + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "auto" + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": {"h": 6, "w": 24, "x": 0, "y": 30}, + "id": 14, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "sum by (key_prefix) (rate(elastickv_store_write_conflict_total{job=\"elastickv\",node_id=~\"$node_id\"}[$__rate_interval]))", + "legendFormat": "{{key_prefix}}", + "range": true, + "refId": "A" + } + ], + "title": "Store Write Conflicts by Prefix", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 41, + "tags": [ + "elastickv", + "pebble", + "storage", 
+ "internals" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "default", + "value": "default" + }, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": "$datasource", + "definition": "label_values(elastickv_pebble_l0_num_files{job=\"elastickv\"}, node_id)", + "includeAll": true, + "label": "Node ID", + "multi": true, + "name": "node_id", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(elastickv_pebble_l0_num_files{job=\"elastickv\"}, node_id)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Elastickv Pebble Internals", + "uid": "elastickv-pebble-internals", + "version": 1 +} diff --git a/monitoring/grafana/dashboards/elastickv-redis-hotpath.json b/monitoring/grafana/dashboards/elastickv-redis-hotpath.json deleted file mode 100644 index ffc936d60..000000000 --- a/monitoring/grafana/dashboards/elastickv-redis-hotpath.json +++ /dev/null @@ -1,648 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "description": "Redis GET hot-path health for elastickv. Built to verify PR #560 (GET fast-path skipping rawKeyTypeAt). The three panels that answer 'did #560 work?' 
are LinearizableRead Rate, GET Latency (p50/p95/p99), and Lease Fast-Path Hit Ratio: a successful rollout shows LinearizableRead rate collapsing, GET p99 holding or improving, and the hit ratio climbing toward 1.0.", - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 1, - "id": null, - "links": [], - "panels": [ - { - "datasource": "$datasource", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "red", - "value": 0 - }, - { - "color": "orange", - "value": 0.9 - }, - { - "color": "green", - "value": 0.99 - } - ] - }, - "min": 0, - "max": 1, - "unit": "percentunit" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 0, - "y": 0 - }, - "id": 1, - "options": { - "colorMode": "value", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "targets": [ - { - "datasource": "$datasource", - "editorMode": "code", - "expr": "sum(rate(elastickv_lease_read_total{job=\"$job\",node_id=~\"$node_id\",outcome=\"hit\"}[5m])) / clamp_min(sum(rate(elastickv_lease_read_total{job=\"$job\",node_id=~\"$node_id\"}[5m])), 1e-9)", - "legendFormat": "hit ratio", - "refId": "A", - "instant": true - } - ], - "title": "Lease Fast-Path Hit Ratio", - "type": "stat", - "description": "Fraction of Coordinator.LeaseRead calls served from the local AppliedIndex instead of a full LinearizableRead. Steady-state should be >= 0.99 on a hot workload; a drop indicates leader flaps, clock-skew-driven lease expiry, or a regression on the fast path. This is one of the three #560-verification panels: if #560 landed, GET traffic should flow through LeaseRead and push this toward 1.0." 
- }, - { - "datasource": "$datasource", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "orange", - "value": 0.05 - }, - { - "color": "red", - "value": 0.25 - } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 6, - "y": 0 - }, - "id": 2, - "options": { - "colorMode": "value", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "targets": [ - { - "datasource": "$datasource", - "editorMode": "code", - "expr": "histogram_quantile(0.99, sum by (le) (rate(elastickv_redis_request_duration_seconds_bucket{job=\"$job\",node_id=~\"$node_id\",command=\"GET\",outcome=\"success\"}[5m])))", - "legendFormat": "GET p99", - "refId": "A", - "instant": true - } - ], - "title": "GET p99 (success)", - "type": "stat", - "description": "Current p99 latency for successful GET commands over the last 5 minutes. One of the three #560-verification panels: the fast path halved pebble seek count, so this should be FLAT or DOWN across the rollout, never up." 
- }, - { - "datasource": "$datasource", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "orange", - "value": 10 - }, - { - "color": "red", - "value": 100 - } - ] - }, - "unit": "ops" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 12, - "y": 0 - }, - "id": 3, - "options": { - "colorMode": "value", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "targets": [ - { - "datasource": "$datasource", - "editorMode": "code", - "expr": "sum(rate(elastickv_lease_read_total{job=\"$job\",node_id=~\"$node_id\",outcome=\"miss\"}[5m]))", - "legendFormat": "lease misses/s", - "refId": "A", - "instant": true - } - ], - "title": "LinearizableRead Rate (lease miss)", - "type": "stat", - "description": "Rate of Coordinator.LeaseRead calls that FELL BACK to LinearizableRead. Every miss corresponds to one slow-path raft ReadIndex round-trip, the exact behaviour #560 eliminates for GETs on steady leaders. This is the third #560-verification panel: watch it collapse as the rollout reaches each node." 
- }, - { - "datasource": "$datasource", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "orange", - "value": 0.001 - }, - { - "color": "red", - "value": 0.01 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 0 - }, - "id": 4, - "options": { - "colorMode": "value", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "targets": [ - { - "datasource": "$datasource", - "editorMode": "code", - "expr": "sum(rate(elastickv_redis_errors_total{job=\"$job\",node_id=~\"$node_id\",command=\"GET\"}[5m])) / clamp_min(sum(rate(elastickv_redis_requests_total{job=\"$job\",node_id=~\"$node_id\",command=\"GET\"}[5m])), 1e-9)", - "legendFormat": "GET error ratio", - "refId": "A", - "instant": true - } - ], - "title": "GET Error Ratio", - "type": "stat", - "description": "Fraction of GET requests returning an error. Paired with the GET p99 stat so operators can tell latency wins from quietly-shed traffic: a fast-path bug that mis-classifies a key could turn latency wins into NIL responses, which would light up this tile but not the latency one." 
- }, - { - "datasource": "$datasource", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisPlacement": "auto", - "lineInterpolation": "linear", - "lineWidth": 1, - "showPoints": "auto" - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 4 - }, - "id": 5, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": "$datasource", - "editorMode": "code", - "expr": "histogram_quantile(0.50, sum by (le) (rate(elastickv_redis_request_duration_seconds_bucket{job=\"$job\",node_id=~\"$node_id\",command=\"GET\",outcome=\"success\"}[5m])))", - "legendFormat": "p50", - "range": true, - "refId": "A" - }, - { - "datasource": "$datasource", - "editorMode": "code", - "expr": "histogram_quantile(0.95, sum by (le) (rate(elastickv_redis_request_duration_seconds_bucket{job=\"$job\",node_id=~\"$node_id\",command=\"GET\",outcome=\"success\"}[5m])))", - "legendFormat": "p95", - "range": true, - "refId": "B" - }, - { - "datasource": "$datasource", - "editorMode": "code", - "expr": "histogram_quantile(0.99, sum by (le) (rate(elastickv_redis_request_duration_seconds_bucket{job=\"$job\",node_id=~\"$node_id\",command=\"GET\",outcome=\"success\"}[5m])))", - "legendFormat": "p99", - "range": true, - "refId": "C" - } - ], - "title": "GET Latency (p50 / p95 / p99)", - "type": "timeseries", - "description": "Per-quantile GET latency taken from elastickv_redis_request_duration_seconds. #560 reduces string-GET pebble SeekGE calls from ~17 to 1-2, so the p50 should barely move (it was already fast) while p95 and especially p99 tighten on the head of the distribution. Overlay with the deploy annotation to see the step change." 
- }, - { - "datasource": "$datasource", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisPlacement": "auto", - "lineInterpolation": "linear", - "lineWidth": 1, - "showPoints": "auto" - }, - "unit": "reqps" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 4 - }, - "id": 6, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": "$datasource", - "editorMode": "code", - "expr": "sum by (node_id) (rate(elastickv_redis_requests_total{job=\"$job\",node_id=~\"$node_id\",command=\"GET\",outcome=\"success\"}[5m]))", - "legendFormat": "{{node_id}} success", - "range": true, - "refId": "A" - }, - { - "datasource": "$datasource", - "editorMode": "code", - "expr": "sum by (node_id) (rate(elastickv_redis_requests_total{job=\"$job\",node_id=~\"$node_id\",command=\"GET\",outcome=\"error\"}[5m]))", - "legendFormat": "{{node_id}} error", - "range": true, - "refId": "B" - } - ], - "title": "GET Throughput and Errors (per node)", - "type": "timeseries", - "description": "Per-node GET request rate split by outcome. Shows whether the rollout affects traffic distribution: if one node's GET rate collapses post-deploy without a matching rise elsewhere, clients are likely disconnecting instead of fast-pathing, which invalidates the latency-win interpretation." 
- }, - { - "datasource": "$datasource", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisPlacement": "auto", - "fillOpacity": 10, - "lineInterpolation": "linear", - "lineWidth": 1, - "showPoints": "auto", - "stacking": { - "mode": "none" - } - }, - "unit": "ops" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 12 - }, - "id": 7, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": "$datasource", - "editorMode": "code", - "expr": "sum(rate(elastickv_lease_read_total{job=\"$job\",node_id=~\"$node_id\",outcome=\"hit\"}[5m]))", - "legendFormat": "hits", - "range": true, - "refId": "A" - }, - { - "datasource": "$datasource", - "editorMode": "code", - "expr": "sum(rate(elastickv_lease_read_total{job=\"$job\",node_id=~\"$node_id\",outcome=\"miss\"}[5m]))", - "legendFormat": "misses (= LinearizableRead)", - "range": true, - "refId": "B" - } - ], - "title": "Lease Hit vs LinearizableRead Rate Over Time", - "type": "timeseries", - "description": "Hit and miss rates as time series so operators can correlate miss spikes with leader elections (see the Raft dashboard's leader_changes counter) or lease clock-skew events. The miss series IS the raft engine's slow-path read rate; a clean #560 rollout should show this line fall sharply while hits rise." 
- }, - { - "datasource": "$datasource", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisPlacement": "auto", - "lineInterpolation": "linear", - "lineWidth": 1, - "showPoints": "auto" - }, - "unit": "ops" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 12 - }, - "id": 8, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": "$datasource", - "editorMode": "code", - "expr": "sum by (group) (rate(elastickv_raft_step_queue_full_total{job=\"$job\",node_id=~\"$node_id\"}[5m]))", - "legendFormat": "stepCh full (group {{group}})", - "range": true, - "refId": "A" - }, - { - "datasource": "$datasource", - "editorMode": "code", - "expr": "sum by (group) (rate(elastickv_raft_dispatch_dropped_total{job=\"$job\",node_id=~\"$node_id\"}[5m]))", - "legendFormat": "dispatch dropped (group {{group}})", - "range": true, - "refId": "B" - }, - { - "datasource": "$datasource", - "editorMode": "code", - "expr": "sum by (group) (rate(elastickv_raft_dispatch_errors_total{job=\"$job\",node_id=~\"$node_id\"}[5m]))", - "legendFormat": "dispatch errors (group {{group}})", - "range": true, - "refId": "C" - } - ], - "title": "Raft Queue Saturation (stepCh full / outbound drops / errors)", - "type": "timeseries", - "description": "Counter rates from the etcd raft engine. stepCh-full means inbound messages from remote peers were dropped because the local raft loop was too slow to consume them (the 'etcd raft inbound step queue is full' log line). dispatch-dropped means outbound messages were discarded before transport because the per-peer channel was full. dispatch-errors means transport delivery failed. The pre-#560 seek storm caused all three to spike together; watch for them to fall after the rollout and stay flat." 
- }, - { - "datasource": "$datasource", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisPlacement": "auto", - "lineInterpolation": "linear", - "lineWidth": 1, - "showPoints": "auto" - }, - "unit": "reqps" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 20 - }, - "id": 9, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": "$datasource", - "editorMode": "code", - "expr": "sum by (command) (rate(elastickv_redis_requests_total{job=\"$job\",node_id=~\"$node_id\",command=~\"GET|SET|TYPE|EXISTS\"}[5m]))", - "legendFormat": "{{command}}", - "range": true, - "refId": "A" - } - ], - "title": "GET vs SET vs TYPE vs EXISTS Rate", - "type": "timeseries", - "description": "Proxy for seek amplification: before #560, every GET internally issued a TYPE probe, so the ratio of TYPE-like rawKeyTypeAt work to GETs was ~1:1. We do not currently export pebble SeekGE directly; this panel lets operators sanity-check that GET's external rate hasn't shifted relative to SET/TYPE/EXISTS in a way that would explain latency changes. If pebble SeekGE telemetry is exposed later, a dedicated panel should be added here." 
- } - ], - "refresh": "10s", - "schemaVersion": 41, - "tags": [ - "elastickv", - "redis", - "hot-path", - "pr-560" - ], - "templating": { - "list": [ - { - "current": { - "selected": false, - "text": "default", - "value": "default" - }, - "hide": 0, - "includeAll": false, - "label": "Datasource", - "multi": false, - "name": "datasource", - "options": [], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "current": { - "selected": false, - "text": "elastickv", - "value": "elastickv" - }, - "hide": 0, - "includeAll": false, - "label": "Job", - "multi": false, - "name": "job", - "options": [ - { - "selected": true, - "text": "elastickv", - "value": "elastickv" - } - ], - "query": "elastickv", - "type": "custom" - }, - { - "allValue": ".*", - "current": { - "selected": true, - "text": "All", - "value": "$__all" - }, - "datasource": "$datasource", - "definition": "label_values(elastickv_redis_requests_total{job=\"$job\"}, node_id)", - "includeAll": true, - "label": "Node ID", - "multi": true, - "name": "node_id", - "options": [], - "query": { - "qryType": 1, - "query": "label_values(elastickv_redis_requests_total{job=\"$job\"}, node_id)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 2, - "regex": "", - "sort": 1, - "type": "query" - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": {}, - "timezone": "browser", - "title": "Elastickv Redis Hot Path (PR #560)", - "uid": "elastickv-redis-hotpath", - "version": 1 -} diff --git a/monitoring/grafana/dashboards/elastickv-redis-summary.json b/monitoring/grafana/dashboards/elastickv-redis-summary.json index c492d7614..48f0514b0 100644 --- a/monitoring/grafana/dashboards/elastickv-redis-summary.json +++ b/monitoring/grafana/dashboards/elastickv-redis-summary.json @@ -415,7 +415,7 @@ ], "title": "Errors by Command", "type": "timeseries", - "description": "Error rate split by Redis command." 
+ "description": "Error rate split by Redis command. NOTE: the `unknown` bucket aggregates every command name not in the adapter's allow-list (including unsupported commands and malformed input). For a real-name breakdown see the 'Unsupported Commands by Name' panel, which uses the PR #594 elastickv_redis_unsupported_commands_total counter." }, { "datasource": "$datasource", @@ -1024,7 +1024,7 @@ }, { "datasource": "$datasource", - "description": "Number of redis.call()/pcall() invocations per Lua script execution. High count × per-call latency = total redis.call() time. If p99 count is large (e.g. >100), the script is iterating many keys and each Raft read compounds the latency.", + "description": "Number of redis.call()/pcall() invocations per Lua script execution. High count \u00d7 per-call latency = total redis.call() time. If p99 count is large (e.g. >100), the script is iterating many keys and each Raft read compounds the latency.", "fieldConfig": { "defaults": { "color": { @@ -1135,6 +1135,608 @@ ], "title": "Average Time per redis.call() Invocation", "type": "timeseries" + }, + { + "datasource": "$datasource", + "description": "Rate of Redis commands rejected as unsupported, broken down by the real command name (PR #594). Cardinality is bounded: at most 32 distinct `command` label values plus an `other` overflow bucket, so this is safe to expose globally. Values like `invalid_utf8` are sentinels emitted when the wire payload was not valid UTF-8 and the command name could not be decoded. Pair this with the 'Errors by Command' panel: the `unknown` rows there are broken down by real name here. **Requires `elastickv_redis_unsupported_commands_total` (PR #594). 
Empty until that PR merges — this is not a scrape failure.**",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisPlacement": "auto",
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "showPoints": "auto"
+          },
+          "unit": "reqps"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 24,
+        "x": 0,
+        "y": 62
+      },
+      "id": 22,
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        }
+      },
+      "targets": [
+        {
+          "datasource": "$datasource",
+          "editorMode": "code",
+          "expr": "sum by (command) (rate(elastickv_redis_unsupported_commands_total{job=\"$job\",node_id=~\"$node_id\"}[$__rate_interval]))",
+          "legendFormat": "{{command}}",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Unsupported Commands by Name",
+      "type": "timeseries"
+    },
+    {
+      "collapsed": true,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 70
+      },
+      "id": 23,
+      "panels": [
+        {
+          "datasource": "$datasource",
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "red",
+                    "value": 0
+                  },
+                  {
+                    "color": "orange",
+                    "value": 0.9
+                  },
+                  {
+                    "color": "green",
+                    "value": 0.99
+                  }
+                ]
+              },
+              "min": 0,
+              "max": 1,
+              "unit": "percentunit"
+            },
+            "overrides": []
+          },
+          "gridPos": {
+            "h": 4,
+            "w": 6,
+            "x": 0,
+            "y": 0
+          },
+          "id": 24,
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          },
+          "targets": [
+            {
+              "datasource": "$datasource",
+              "editorMode": "code",
+              "expr": "sum(rate(elastickv_lease_read_total{job=\"$job\",node_id=~\"$node_id\",outcome=\"hit\"}[5m])) / clamp_min(sum(rate(elastickv_lease_read_total{job=\"$job\",node_id=~\"$node_id\"}[5m])), 1e-9)",
+              "legendFormat": "hit ratio",
+              "refId": "A",
+              "instant": true
+            }
+          ],
+          "title": "Lease 
Fast-Path Hit Ratio",
+          "type": "stat",
+          "description": "Fraction of Coordinator.LeaseRead calls served from the local AppliedIndex instead of a full LinearizableRead. Steady-state should be >= 0.99 on a hot workload; a drop indicates leader flaps, clock-skew-driven lease expiry, or a regression on the fast path. This is one of the three #560-verification panels: if #560 landed, GET traffic should flow through LeaseRead and push this toward 1.0."
+        },
+        {
+          "datasource": "$datasource",
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": 0
+                  },
+                  {
+                    "color": "orange",
+                    "value": 0.05
+                  },
+                  {
+                    "color": "red",
+                    "value": 0.25
+                  }
+                ]
+              },
+              "unit": "s"
+            },
+            "overrides": []
+          },
+          "gridPos": {
+            "h": 4,
+            "w": 6,
+            "x": 6,
+            "y": 0
+          },
+          "id": 25,
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          },
+          "targets": [
+            {
+              "datasource": "$datasource",
+              "editorMode": "code",
+              "expr": "histogram_quantile(0.99, sum by (le) (rate(elastickv_redis_request_duration_seconds_bucket{job=\"$job\",node_id=~\"$node_id\",command=\"GET\",outcome=\"success\"}[5m])))",
+              "legendFormat": "GET p99",
+              "refId": "A",
+              "instant": true
+            }
+          ],
+          "title": "GET p99 (success)",
+          "type": "stat",
+          "description": "Current p99 latency for successful GET commands over the last 5 minutes. One of the three #560-verification panels: the fast path cut string-GET pebble SeekGE calls from ~17 to 1-2, so this should be FLAT or DOWN across the rollout, never up."
+ }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "orange", + "value": 10 + }, + { + "color": "red", + "value": 100 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 26, + "options": { + "colorMode": "value", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "sum(rate(elastickv_lease_read_total{job=\"$job\",node_id=~\"$node_id\",outcome=\"miss\"}[5m]))", + "legendFormat": "lease misses/s", + "refId": "A", + "instant": true + } + ], + "title": "LinearizableRead Rate (lease miss)", + "type": "stat", + "description": "Rate of Coordinator.LeaseRead calls that FELL BACK to LinearizableRead. Every miss corresponds to one slow-path raft ReadIndex round-trip, the exact behaviour #560 eliminates for GETs on steady leaders. This is the third #560-verification panel: watch it collapse as the rollout reaches each node." 
+ }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "orange", + "value": 0.001 + }, + { + "color": "red", + "value": 0.01 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 27, + "options": { + "colorMode": "value", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "sum(rate(elastickv_redis_errors_total{job=\"$job\",node_id=~\"$node_id\",command=\"GET\"}[5m])) / clamp_min(sum(rate(elastickv_redis_requests_total{job=\"$job\",node_id=~\"$node_id\",command=\"GET\"}[5m])), 1e-9)", + "legendFormat": "GET error ratio", + "refId": "A", + "instant": true + } + ], + "title": "GET Error Ratio", + "type": "stat", + "description": "Fraction of GET requests returning an error. Paired with the GET p99 stat so operators can tell latency wins from quietly-shed traffic: a fast-path bug that mis-classifies a key could turn latency wins into NIL responses, which would light up this tile but not the latency one." 
+ }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisPlacement": "auto", + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "auto" + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "id": 28, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum by (le) (rate(elastickv_redis_request_duration_seconds_bucket{job=\"$job\",node_id=~\"$node_id\",command=\"GET\",outcome=\"success\"}[5m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le) (rate(elastickv_redis_request_duration_seconds_bucket{job=\"$job\",node_id=~\"$node_id\",command=\"GET\",outcome=\"success\"}[5m])))", + "legendFormat": "p95", + "range": true, + "refId": "B" + }, + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(elastickv_redis_request_duration_seconds_bucket{job=\"$job\",node_id=~\"$node_id\",command=\"GET\",outcome=\"success\"}[5m])))", + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "GET Latency (p50 / p95 / p99)", + "type": "timeseries", + "description": "Per-quantile GET latency taken from elastickv_redis_request_duration_seconds. #560 reduces string-GET pebble SeekGE calls from ~17 to 1-2, so the p50 should barely move (it was already fast) while p95 and especially p99 tighten on the head of the distribution. Overlay with the deploy annotation to see the step change." 
+ }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisPlacement": "auto", + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "auto" + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "id": 29, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "sum by (node_id) (rate(elastickv_redis_requests_total{job=\"$job\",node_id=~\"$node_id\",command=\"GET\",outcome=\"success\"}[5m]))", + "legendFormat": "{{node_id}} success", + "range": true, + "refId": "A" + }, + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "sum by (node_id) (rate(elastickv_redis_requests_total{job=\"$job\",node_id=~\"$node_id\",command=\"GET\",outcome=\"error\"}[5m]))", + "legendFormat": "{{node_id}} error", + "range": true, + "refId": "B" + } + ], + "title": "GET Throughput and Errors (per node)", + "type": "timeseries", + "description": "Per-node GET request rate split by outcome. Shows whether the rollout affects traffic distribution: if one node's GET rate collapses post-deploy without a matching rise elsewhere, clients are likely disconnecting instead of fast-pathing, which invalidates the latency-win interpretation." 
+ }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisPlacement": "auto", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "auto", + "stacking": { + "mode": "none" + } + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 30, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "sum(rate(elastickv_lease_read_total{job=\"$job\",node_id=~\"$node_id\",outcome=\"hit\"}[5m]))", + "legendFormat": "hits", + "range": true, + "refId": "A" + }, + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "sum(rate(elastickv_lease_read_total{job=\"$job\",node_id=~\"$node_id\",outcome=\"miss\"}[5m]))", + "legendFormat": "misses (= LinearizableRead)", + "range": true, + "refId": "B" + } + ], + "title": "Lease Hit vs LinearizableRead Rate Over Time", + "type": "timeseries", + "description": "Hit and miss rates as time series so operators can correlate miss spikes with leader elections (see the Raft dashboard's leader_changes counter) or lease clock-skew events. The miss series IS the raft engine's slow-path read rate; a clean #560 rollout should show this line fall sharply while hits rise." 
+ }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisPlacement": "auto", + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "auto" + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 31, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "sum by (group) (rate(elastickv_raft_step_queue_full_total{job=\"$job\",node_id=~\"$node_id\"}[5m]))", + "legendFormat": "stepCh full (group {{group}})", + "range": true, + "refId": "A" + }, + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "sum by (group) (rate(elastickv_raft_dispatch_dropped_total{job=\"$job\",node_id=~\"$node_id\"}[5m]))", + "legendFormat": "dispatch dropped (group {{group}})", + "range": true, + "refId": "B" + }, + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "sum by (group) (rate(elastickv_raft_dispatch_errors_total{job=\"$job\",node_id=~\"$node_id\"}[5m]))", + "legendFormat": "dispatch errors (group {{group}})", + "range": true, + "refId": "C" + } + ], + "title": "Raft Queue Saturation (stepCh full / outbound drops / errors)", + "type": "timeseries", + "description": "Counter rates from the etcd raft engine. stepCh-full means inbound messages from remote peers were dropped because the local raft loop was too slow to consume them (the 'etcd raft inbound step queue is full' log line). dispatch-dropped means outbound messages were discarded before transport because the per-peer channel was full. dispatch-errors means transport delivery failed. The pre-#560 seek storm caused all three to spike together; watch for them to fall after the rollout and stay flat." 
+ }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisPlacement": "auto", + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "auto" + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 32, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "sum by (command) (rate(elastickv_redis_requests_total{job=\"$job\",node_id=~\"$node_id\",command=~\"GET|SET|TYPE|EXISTS\"}[5m]))", + "legendFormat": "{{command}}", + "range": true, + "refId": "A" + } + ], + "title": "GET vs SET vs TYPE vs EXISTS Rate", + "type": "timeseries", + "description": "Proxy for seek amplification: before #560, every GET internally issued a TYPE probe, so the ratio of TYPE-like rawKeyTypeAt work to GETs was ~1:1. We do not currently export pebble SeekGE directly; this panel lets operators sanity-check that GET's external rate hasn't shifted relative to SET/TYPE/EXISTS in a way that would explain latency changes. If pebble SeekGE telemetry is exposed later, a dedicated panel should be added here." 
+ } + ], + "title": "Hot Path (legacy PR #560)", + "type": "row" } ], "refresh": "10s", @@ -1164,6 +1766,27 @@ "regex": "", "type": "datasource" }, + { + "current": { + "selected": false, + "text": "elastickv", + "value": "elastickv" + }, + "hide": 0, + "includeAll": false, + "label": "Job", + "multi": false, + "name": "job", + "options": [ + { + "selected": true, + "text": "elastickv", + "value": "elastickv" + } + ], + "query": "elastickv", + "type": "custom" + }, { "allValue": ".*", "current": { @@ -1220,7 +1843,7 @@ }, "timepicker": {}, "timezone": "browser", - "title": "Elastickv Redis Request Health", + "title": "Elastickv Redis", "uid": "elastickv-redis-summary", "version": 1 } diff --git a/monitoring/hotpath.go b/monitoring/hotpath.go index 18d22e175..6591cb5f1 100644 --- a/monitoring/hotpath.go +++ b/monitoring/hotpath.go @@ -9,8 +9,9 @@ import ( "github.com/prometheus/client_golang/prometheus" ) -// Hot-path metrics support the "Redis Hot Path" dashboard -// (monitoring/grafana/dashboards/elastickv-redis-hotpath.json). They +// Hot-path metrics support the "Hot Path (legacy PR #560)" collapsed +// row inside monitoring/grafana/dashboards/elastickv-redis-summary.json. +// They // were added to confirm PR #560 (GET fast-path) landed in production: // the LinearizableRead call rate should drop sharply on a // string-dominated workload while GET p99 stays flat or improves, and