From 6c7ae7913b3a5285d78950aaeb2c3daf6ba8ffca Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Tue, 28 Apr 2026 01:17:04 +0900 Subject: [PATCH 1/3] ops(rolling-update): add KeyViz heatmap support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirror the existing ADMIN_* env-driven plumbing for the KeyViz heatmap sampler so operators can flip --keyvizEnabled and the --keyvizFanoutNodes peer list from deploy.env without editing the docker run command. The previous workflow required either editing the script per-deploy or running the binary outside the rolling-update path. What's added: - KEYVIZ_ENABLED master switch (validated as true|false at the top of the local script alongside ADMIN_ENABLED). - KEYVIZ_FANOUT_NODES comma-separated host:port list. Pre-quoted via printf %q for the same reason ADMIN_FULL_ACCESS_KEYS is: commas survive an unquoted env pass but pre-quoting keeps the pattern uniform. - build_keyviz_flags helper (nameref-output, mirrors build_admin_flags). Empty array when KEYVIZ_ENABLED != "true", so existing deploys see no behaviour change. - KEYVIZ_* env vars threaded through the SSH env passthrough so the heredoc-resident build_keyviz_flags reads them on the remote. - env.example block documenting the variables and pointing at the Phase 2-C fan-out auth follow-up (docs/design/2026_04_27_proposed_keyviz_cluster_fanout.md). What's not added: - KEYVIZ_STEP, KEYVIZ_MAX_TRACKED_ROUTES, KEYVIZ_HISTORY_COLUMNS, KEYVIZ_FANOUT_TIMEOUT — these have sensible binary-side defaults and operators have not asked for env-driven overrides yet. Adding them is a one-line _flags+= edit per knob if needed. - Inter-node fan-out auth — Phase 2-C does not yet ship a pre-shared bearer token, so a multi-node KEYVIZ_FANOUT_NODES setup surfaces 5/N peer 401s in the SPA degraded banner. The env.example comment documents this. 
Verified locally: deployed to a 5-node cluster with KEYVIZ_ENABLED=true and KEYVIZ_FANOUT_NODES set across all five admin listeners; --keyviz* flags appeared on every container's docker inspect, and the admin keyviz handler returned a matrix with the fan-out block populated. Self-review (CLAUDE.md 5 lenses): 1. Data loss -- None. Build/runbook plumbing only. 2. Concurrency -- None. 3. Performance -- None on the deploy path. Per-node sampler cost is covered by the sampler's own design doc. 4. Data consistency -- None. 5. Test coverage -- bash -n syntax-check passes. Existing rolling-update.sh has no automated tests in this repo; the verification path is "deploy to a real cluster and inspect docker run args." --- scripts/rolling-update.env.example | 16 ++++++++++++ scripts/rolling-update.sh | 41 ++++++++++++++++++++++++++++-- 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/scripts/rolling-update.env.example b/scripts/rolling-update.env.example index 3635fb10..e2acad6f 100644 --- a/scripts/rolling-update.env.example +++ b/scripts/rolling-update.env.example @@ -75,3 +75,19 @@ ADMIN_ENABLED="false" # ADMIN_TLS_KEY_FILE="/etc/elastickv/admin-tls.key" # ADMIN_ALLOW_PLAINTEXT_NON_LOOPBACK="false" # ADMIN_ALLOW_INSECURE_DEV_COOKIE="false" + +# KeyViz heatmap sampler. Disabled by default; flip KEYVIZ_ENABLED=true +# to feed the admin dashboard's /admin/api/v1/keyviz/matrix endpoint. +# The sampler is in-memory and read-only, so it is safe to enable +# regardless of whether ADMIN_ENABLED is on; it just produces no +# callers without --adminEnabled. +# +# KEYVIZ_FANOUT_NODES is an optional comma-separated host:port list of +# every admin listener in the cluster. When set, the admin handler +# merges matrices from each peer so the dashboard renders a cluster- +# wide heatmap regardless of which node served the request. NOTE: +# Phase 2-C does not yet ship inter-node auth, so peers reject the +# fan-out call with 401 unless --adminEnabled is off on the peer. 
+# Track the auth follow-up via docs/design/2026_04_27_proposed_keyviz_cluster_fanout.md. +KEYVIZ_ENABLED="false" +# KEYVIZ_FANOUT_NODES="10.0.0.1:8080,10.0.0.2:8080,10.0.0.3:8080" diff --git a/scripts/rolling-update.sh b/scripts/rolling-update.sh index 14cc89fa..5d1e5f30 100755 --- a/scripts/rolling-update.sh +++ b/scripts/rolling-update.sh @@ -180,6 +180,13 @@ ADMIN_TLS_KEY_FILE="${ADMIN_TLS_KEY_FILE:-}" ADMIN_ALLOW_PLAINTEXT_NON_LOOPBACK="${ADMIN_ALLOW_PLAINTEXT_NON_LOOPBACK:-false}" ADMIN_ALLOW_INSECURE_DEV_COOKIE="${ADMIN_ALLOW_INSECURE_DEV_COOKIE:-false}" +# KeyViz heatmap sampler. KEYVIZ_ENABLED is the master switch; the +# remaining variables only take effect when KEYVIZ_ENABLED=true. The +# sampler is ungated when admin is disabled (it's read-only in-memory +# state); it just produces no callers without --adminEnabled. +KEYVIZ_ENABLED="${KEYVIZ_ENABLED:-false}" +KEYVIZ_FANOUT_NODES="${KEYVIZ_FANOUT_NODES:-}" + # Validate the three boolean ADMIN_* flags must be the literal "true" # or "false" — they are forwarded to the remote shell unquoted (no # printf %q) for readability, which is only safe when the value is @@ -187,7 +194,7 @@ ADMIN_ALLOW_INSECURE_DEV_COOKIE="${ADMIN_ALLOW_INSECURE_DEV_COOKIE:-false}" # who typed "True", "1", or a stray quote sees a script-level error # pointing at the variable name instead of an inscrutable failure # inside the SSH heredoc. 
-for _bool_var in ADMIN_ENABLED ADMIN_ALLOW_PLAINTEXT_NON_LOOPBACK ADMIN_ALLOW_INSECURE_DEV_COOKIE; do +for _bool_var in ADMIN_ENABLED ADMIN_ALLOW_PLAINTEXT_NON_LOOPBACK ADMIN_ALLOW_INSECURE_DEV_COOKIE KEYVIZ_ENABLED; do case "${!_bool_var}" in true|false) ;; *) @@ -526,6 +533,8 @@ update_one_node() { ADMIN_TLS_KEY_FILE="$ADMIN_TLS_KEY_FILE_Q" \ ADMIN_ALLOW_PLAINTEXT_NON_LOOPBACK="$ADMIN_ALLOW_PLAINTEXT_NON_LOOPBACK" \ ADMIN_ALLOW_INSECURE_DEV_COOKIE="$ADMIN_ALLOW_INSECURE_DEV_COOKIE" \ + KEYVIZ_ENABLED="$KEYVIZ_ENABLED" \ + KEYVIZ_FANOUT_NODES="$KEYVIZ_FANOUT_NODES_Q" \ bash -s <<'REMOTE' set -euo pipefail @@ -827,6 +836,12 @@ run_container() { local admin_volumes=() build_admin_flags admin_flags admin_volumes + # KeyViz heatmap sampler. Same opt-in shape as admin: a false + # KEYVIZ_ENABLED leaves keyviz_flags empty so existing deploys are + # unchanged. + local keyviz_flags=() + build_keyviz_flags keyviz_flags + docker run -d \ --name "$CONTAINER_NAME" \ --restart unless-stopped \ @@ -845,7 +860,24 @@ run_container() { --raftDataDir "$DATA_DIR" \ --raftRedisMap "$RAFT_TO_REDIS_MAP" \ "${s3_flags[@]}" \ - "${admin_flags[@]}" >/dev/null + "${admin_flags[@]}" \ + "${keyviz_flags[@]}" >/dev/null +} + +# build_keyviz_flags emits the --keyviz* flag list. Mirrors +# build_admin_flags' nameref pattern so additional knobs (Step, +# HistoryColumns, etc.) can drop in without touching run_container. +# When KEYVIZ_ENABLED != "true" the array is left empty and the helper +# returns silently — existing deploys see no behaviour change. 
+build_keyviz_flags() { + local -n _flags=$1 + if [[ "${KEYVIZ_ENABLED:-false}" != "true" ]]; then + return 0 + fi + _flags+=(--keyvizEnabled) + if [[ -n "${KEYVIZ_FANOUT_NODES:-}" ]]; then + _flags+=(--keyvizFanoutNodes "$KEYVIZ_FANOUT_NODES") + fi } # build_admin_flags emits the --admin* flag list and bind-mount list @@ -1207,6 +1239,11 @@ ADMIN_SESSION_SIGNING_KEY_PREVIOUS_FILE_Q="$(printf '%q' "$ADMIN_SESSION_SIGNING ADMIN_TLS_CERT_FILE_Q="$(printf '%q' "$ADMIN_TLS_CERT_FILE")" ADMIN_TLS_KEY_FILE_Q="$(printf '%q' "$ADMIN_TLS_KEY_FILE")" +# KEYVIZ_FANOUT_NODES is a comma-separated host:port list; commas +# survive an unquoted env pass but pre-quoting keeps the pattern +# uniform with the ADMIN_* set above. +KEYVIZ_FANOUT_NODES_Q="$(printf '%q' "$KEYVIZ_FANOUT_NODES")" + echo "[rolling-update] target image: $IMAGE" for node_id in "${ROLLING_NODE_IDS[@]}"; do update_one_node "$node_id" "$(node_host_by_id "$node_id")" "$(ssh_target_by_id "$node_id")" From 94ade0dd13a1f5e579af08f83b934857532d8064 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Tue, 28 Apr 2026 01:42:32 +0900 Subject: [PATCH 2/3] Update scripts/rolling-update.sh Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- scripts/rolling-update.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/scripts/rolling-update.sh b/scripts/rolling-update.sh index 5d1e5f30..9c7a94e0 100755 --- a/scripts/rolling-update.sh +++ b/scripts/rolling-update.sh @@ -870,13 +870,16 @@ run_container() { # When KEYVIZ_ENABLED != "true" the array is left empty and the helper # returns silently — existing deploys see no behaviour change. 
build_keyviz_flags() { - local -n _flags=$1 - if [[ "${KEYVIZ_ENABLED:-false}" != "true" ]]; then + local -n _flags="$1" + local enabled="${KEYVIZ_ENABLED:-false}" + if [[ "$enabled" != "true" ]]; then return 0 fi + + local fanout_nodes="${KEYVIZ_FANOUT_NODES:-}" _flags+=(--keyvizEnabled) - if [[ -n "${KEYVIZ_FANOUT_NODES:-}" ]]; then - _flags+=(--keyvizFanoutNodes "$KEYVIZ_FANOUT_NODES") + if [[ -n "$fanout_nodes" ]]; then + _flags+=(--keyvizFanoutNodes "$fanout_nodes") fi } From 76f290618f2afdbaafef0dac9df4fb66e0a212cb Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Tue, 28 Apr 2026 01:45:12 +0900 Subject: [PATCH 3/3] fix(rolling-update): refresh KeyViz docs after PR #692 Address claude bot review on 6c7ae791. Findings 1 and 2 already landed in 94ade0dd (gemini's nameref-quote + locals-snapshot suggestion accepted by the author). This commit picks up the two remaining items: - Stale comment at scripts/rolling-update.sh:1233 still said "two boolean flags (ADMIN_ENABLED, ADMIN_ALLOW_*)" -- the validation loop above also covers KEYVIZ_ENABLED now. Rewrite to "boolean flags (ADMIN_ENABLED, ADMIN_ALLOW_*, KEYVIZ_ENABLED)". - env.example fan-out note was both ambiguously phrased and outdated. PR #692 ("forward session cookie on KeyViz fan-out so peers do not 401") shipped on main while this PR was in review, so the prior "Phase 2-C does not yet ship inter-node auth" note is no longer accurate. Rewrite to describe the as-shipped behaviour: the aggregator forwards the operator's session cookie, so peers with --adminEnabled accept the call as long as the same signing key and role allow-lists are configured cluster-wide; peers without --adminEnabled expose an unauthenticated endpoint and respond unconditionally. bash -n scripts/rolling-update.sh passes. 
--- scripts/rolling-update.env.example | 14 ++++++++++---- scripts/rolling-update.sh | 8 ++++---- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/scripts/rolling-update.env.example b/scripts/rolling-update.env.example index e2acad6f..49a183d6 100644 --- a/scripts/rolling-update.env.example +++ b/scripts/rolling-update.env.example @@ -85,9 +85,15 @@ ADMIN_ENABLED="false" # KEYVIZ_FANOUT_NODES is an optional comma-separated host:port list of # every admin listener in the cluster. When set, the admin handler # merges matrices from each peer so the dashboard renders a cluster- -# wide heatmap regardless of which node served the request. NOTE: -# Phase 2-C does not yet ship inter-node auth, so peers reject the -# fan-out call with 401 unless --adminEnabled is off on the peer. -# Track the auth follow-up via docs/design/2026_04_27_proposed_keyviz_cluster_fanout.md. +# wide heatmap regardless of which node served the request. The +# aggregator forwards the operator's session cookie to each peer +# (PR #692), so peers running with --adminEnabled accept the fan-out +# call as long as the cookie is valid on every node — i.e. the same +# admin signing key (ADMIN_SESSION_SIGNING_KEY_FILE) and matching +# role allow-lists must be configured cluster-wide. Peers without +# --adminEnabled expose an unauthenticated keyviz endpoint and +# respond unconditionally. +# See docs/design/2026_04_27_proposed_keyviz_cluster_fanout.md for the +# full design. KEYVIZ_ENABLED="false" # KEYVIZ_FANOUT_NODES="10.0.0.1:8080,10.0.0.2:8080,10.0.0.3:8080" diff --git a/scripts/rolling-update.sh b/scripts/rolling-update.sh index 9c7a94e0..93af1eaa 100755 --- a/scripts/rolling-update.sh +++ b/scripts/rolling-update.sh @@ -1230,10 +1230,10 @@ RAFT_TO_S3_MAP_Q="$(printf '%q' "$RAFT_TO_S3_MAP")" # the login shell once, so every value the operator might set has to # survive that pass intact. printf %q is the same hardening every # other forwarded path-like variable above gets. 
-# The two boolean flags (ADMIN_ENABLED, ADMIN_ALLOW_*) are validated -# at the top of the local script to be the literal "true" or "false", -# so they need no extra escaping — kept unquoted at the env site for -# readability. +# The boolean flags (ADMIN_ENABLED, ADMIN_ALLOW_*, KEYVIZ_ENABLED) +# are validated at the top of the local script to be the literal +# "true" or "false", so they need no extra escaping — kept unquoted +# at the env site for readability. ADMIN_ADDRESS_Q="$(printf '%q' "$ADMIN_ADDRESS")" ADMIN_FULL_ACCESS_KEYS_Q="$(printf '%q' "$ADMIN_FULL_ACCESS_KEYS")" ADMIN_READ_ONLY_ACCESS_KEYS_Q="$(printf '%q' "$ADMIN_READ_ONLY_ACCESS_KEYS")"