From a32263dadb7824236c7ffc2da04af7e4c7990eee Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Mon, 20 Apr 2026 06:14:47 +0900 Subject: [PATCH 01/28] test(jepsen): add ZSet safety workload with model-based checker Adds a Jepsen workload that goes beyond add->read visibility and verifies ZSet-specific safety properties under faults (network partitions, node kills): - score correctness: a ZRANGE result's score for a member must equal the model's latest committed score for that member, OR equal a score written by an operation that is concurrent with the read (since the client cannot linearise concurrent writes to the same member). - order preservation: ZRANGE 0 -1 must be sorted by (score asc, member lex asc). - ZRANGEBYSCORE correctness: bounded range queries return exactly the members whose score falls in the bound, modulo concurrent mutations. - no phantom members: every read member must have been introduced by some successful or in-flight ZADD/ZINCRBY. Concurrent-ZADD handling uses an invoke/complete windowing approach: mutations whose complete index < read's invoke index are committed before the read; mutations whose intervals overlap are concurrent and contribute to the per-member allowed-score set. Indeterminate (:info) mutations are treated as possibly-concurrent. Workload entry point added to jepsen_test.clj as elastickv-zset-safety-test. --- jepsen/src/elastickv/jepsen_test.clj | 4 + .../elastickv/redis_zset_safety_workload.clj | 555 ++++++++++++++++++ 2 files changed, 559 insertions(+) create mode 100644 jepsen/src/elastickv/redis_zset_safety_workload.clj diff --git a/jepsen/src/elastickv/jepsen_test.clj b/jepsen/src/elastickv/jepsen_test.clj index 8b9df3290..e40a8cb6f 100644 --- a/jepsen/src/elastickv/jepsen_test.clj +++ b/jepsen/src/elastickv/jepsen_test.clj @@ -1,6 +1,7 @@ (ns elastickv.jepsen-test (:gen-class) (:require [elastickv.redis-workload :as redis-workload] + [elastickv.redis-zset-safety-workload :as zset-safety-workload] [elastickv.dynamodb-workload :as dynamodb-workload] [elastickv.s3-workload :as s3-workload] [jepsen.cli :as cli])) @@ -14,6 +15,9 @@ (defn elastickv-s3-test [] (s3-workload/elastickv-s3-test {})) +(defn elastickv-zset-safety-test [] + (zset-safety-workload/elastickv-zset-safety-test {})) + (defn -main [& args] (cli/run! (cli/single-test-cmd {:test-fn elastickv-test}) args)) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj new file mode 100644 index 000000000..a0d477827 --- /dev/null +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -0,0 +1,555 @@ +(ns elastickv.redis-zset-safety-workload + "Jepsen workload verifying stronger safety properties of elastickv's + Redis ZSet (sorted set) implementation under faults. + + Beyond the simple visibility check in redis-zset-workload, this workload + exercises score correctness, ordering, range queries, phantom-member + freedom, and atomicity of compound ZSet mutations by using a custom, + model-based Checker. + + Operations (all target a single well-known key): + + {:f :zadd :value [member score]} ZADD key score member + {:f :zincrby :value [member delta]} ZINCRBY key delta member + {:f :zrem :value member} ZREM key member + {:f :zrange-all} ZRANGE key 0 -1 WITHSCORES + {:f :zrangebyscore :value [lo hi]} ZRANGEBYSCORE key lo hi WITHSCORES + + Semantics checked (see `zset-safety-checker`): + + 1. Score correctness: the score of any member observed by a :zrange-all + read must match the model's latest committed score for that member, + OR must match a score written by an operation that is concurrent with + the read (we cannot linearize concurrent writes to the same member, + so any such \"in-flight\" value is permitted). + 2. Order preservation: the result of :zrange-all must be sorted by + (score ascending, member lexicographically ascending). + 3. ZRANGEBYSCORE correctness: every member in a score-range read must + have a latest committed (or concurrent) score within [lo, hi]; and + every model member with a score in [lo, hi] must either be present + or be subject to a concurrent mutation. + 4. No phantom members: every member observed by a read must have been + introduced by some successful (or in-flight) operation. + 5. Atomicity: there is no explicit \"partial\" state to probe from the + client, but the checker treats every :ok operation as atomic — any + visible inconsistency (member present with no matching op, score + disagreeing with any known write, etc.) is reported." + (:require [clojure.string :as str] + [clojure.tools.logging :refer [warn]] + [elastickv.cli :as cli] + [elastickv.db :as ekdb] + [jepsen.db :as jdb] + [jepsen [checker :as checker] + [client :as client] + [generator :as gen] + [net :as net]] + [jepsen.checker.timeline :as timeline] + [jepsen.control :as control] + [jepsen.nemesis :as nemesis] + [jepsen.nemesis.combined :as combined] + [jepsen.os :as os] + [jepsen.os.debian :as debian] + [taoensso.carmine :as car])) + +;; --------------------------------------------------------------------------- +;; Constants +;; --------------------------------------------------------------------------- + +(def ^:private zset-key "jepsen-zset-safety") + +(def default-nodes ["n1" "n2" "n3" "n4" "n5"]) + +;; A small, fixed universe of members keeps contention high and makes the +;; model's state small enough to enumerate. +(def ^:private members + (mapv #(str "m" %) (range 16))) + +;; --------------------------------------------------------------------------- +;; Client +;; --------------------------------------------------------------------------- + +(defn- parse-withscores + "Carmine returns a flat [member score member score ...] vector for + ZRANGE WITHSCORES. Convert to a sorted vector of [member (double score)] + preserving server-returned order (score ascending, then member)." + [flat] + (->> flat + (partition 2) + (mapv (fn [[m s]] + [(if (bytes? m) (String. ^bytes m) (str m)) + (Double/parseDouble (str s))])))) + +(defrecord ElastickvRedisZSetSafetyClient [node->port conn-spec] + client/Client + + (open! [this test node] + (let [port (get node->port node 6379) + host (or (:redis-host test) (name node))] + (assoc this :conn-spec {:pool {} :spec {:host host + :port port + :timeout-ms 10000}}))) + + (close! [this _test] this) + + (setup! [this _test] + (when-let [cs (:conn-spec this)] + (try (car/wcar cs (car/del zset-key)) + (catch Exception e + (warn "ZSet safety setup DEL failed:" (.getMessage e))))) + this) + + (teardown! [this _test] this) + + (invoke! [_ _test op] + (let [cs conn-spec] + (try + (case (:f op) + :zadd + (let [[member score] (:value op)] + (car/wcar cs (car/zadd zset-key (double score) member)) + (assoc op :type :ok)) + + :zincrby + (let [[member delta] (:value op) + new-score (car/wcar cs (car/zincrby zset-key (double delta) member))] + (assoc op :type :ok + :value [member (Double/parseDouble (str new-score))])) + + :zrem + (let [member (:value op) + removed (car/wcar cs (car/zrem zset-key member))] + (assoc op :type :ok :value [member (pos? (long removed))])) + + :zrange-all + (let [flat (car/wcar cs (car/zrange zset-key 0 -1 "WITHSCORES"))] + (assoc op :type :ok :value (parse-withscores flat))) + + :zrangebyscore + (let [[lo hi] (:value op) + flat (car/wcar cs (car/zrangebyscore zset-key + (double lo) + (double hi) + "WITHSCORES"))] + (assoc op :type :ok :value {:bounds [lo hi] + :members (parse-withscores flat)}))) + (catch Exception e + (warn "ZSet safety op failed:" (:f op) (.getMessage e)) + (assoc op :type :info :error (.getMessage e))))))) + +;; --------------------------------------------------------------------------- +;; Generator +;; --------------------------------------------------------------------------- + +(defn- rand-member [] (rand-nth members)) + +(defn- gen-op [] + (let [roll (rand)] + (cond + (< roll 0.35) + {:f :zadd :value [(rand-member) (double (- (rand-int 200) 100))]} + + (< roll 0.55) + {:f :zincrby :value [(rand-member) + (double (- (rand-int 20) 10))]} + + (< roll 0.65) + {:f :zrem :value (rand-member)} + + (< roll 0.90) + {:f :zrange-all} + + :else + (let [a (- (rand-int 200) 100) + b (- (rand-int 200) 100)] + {:f :zrangebyscore :value [(double (min a b)) (double (max a b))]})))) + +(defn- op-generator [] + (reify gen/Generator + (op [this test ctx] + [(gen/fill-in-op (gen-op) ctx) this]) + (update [this _ _ _] this))) + +;; --------------------------------------------------------------------------- +;; Checker +;; --------------------------------------------------------------------------- + +(defn- sorted-by-score-then-member? + "Validates the zset invariant: (score, member) ascending, strict." + [entries] + (loop [prev nil + es entries] + (cond + (empty? es) true + (nil? prev) (recur (first es) (rest es)) + :else + (let [[pm ps] prev + [cm cs] (first es)] + (cond + (< ps cs) (recur (first es) (rest es)) + (> ps cs) false + ;; equal score: members must be strictly lexicographically ordered + (neg? (compare pm cm)) (recur (first es) (rest es)) + :else false))))) + +(defn- index-by-time + "Return a vector of ops sorted by :index." + [ops] + (vec (sort-by :index ops))) + +(defn- pair-invokes-with-completions + "Returns a sequence of {:invoke inv :complete cmp} pairs for each + completed op (ok/fail/info). Invokes without a matching completion are + paired with nil (still in flight at history end)." + [history] + (let [by-process (group-by :process history)] + (mapcat + (fn [[_p ops]] + (let [ops (index-by-time ops)] + (loop [ops ops acc []] + (if (empty? ops) acc + (let [[o & rest-ops] ops] + (cond + (= :invoke (:type o)) + (let [c (first rest-ops)] + (if (and c (#{:ok :fail :info} (:type c))) + (recur (drop 1 rest-ops) (conj acc {:invoke o :complete c})) + (recur rest-ops (conj acc {:invoke o :complete nil})))) + :else (recur rest-ops acc))))))) + by-process))) + +(defn- mutation? + [op] + (#{:zadd :zincrby :zrem} (:f op))) + +(defn- completed-mutation-window + "For each completed mutation, produce + {:member m :score s :zrem? bool? :invoke-idx i :complete-idx j :type t}. + For :zadd and :zincrby, :score is the final (committed) score. For :zrem, + :score is nil and :zrem? is true. :info (indeterminate) ops are kept as + possibly-applied with :type :info." + [pairs] + (keep + (fn [{:keys [invoke complete]}] + (when (and invoke (mutation? invoke)) + (let [f (:f invoke) + t (if complete (:type complete) :pending) + inv-idx (:index invoke) + cmp-idx (when complete (:index complete))] + (case f + :zadd + (let [[m s] (:value invoke)] + {:f :zadd :member m :score (double s) + :type t :invoke-idx inv-idx :complete-idx cmp-idx}) + + :zincrby + (let [[m _delta] (:value invoke) + s (when (and (= :ok t) (vector? (:value complete))) + (second (:value complete)))] + {:f :zincrby :member m :score (some-> s double) + :type t :invoke-idx inv-idx :complete-idx cmp-idx}) + + :zrem + (let [m (:value invoke)] + {:f :zrem :member m :score nil :zrem? true + :type t :invoke-idx inv-idx :complete-idx cmp-idx}))))) + pairs)) + +(defn- mutations-by-member + [mutations] + (group-by :member mutations)) + +(defn- concurrent? + "A mutation m is concurrent with a read r iff m's invoke precedes r's + completion AND m's completion (or end-of-history) follows r's invoke." + [m read-inv-idx read-cmp-idx] + (and (<= (:invoke-idx m) read-cmp-idx) + (or (nil? (:complete-idx m)) + (>= (:complete-idx m) read-inv-idx)))) + +(defn- model-before + "Construct model state from the set of mutations whose completions + strictly precede `read-inv-idx`. Model maps member -> {:score s} or + marks member as :deleted. Returns {:members map :ok-members set}. + Only considers :ok mutations for the authoritative model; :info + mutations are treated as uncertain (neither strictly applied nor not)." + [mutations-by-m read-inv-idx] + (reduce-kv + (fn [model member muts] + (let [applied (->> muts + (filter #(and (= :ok (:type %)) + (some? (:complete-idx %)) + (< (:complete-idx %) read-inv-idx))) + (sort-by :complete-idx)) + state (reduce + (fn [st m] + (case (:f m) + :zadd {:present? true :score (:score m)} + :zincrby {:present? true :score (:score m)} + :zrem {:present? false :score nil})) + nil + applied)] + (if state + (assoc model member state) + model))) + {} + mutations-by-m)) + +(defn- concurrent-mutations-for-member + "All mutations (ok or info) that are concurrent with the read window." + [muts read-inv-idx read-cmp-idx] + (filter #(concurrent? % read-inv-idx read-cmp-idx) muts)) + +(defn- allowed-scores-for-member + "Compute the set of scores considered valid for `member` by a read + whose window is [read-inv-idx, read-cmp-idx], based on committed state + and any concurrent mutations." + [mutations-by-m member read-inv-idx read-cmp-idx] + (let [muts (get mutations-by-m member []) + committed (->> muts + (filter #(and (= :ok (:type %)) + (some? (:complete-idx %)) + (< (:complete-idx %) read-inv-idx))) + (sort-by :complete-idx)) + committed-state (reduce + (fn [st m] + (case (:f m) + (:zadd :zincrby) {:present? true :score (:score m)} + :zrem {:present? false :score nil})) + nil + committed) + concurrent (concurrent-mutations-for-member muts read-inv-idx read-cmp-idx) + scores (cond-> #{} + (and committed-state (:present? committed-state)) + (conj (:score committed-state))) + scores (reduce + (fn [acc m] + (case (:f m) + :zadd (conj acc (:score m)) + :zincrby (cond-> acc (some? (:score m)) (conj (:score m))) + :zrem acc)) + scores + concurrent)] + {:scores scores + :must-be-present? (boolean (and committed-state (:present? committed-state) + (empty? concurrent))) + :any-known? (or (boolean committed-state) (seq concurrent))})) + +(defn- check-zrange-all + [mutations-by-m {:keys [invoke complete] :as _pair}] + (let [entries (:value complete) + inv-idx (:index invoke) + cmp-idx (:index complete) + errors (atom [])] + ;; 1. Ordering + (when-not (sorted-by-score-then-member? entries) + (swap! errors conj {:kind :unsorted + :index cmp-idx + :entries entries})) + ;; 2. For each observed (member,score): validate score and non-phantom + (doseq [[member score] entries] + (let [{:keys [scores any-known?]} + (allowed-scores-for-member mutations-by-m member inv-idx cmp-idx)] + (cond + (not any-known?) + (swap! errors conj {:kind :phantom + :index cmp-idx + :member member + :score score}) + (not (contains? scores score)) + (swap! errors conj {:kind :score-mismatch + :index cmp-idx + :member member + :observed score + :allowed scores})))) + ;; 3. Completeness: model-required members must appear. + (let [model (model-before mutations-by-m inv-idx) + observed-members (into #{} (map first) entries)] + (doseq [[member {:keys [present?]}] model] + (when (and present? (not (contains? observed-members member))) + (let [muts (get mutations-by-m member []) + concurrent (concurrent-mutations-for-member muts inv-idx cmp-idx)] + (when (empty? concurrent) + (swap! errors conj {:kind :missing-member + :index cmp-idx + :member member})))))) + @errors)) + +(defn- check-zrangebyscore + [mutations-by-m {:keys [invoke complete] :as _pair}] + (let [{:keys [bounds members]} (:value complete) + [lo hi] bounds + inv-idx (:index invoke) + cmp-idx (:index complete) + errors (atom [])] + (when-not (sorted-by-score-then-member? members) + (swap! errors conj {:kind :unsorted-range + :index cmp-idx + :bounds bounds + :members members})) + ;; Observed members must be within bounds AND have a known allowed score. + (doseq [[member score] members] + (when (or (< score lo) (> score hi)) + (swap! errors conj {:kind :out-of-range + :index cmp-idx + :bounds bounds + :member member + :score score})) + (let [{:keys [scores any-known?]} + (allowed-scores-for-member mutations-by-m member inv-idx cmp-idx)] + (cond + (not any-known?) + (swap! errors conj {:kind :phantom-range + :index cmp-idx + :member member + :score score}) + (not (contains? scores score)) + (swap! errors conj {:kind :score-mismatch-range + :index cmp-idx + :member member + :observed score + :allowed scores})))) + ;; Completeness within bounds: any model member whose committed score + ;; is in [lo,hi] with no concurrent mutation must appear. + (let [model (model-before mutations-by-m inv-idx) + observed-members (into #{} (map first) members)] + (doseq [[member {:keys [present? score]}] model] + (when (and present? + (<= lo score hi) + (not (contains? observed-members member))) + (let [muts (get mutations-by-m member []) + concurrent (concurrent-mutations-for-member muts inv-idx cmp-idx)] + (when (empty? concurrent) + (swap! errors conj {:kind :missing-member-range + :index cmp-idx + :bounds bounds + :member member + :expected-score score})))))) + @errors)) + +(defn zset-safety-checker + "Custom Jepsen checker: validates ZSet safety properties using a + last-writer model combined with a concurrent-write relaxation." + [] + (reify checker/Checker + (check [_ _test history _opts] + (let [pairs (pair-invokes-with-completions history) + mutations (completed-mutation-window pairs) + mutations-by-m (mutations-by-member mutations) + read-pairs (filter (fn [{:keys [invoke complete]}] + (and invoke complete + (= :ok (:type complete)) + (#{:zrange-all :zrangebyscore} + (:f invoke)))) + pairs) + all-errors (reduce + (fn [acc {:keys [invoke] :as pair}] + (into acc + (case (:f invoke) + :zrange-all (check-zrange-all mutations-by-m pair) + :zrangebyscore (check-zrangebyscore mutations-by-m pair)))) + [] + read-pairs) + by-kind (group-by :kind all-errors)] + {:valid? (empty? all-errors) + :reads (count read-pairs) + :mutations (count mutations) + :error-count (count all-errors) + :errors-by-kind (into {} (map (fn [[k v]] [k (count v)]) by-kind)) + :first-errors (take 20 all-errors)})))) + +;; --------------------------------------------------------------------------- +;; Workload +;; --------------------------------------------------------------------------- + +(defn elastickv-zset-safety-workload + [opts] + (let [node->port (or (:node->port opts) + (zipmap default-nodes (repeat 6379))) + client (->ElastickvRedisZSetSafetyClient node->port nil)] + {:client client + :checker (checker/compose + {:zset-safety (zset-safety-checker) + :timeline (timeline/html)}) + :generator (op-generator) + :final-generator (gen/once {:f :zrange-all})})) + +(defn elastickv-zset-safety-test + "Builds a Jepsen test map that drives elastickv's Redis ZSet safety + workload." + ([] (elastickv-zset-safety-test {})) + ([opts] + (let [nodes (or (:nodes opts) default-nodes) + redis-ports (or (:redis-ports opts) + (repeat (count nodes) (or (:redis-port opts) 6379))) + node->port (or (:node->port opts) + (cli/ports->node-map redis-ports nodes)) + local? (:local opts) + db (if local? + jdb/noop + (ekdb/db {:grpc-port (or (:grpc-port opts) 50051) + :redis-port node->port + :raft-groups (:raft-groups opts) + :shard-ranges (:shard-ranges opts)})) + rate (double (or (:rate opts) 10)) + time-limit (or (:time-limit opts) 60) + faults (if local? + [] + (cli/normalize-faults (or (:faults opts) [:partition :kill]))) + nemesis-p (when-not local? + (combined/nemesis-package {:db db + :faults faults + :interval (or (:fault-interval opts) 40)})) + nemesis-gen (if nemesis-p + (:generator nemesis-p) + (gen/once {:type :info :f :noop})) + workload (elastickv-zset-safety-workload + (assoc opts :node->port node->port))] + (merge workload + {:name (or (:name opts) "elastickv-redis-zset-safety") + :nodes nodes + :db db + :redis-host (:redis-host opts) + :os (if local? os/noop debian/os) + :net (if local? net/noop net/iptables) + :ssh (merge {:username "vagrant" + :private-key-path "/home/vagrant/.ssh/id_rsa" + :strict-host-key-checking false} + (when local? {:dummy true}) + (:ssh opts)) + :remote control/ssh + :nemesis (if nemesis-p (:nemesis nemesis-p) nemesis/noop) + :final-generator nil + :concurrency (or (:concurrency opts) 5) + :generator (->> (:generator workload) + (gen/nemesis nemesis-gen) + (gen/stagger (/ rate)) + (gen/time-limit time-limit))})))) + +;; --------------------------------------------------------------------------- +;; CLI +;; --------------------------------------------------------------------------- + +(def zset-safety-cli-opts + [[nil "--ports PORTS" "Comma-separated Redis ports (per node)." + :default nil + :parse-fn (fn [s] + (->> (str/split s #",") + (remove str/blank?) + (mapv #(Integer/parseInt %))))] + [nil "--redis-port PORT" "Redis port applied to all nodes." + :default 6379 + :parse-fn #(Integer/parseInt %)]]) + +(defn- prepare-zset-safety-opts [options] + (let [ports (or (:ports options) nil) + options (cli/parse-common-opts options ports)] + (assoc options + :redis-host (:host options) + :redis-ports ports + :redis-port (:redis-port options)))) + +(defn -main [& args] + (cli/run-workload! args + (into cli/common-cli-opts zset-safety-cli-opts) + prepare-zset-safety-opts + elastickv-zset-safety-test)) From 2693a30de28f54861e4ec795b974ff31c13698db Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Mon, 20 Apr 2026 06:20:05 +0900 Subject: [PATCH 02/28] ci(jepsen): run ZSet safety workload in per-push and scheduled jobs - jepsen-test.yml: 5s smoke run on every push, mirroring the other workloads. - jepsen-test-scheduled.yml: 150s default run (overridable via workflow_dispatch inputs) every 6 hours. Workload entry: elastickv.redis-zset-safety-workload (added in the previous commit via -main). --- .github/workflows/jepsen-test-scheduled.yml | 10 ++++++++++ .github/workflows/jepsen-test.yml | 5 +++++ 2 files changed, 15 insertions(+) diff --git a/.github/workflows/jepsen-test-scheduled.yml b/.github/workflows/jepsen-test-scheduled.yml index ad925f8f6..46a8129ef 100644 --- a/.github/workflows/jepsen-test-scheduled.yml +++ b/.github/workflows/jepsen-test-scheduled.yml @@ -101,6 +101,16 @@ jobs: --max-txn-length ${{ inputs.max-txn-length || '4' }} \ --ports 63791,63792,63793 \ --host 127.0.0.1 + - name: Run Redis ZSet safety Jepsen workload against elastickv + working-directory: jepsen + timeout-minutes: 10 + run: | + timeout 480 ~/lein run -m elastickv.redis-zset-safety-workload \ + --time-limit ${{ inputs.time-limit || '150' }} \ + --rate ${{ inputs.rate || '10' }} \ + --concurrency ${{ inputs.concurrency || '8' }} \ + --ports 63791,63792,63793 \ + --host 127.0.0.1 - name: Run DynamoDB Jepsen workload against elastickv working-directory: jepsen timeout-minutes: 10 diff --git a/.github/workflows/jepsen-test.yml b/.github/workflows/jepsen-test.yml index aeac9354c..aeb4e2d4d 100644 --- a/.github/workflows/jepsen-test.yml +++ b/.github/workflows/jepsen-test.yml @@ -90,6 +90,11 @@ jobs: timeout-minutes: 3 run: | timeout 120 ~/lein run -m elastickv.redis-workload --time-limit 5 --rate 5 --concurrency 5 --ports 63791,63792,63793 --host 127.0.0.1 + - name: Run Redis ZSet safety Jepsen workload against elastickv + working-directory: jepsen + timeout-minutes: 3 + run: | + timeout 120 ~/lein run -m elastickv.redis-zset-safety-workload --time-limit 5 --rate 5 --concurrency 5 --ports 63791,63792,63793 --host 127.0.0.1 - name: Run DynamoDB Jepsen workload against elastickv working-directory: jepsen timeout-minutes: 3 From 94be1bdd22510f1a857dd3a1b5f05068e6ede110 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Mon, 20 Apr 2026 06:59:15 +0900 Subject: [PATCH 03/28] fix(jepsen-zset-safety): plug checker false positives + add unit tests Three Copilot findings on PR #550: 1. :zincrby indeterminate handling. Pending or :info ZINCRBY left the resulting score unknown, but the checker still required the observed read score to be in the finite allowed-scores set. A read that legitimately observed an in-flight increment was flagged as a score mismatch (false positive). completed-mutation-window now sets :unknown-score? on a ZINCRBY when the completion is :info or pending. allowed-scores-for-member returns :unknown-score? when any concurrent ZINCRBY carries the flag, and check-zrange-all / check-zrangebyscore skip the strict score-membership check in that case. 2. :zrem no-op handling. ZREM of a never-added member returns 0 server-side (no-op). The previous model treated every ZREM as a deletion, producing missing-member false positives and score-mismatch false negatives. invoke! already exposes the actual removed? boolean as the second element of the completion value. completed-mutation-window now threads :removed? through, and the new apply-mutation-to-state helper leaves state unchanged when :removed? is false. 3. model-before docstring claimed it returned {:members map :ok-members set}, but it returned the model map directly. Docstring rewritten to match the actual return value. Adds jepsen/test/elastickv/redis_zset_safety_workload_test.clj covering test-spec construction, the no-op ZREM edge case, the :info ZINCRBY skip, and a positive-control score-mismatch detection. The checker tests bypass the timeline.html sub-checker (which writes to the test store) by invoking zset-safety-checker directly. --- .../elastickv/redis_zset_safety_workload.clj | 104 ++++++++++++------ .../redis_zset_safety_workload_test.clj | 76 +++++++++++++ 2 files changed, 149 insertions(+), 31 deletions(-) create mode 100644 jepsen/test/elastickv/redis_zset_safety_workload_test.clj diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index a0d477827..07953bd9c 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -223,10 +223,18 @@ (defn- completed-mutation-window "For each completed mutation, produce - {:member m :score s :zrem? bool? :invoke-idx i :complete-idx j :type t}. - For :zadd and :zincrby, :score is the final (committed) score. For :zrem, - :score is nil and :zrem? is true. :info (indeterminate) ops are kept as - possibly-applied with :type :info." + {:member m :score s :zrem? bool? :unknown-score? bool? :invoke-idx i + :complete-idx j :type t}. + - :zadd: :score is the requested score (always known). + - :zincrby: when :ok, :score is the server-returned final score. When + :info or pending, the resulting score is unknown (depends on which + other ops were applied first); :unknown-score? is set so allowed- + scores-for-member can short-circuit the strict score check. + - :zrem: :removed? is the boolean returned by ZREM (true iff the + member existed). A no-op ZREM (returns 0) does NOT mutate state, so + the model must not treat it as a deletion. + :info / :pending mutations are still emitted so concurrent windows + account for their possible effect." [pairs] (keep (fn [{:keys [invoke complete]}] @@ -243,14 +251,30 @@ :zincrby (let [[m _delta] (:value invoke) - s (when (and (= :ok t) (vector? (:value complete))) + ;; ZINCRBY's resulting score is only knowable from the + ;; server reply. For :info/:pending we don't have it. + ok? (= :ok t) + s (when (and ok? (vector? (:value complete))) (second (:value complete)))] {:f :zincrby :member m :score (some-> s double) + :unknown-score? (not (and ok? (some? s))) :type t :invoke-idx inv-idx :complete-idx cmp-idx}) :zrem - (let [m (:value invoke)] - {:f :zrem :member m :score nil :zrem? true + (let [m (:value invoke) + ;; invoke! returns [member removed?]. For :info we don't + ;; know whether the member was removed. + removed? (cond + (and (= :ok t) + (vector? (:value complete))) + (boolean (second (:value complete))) + ;; pending / info: assume removal could have + ;; happened; the checker treats it as a + ;; possibly-concurrent deletion via the + ;; concurrent window. + :else true)] + {:f :zrem :member m :score nil + :zrem? true :removed? removed? :type t :invoke-idx inv-idx :complete-idx cmp-idx}))))) pairs)) @@ -266,12 +290,23 @@ (or (nil? (:complete-idx m)) (>= (:complete-idx m) read-inv-idx)))) +(defn- apply-mutation-to-state + "Fold one mutation into a per-member state {:present? bool :score s}. + A no-op ZREM (member did not exist; :removed? false) leaves state + unchanged so the checker doesn't falsely conclude the member is gone." + [st m] + (case (:f m) + :zadd {:present? true :score (:score m)} + :zincrby {:present? true :score (:score m)} + :zrem (if (:removed? m) + {:present? false :score nil} + st))) + (defn- model-before - "Construct model state from the set of mutations whose completions - strictly precede `read-inv-idx`. Model maps member -> {:score s} or - marks member as :deleted. Returns {:members map :ok-members set}. - Only considers :ok mutations for the authoritative model; :info - mutations are treated as uncertain (neither strictly applied nor not)." + "Construct authoritative per-member state from mutations whose + completions strictly precede read-inv-idx. Returns + {member -> {:present? bool :score s}}. Only :ok mutations contribute; + :info / :pending are deferred to the concurrent-window check." [mutations-by-m read-inv-idx] (reduce-kv (fn [model member muts] @@ -280,14 +315,7 @@ (some? (:complete-idx %)) (< (:complete-idx %) read-inv-idx))) (sort-by :complete-idx)) - state (reduce - (fn [st m] - (case (:f m) - :zadd {:present? true :score (:score m)} - :zincrby {:present? true :score (:score m)} - :zrem {:present? false :score nil})) - nil - applied)] + state (reduce apply-mutation-to-state nil applied)] (if state (assoc model member state) model))) @@ -302,7 +330,18 @@ (defn- allowed-scores-for-member "Compute the set of scores considered valid for `member` by a read whose window is [read-inv-idx, read-cmp-idx], based on committed state - and any concurrent mutations." + and any concurrent mutations. + + Returns: + :scores - set of acceptable scores (committed + concurrent + :zadd / :ok :zincrby). + :unknown-score? - true iff any concurrent ZINCRBY's resulting score + is unknown (in-flight or :info). When set, the + caller MUST skip the strict score-membership + check to stay sound. + :must-be-present? - committed state says present and no concurrent + mutation could have removed/changed it. + :any-known? - some op claims to have touched this member." [mutations-by-m member read-inv-idx read-cmp-idx] (let [muts (get mutations-by-m member []) committed (->> muts @@ -310,13 +349,7 @@ (some? (:complete-idx %)) (< (:complete-idx %) read-inv-idx))) (sort-by :complete-idx)) - committed-state (reduce - (fn [st m] - (case (:f m) - (:zadd :zincrby) {:present? true :score (:score m)} - :zrem {:present? false :score nil})) - nil - committed) + committed-state (reduce apply-mutation-to-state nil committed) concurrent (concurrent-mutations-for-member muts read-inv-idx read-cmp-idx) scores (cond-> #{} (and committed-state (:present? committed-state)) @@ -328,8 +361,12 @@ :zincrby (cond-> acc (some? (:score m)) (conj (:score m))) :zrem acc)) scores - concurrent)] + concurrent) + unknown-score? (some #(and (= :zincrby (:f %)) + (:unknown-score? %)) + concurrent)] {:scores scores + :unknown-score? (boolean unknown-score?) :must-be-present? (boolean (and committed-state (:present? committed-state) (empty? concurrent))) :any-known? (or (boolean committed-state) (seq concurrent))})) @@ -347,7 +384,7 @@ :entries entries})) ;; 2. For each observed (member,score): validate score and non-phantom (doseq [[member score] entries] - (let [{:keys [scores any-known?]} + (let [{:keys [scores any-known? unknown-score?]} (allowed-scores-for-member mutations-by-m member inv-idx cmp-idx)] (cond (not any-known?) @@ -355,6 +392,10 @@ :index cmp-idx :member member :score score}) + ;; Skip the strict score check when any concurrent ZINCRBY's + ;; resulting score is unknown: the read could legitimately + ;; observe any value the in-flight increment produces. + unknown-score? nil (not (contains? scores score)) (swap! errors conj {:kind :score-mismatch :index cmp-idx @@ -394,7 +435,7 @@ :bounds bounds :member member :score score})) - (let [{:keys [scores any-known?]} + (let [{:keys [scores any-known? unknown-score?]} (allowed-scores-for-member mutations-by-m member inv-idx cmp-idx)] (cond (not any-known?) @@ -402,6 +443,7 @@ :index cmp-idx :member member :score score}) + unknown-score? nil (not (contains? scores score)) (swap! errors conj {:kind :score-mismatch-range :index cmp-idx diff --git a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj new file mode 100644 index 000000000..25bd6a042 --- /dev/null +++ b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj @@ -0,0 +1,76 @@ +(ns elastickv.redis-zset-safety-workload-test + "Unit tests for the ZSet safety workload's test-spec construction and + the model-based checker's edge cases (no-op ZREM, :info ZINCRBY)." + (:require [clojure.test :refer :all] + [jepsen.checker :as checker] + [elastickv.redis-zset-safety-workload :as workload])) + +;; --------------------------------------------------------------------------- +;; Test-spec construction +;; --------------------------------------------------------------------------- + +(deftest builds-test-spec + (let [t (workload/elastickv-zset-safety-test {})] + (is (map? t)) + (is (= "elastickv-redis-zset-safety" (:name t))) + (is (= ["n1" "n2" "n3" "n4" "n5"] (:nodes t))) + (is (some? (:client t))) + (is (some? (:checker t))) + (is (some? (:generator t))))) + +(deftest custom-options-override-defaults + (let [t (workload/elastickv-zset-safety-test + {:time-limit 30 + :concurrency 8 + :rate 4})] + (is (= 8 (:concurrency t))))) + +;; --------------------------------------------------------------------------- +;; Checker edge cases +;; --------------------------------------------------------------------------- + +(defn- run-checker + "Run the workload's safety checker against an in-memory history. + Bypasses the composed timeline.html checker (which writes files to + the test store) so tests stay hermetic." + [history] + (checker/check (workload/zset-safety-checker) + (workload/elastickv-zset-safety-test {}) + history + nil)) + +(deftest noop-zrem-does-not-flag-correct-read + ;; ZREM of a member that was never added returns 0 (no-op). The model + ;; must not treat it as a deletion. A subsequent read showing the + ;; absence of that member is correct. + (let [history [{:type :invoke :process 0 :f :zrem :value "ghost" :index 0} + {:type :ok :process 0 :f :zrem :value ["ghost" false] :index 1} + {:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 2} + {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 3} + {:type :invoke :process 0 :f :zrange-all :index 4} + {:type :ok :process 0 :f :zrange-all :value [["m1" 1.0]] :index 5}] + result (run-checker history)] + (is (:valid? result) (str "expected valid, got: " result)))) + +(deftest info-zincrby-skips-strict-score-check + ;; ZINCRBY whose response was lost (:info) leaves the resulting score + ;; unknown. A read concurrent with such an op observing some derived + ;; score must NOT be flagged as a score mismatch. + (let [history [{:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} + {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 1} + {:type :invoke :process 1 :f :zincrby :value ["m1" 5] :index 2} + {:type :invoke :process 0 :f :zrange-all :index 3} + {:type :ok :process 0 :f :zrange-all :value [["m1" 6.0]] :index 4} + {:type :info :process 1 :f :zincrby :value ["m1" 5] :index 5}] + result (run-checker history)] + (is (:valid? result) (str "expected valid, got: " result)))) + +(deftest score-mismatch-is-detected-when-no-uncertainty + ;; Sanity check: with all ops :ok and no concurrency, an obviously + ;; wrong observed score IS flagged. + (let [history [{:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} + {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 1} + {:type :invoke :process 0 :f :zrange-all :index 2} + {:type :ok :process 0 :f :zrange-all :value [["m1" 999.0]] :index 3}] + result (run-checker history)] + (is (not (:valid? result)) (str "expected mismatch, got: " result)))) From 9bfcc13f1d0299cd3893a05e25c2d7e0201466af Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Mon, 20 Apr 2026 15:15:41 +0900 Subject: [PATCH 04/28] fix(jepsen-zset-safety): no-op-ZREM-only member must not trigger :score-mismatch CI failure on 94be1bdd surfaced a remaining checker false positive: a read that observes a member whose only prior ops are no-op ZREMs was classified as :score-mismatch with :allowed #{} rather than treated as a never-existed member. allowed-scores-for-member returned any-known? = true because (seq concurrent) was truthy even when every concurrent op was a :zrem that didn't actually remove anything. Fix: any-known? (renamed internally to existence-evidence?) now only counts concurrent mutations that provide evidence the member ever existed -- :zadd, :zincrby, or a :zrem whose :removed? boolean is true. A concurrent no-op ZREM contributes nothing. Adds no-op-zrem-alone-does-not-false-positive as a regression test. All 6 workload unit tests pass under Java 21. --- .../src/elastickv/redis_zset_safety_workload.clj | 16 ++++++++++++++-- .../redis_zset_safety_workload_test.clj | 13 +++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index 07953bd9c..f0f22ebb7 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -364,12 +364,24 @@ concurrent) unknown-score? (some #(and (= :zincrby (:f %)) (:unknown-score? %)) - concurrent)] + concurrent) + ;; any-known? must only be true when something provides evidence + ;; the member actually existed at some point. A no-op ZREM + ;; (:removed? false) does NOT prove existence -- it explicitly + ;; says the member wasn't there. Without this guard, reads of a + ;; never-added member that happened to race with a no-op ZREM + ;; would flip from :phantom to :score-mismatch with an empty + ;; :allowed set, producing a false positive. + existence-evidence? (or (boolean committed-state) + (some #(case (:f %) + (:zadd :zincrby) true + :zrem (:removed? %)) + concurrent))] {:scores scores :unknown-score? (boolean unknown-score?) :must-be-present? (boolean (and committed-state (:present? committed-state) (empty? concurrent))) - :any-known? (or (boolean committed-state) (seq concurrent))})) + :any-known? (boolean existence-evidence?)})) (defn- check-zrange-all [mutations-by-m {:keys [invoke complete] :as _pair}] diff --git a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj index 25bd6a042..5c59dcd6f 100644 --- a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj +++ b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj @@ -74,3 +74,16 @@ {:type :ok :process 0 :f :zrange-all :value [["m1" 999.0]] :index 3}] result (run-checker history)] (is (not (:valid? result)) (str "expected mismatch, got: " result)))) + +(deftest no-op-zrem-alone-does-not-false-positive + ;; CI-observed false positive: a member whose only prior ops are no-op + ;; ZREMs was classified as :score-mismatch with :allowed #{} instead + ;; of treated as never-existed (:phantom candidate, empty read -> OK). + ;; After the existence-evidence? fix, a read that observes NO such + ;; member must be accepted as valid. + (let [history [{:type :invoke :process 0 :f :zrem :value "never-added" :index 0} + {:type :invoke :process 1 :f :zrange-all :index 1} + {:type :ok :process 1 :f :zrange-all :value [] :index 2} + {:type :ok :process 0 :f :zrem :value ["never-added" false] :index 3}] + result (run-checker history)] + (is (:valid? result) (str "expected valid, got: " result)))) From 6d0b4c3d781ef6e554b97a57a45227a88b1a0eba Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Mon, 20 Apr 2026 15:24:33 +0900 Subject: [PATCH 05/28] fix(jepsen-zset-safety): address CodeRabbit checker soundness issues Three Major-severity CodeRabbit findings on PR #550: 1. Duplicate-member detection (line 416): a ZSet read must return each member at most once. Previously, if ZRANGE returned the same member twice with an allowed score, the checker accepted it because sort and score-membership checks passed independently per entry. duplicate-members helper now flags :duplicate-members (and :duplicate-members-range for ZRANGEBYSCORE) before the per-entry loop. 2. Overlapping committed writes (line 318): two :ok mutations whose invoke/complete windows overlap have ambiguous serialization order. Pinning allowed-scores to a single last-wins linearization by :complete-idx was unsound. allowed-scores-for-member now unions all :zadd/:ok-:zincrby scores from committed mutations (over-approx that stays sound), and must-be-present? is relaxed when any pair of committed writes for the same member overlaps in time. 3. Pre-read :info mutations (line 328): a mutation recorded as :info whose completion precedes a later read's invoke may have taken effect server-side. Previously it was ignored by both model-before (:ok only) and the concurrent window (complete-idx >= read-inv-idx required). Now collected as pre-read-info, contributing to allowed scores and flipping unknown-score? for :zincrby with unknown resulting score. 3 new regression tests (duplicate-members-are-flagged, overlapping- committed-zadds-allow-either-score, info-before-read-is-considered- uncertain). Workload unit test count now 9, all PASS under Java 21. --- .../elastickv/redis_zset_safety_workload.clj | 105 ++++++++++++++---- .../redis_zset_safety_workload_test.clj | 53 +++++++++ 2 files changed, 134 insertions(+), 24 deletions(-) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index f0f22ebb7..549845e40 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -344,45 +344,88 @@ :any-known? - some op claims to have touched this member." [mutations-by-m member read-inv-idx read-cmp-idx] (let [muts (get mutations-by-m member []) + ;; :ok mutations that completed strictly before the read. They + ;; may have overlapped with each other in wall-clock time, so + ;; the serialization order among them is ambiguous. committed (->> muts (filter #(and (= :ok (:type %)) (some? (:complete-idx %)) - (< (:complete-idx %) read-inv-idx))) - (sort-by :complete-idx)) - committed-state (reduce apply-mutation-to-state nil committed) + (< (:complete-idx %) read-inv-idx)))) + ;; :info mutations that completed before the read: they may or + ;; may not have taken effect server-side. We must account for + ;; their possible scores just like concurrent ones. + pre-read-info (->> muts + (filter #(and (= :info (:type %)) + (some? (:complete-idx %)) + (< (:complete-idx %) read-inv-idx)))) + ;; Concurrent mutations: windows overlap the read. Include both + ;; :ok and :info since either may have taken effect. concurrent (concurrent-mutations-for-member muts read-inv-idx read-cmp-idx) - scores (cond-> #{} - (and committed-state (:present? committed-state)) - (conj (:score committed-state))) - scores (reduce - (fn [acc m] - (case (:f m) - :zadd (conj acc (:score m)) - :zincrby (cond-> acc (some? (:score m)) (conj (:score m))) - :zrem acc)) - scores - concurrent) - unknown-score? (some #(and (= :zincrby (:f %)) - (:unknown-score? %)) - concurrent) + ;; A conservative last-wins linearization for the must-be-present? + ;; check only. Ambiguous when committed writes overlap each other. + committed-sorted (sort-by :complete-idx committed) + committed-state (reduce apply-mutation-to-state nil committed-sorted) + committed-overlap? (boolean + (some (fn [[a b]] + (and (not (identical? a b)) + (<= (:invoke-idx a) (:complete-idx b)) + (<= (:invoke-idx b) (:complete-idx a)))) + (for [a committed, b committed] [a b]))) + ;; Union of every score that any committed / pre-read :info / + ;; concurrent op could have produced. This over-approximates the + ;; legitimate post-state set when writes overlap, keeping the + ;; checker sound at the cost of being slightly less strict on + ;; overlapping concurrent writers. + add-scores (fn [acc m] + (case (:f m) + :zadd (conj acc (:score m)) + :zincrby (cond-> acc (some? (:score m)) (conj (:score m))) + :zrem acc)) + scores (as-> #{} s + (reduce add-scores s committed) + (reduce add-scores s pre-read-info) + (reduce add-scores s concurrent)) + unknown-score? (or + (some #(and (= :zincrby (:f %)) (:unknown-score? %)) + concurrent) + (some #(and (= :zincrby (:f %)) (:unknown-score? %)) + pre-read-info)) ;; any-known? must only be true when something provides evidence ;; the member actually existed at some point. A no-op ZREM - ;; (:removed? false) does NOT prove existence -- it explicitly - ;; says the member wasn't there. Without this guard, reads of a - ;; never-added member that happened to race with a no-op ZREM - ;; would flip from :phantom to :score-mismatch with an empty - ;; :allowed set, producing a false positive. - existence-evidence? (or (boolean committed-state) + ;; (:removed? false) does NOT prove existence. + existence-evidence? (or (some #(case (:f %) + (:zadd :zincrby) true + :zrem (:removed? %)) + committed) + (some #(case (:f %) + (:zadd :zincrby) true + :zrem (:removed? %)) + pre-read-info) (some #(case (:f %) (:zadd :zincrby) true :zrem (:removed? %)) concurrent))] {:scores scores :unknown-score? (boolean unknown-score?) - :must-be-present? (boolean (and committed-state (:present? committed-state) + ;; must-be-present? is relaxed when committed writes overlap + ;; among themselves or when any :info / concurrent mutation could + ;; have removed the member before the read. + :must-be-present? (boolean (and committed-state + (:present? committed-state) + (not committed-overlap?) + (empty? pre-read-info) (empty? concurrent))) :any-known? (boolean existence-evidence?)})) +(defn- duplicate-members + "Return the set of members that appear more than once in entries." + [entries] + (->> entries + (map first) + frequencies + (keep (fn [[m n]] (when (> n 1) m))) + set)) + (defn- check-zrange-all [mutations-by-m {:keys [invoke complete] :as _pair}] (let [entries (:value complete) @@ -394,6 +437,14 @@ (swap! errors conj {:kind :unsorted :index cmp-idx :entries entries})) + ;; 1b. No duplicate members: a ZSet read must return each member at + ;; most once. A duplicate-member result could otherwise satisfy + ;; ordering and score-membership checks while hiding a real bug. + (let [dupes (duplicate-members entries)] + (when (seq dupes) + (swap! errors conj {:kind :duplicate-members + :index cmp-idx + :members dupes}))) ;; 2. For each observed (member,score): validate score and non-phantom (doseq [[member score] entries] (let [{:keys [scores any-known? unknown-score?]} @@ -439,6 +490,12 @@ :index cmp-idx :bounds bounds :members members})) + (let [dupes (duplicate-members members)] + (when (seq dupes) + (swap! errors conj {:kind :duplicate-members-range + :index cmp-idx + :bounds bounds + :members dupes}))) ;; Observed members must be within bounds AND have a known allowed score. (doseq [[member score] members] (when (or (< score lo) (> score hi)) diff --git a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj index 5c59dcd6f..ee7f1dc85 100644 --- a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj +++ b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj @@ -87,3 +87,56 @@ {:type :ok :process 0 :f :zrem :value ["never-added" false] :index 3}] result (run-checker history)] (is (:valid? result) (str "expected valid, got: " result)))) + +(deftest duplicate-members-are-flagged + ;; CodeRabbit finding: ZRANGE must not return the same member twice. + ;; With a hypothetical committed + concurrent score for the same + ;; member, a duplicate could sneak past sort + score-membership + ;; checks. Enforce distinctness explicitly. + (let [history [{:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} + {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 1} + {:type :invoke :process 0 :f :zrange-all :index 2} + {:type :ok :process 0 :f :zrange-all + :value [["m1" 1.0] ["m1" 1.0]] :index 3}] + result (run-checker history)] + (is (not (:valid? result)) (str "expected duplicate-members error, got: " result)))) + +(deftest overlapping-committed-zadds-allow-either-score + ;; CodeRabbit finding: two :ok ZADDs for the same member whose + ;; invoke/complete windows overlap have ambiguous serialization + ;; order. Either's resulting score is a valid post-state; the checker + ;; must not pin to the higher :complete-idx value only. + ;; Timeline (overlap between A's [invoke=0, complete=3] and + ;; B's [invoke=1, complete=2]): + (let [history [{:type :invoke :process 0 :f :zadd :value ["m1" 5] :index 0} + {:type :invoke :process 1 :f :zadd :value ["m1" 9] :index 1} + {:type :ok :process 1 :f :zadd :value ["m1" 9] :index 2} + {:type :ok :process 0 :f :zadd :value ["m1" 5] :index 3} + ;; Post-commit: either 5 or 9 is a valid final score. + ;; A read observing 5 must NOT be flagged as mismatch. + {:type :invoke :process 2 :f :zrange-all :index 4} + {:type :ok :process 2 :f :zrange-all + :value [["m1" 5.0]] :index 5}] + result (run-checker history)] + (is (:valid? result) + (str "expected valid under overlapping-writes relaxation, got: " result)))) + +(deftest info-before-read-is-considered-uncertain + ;; CodeRabbit finding: an :info mutation that completed before a + ;; later read may have taken effect. It must be considered a possible + ;; source of state for that read, rather than being ignored by both + ;; model-before and the concurrent window. + (let [history [;; Add m1 with score 1. + {:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} + {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 1} + ;; ZINCRBY m1 by 5 -- response lost, recorded :info. + {:type :invoke :process 1 :f :zincrby :value ["m1" 5] :index 2} + {:type :info :process 1 :f :zincrby :value ["m1" 5] :index 3} + ;; Later read observes m1 at score 6 (increment applied + ;; server-side before the response was lost). Valid. + {:type :invoke :process 2 :f :zrange-all :index 4} + {:type :ok :process 2 :f :zrange-all + :value [["m1" 6.0]] :index 5}] + result (run-checker history)] + (is (:valid? result) + (str "expected :info-before-read to skip strict score check, got: " result)))) From ecb39831e5af88c24a8f6830668e905e7eee5cc8 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Wed, 22 Apr 2026 23:46:43 +0900 Subject: [PATCH 06/28] fix(jepsen): correct ZSet checker for infinity, stale reads, and :info ops Address correctness issues in the Redis ZSet safety workload raised in PR #550 review: - parse-double-safe: Redis emits "inf"/"+inf"/"-inf" for infinite ZSET scores, which Double/parseDouble rejects. Route parse-withscores and the ZINCRBY reply through a tolerant parser that maps those to +/- Double/POSITIVE_INFINITY. - allowed-scores-for-member: tighten committed-score admissibility to real-time "candidates" (preceding :ok mutations not strictly followed in real time by another preceding :ok mutation). Superseded committed scores are no longer admissible, closing a stale-read soundness gap. - can-be-present?: replace the phantom-only check with a presence check that rejects both phantoms (never existed) and stale reads (member committed-removed before the read with no concurrent re-add). - CLI dispatch: -main now selects a workload by the first positional arg (elastickv-test, elastickv-zset-safety-test, elastickv-dynamodb- test, elastickv-s3-test) and auto-prepends the jepsen "test" subcommand when missing, matching the invocation documented in the PR description. - Tests: cover phantom, stale-read-after-committed-zrem, superseded committed score, and parse-withscores infinity handling. --- jepsen/src/elastickv/jepsen_test.clj | 36 +++- .../elastickv/redis_zset_safety_workload.clj | 187 +++++++++++------- .../redis_zset_safety_workload_test.clj | 76 +++++++ 3 files changed, 228 insertions(+), 71 deletions(-) diff --git a/jepsen/src/elastickv/jepsen_test.clj b/jepsen/src/elastickv/jepsen_test.clj index e40a8cb6f..3fc9b5e21 100644 --- a/jepsen/src/elastickv/jepsen_test.clj +++ b/jepsen/src/elastickv/jepsen_test.clj @@ -18,6 +18,40 @@ (defn elastickv-zset-safety-test [] (zset-safety-workload/elastickv-zset-safety-test {})) +(def ^:private test-fns + "Map of user-facing test names to their constructor fns. The first + positional CLI arg selects which workload runs; if absent or unknown, + we default to `elastickv-test` for backward compatibility with + pre-existing invocations." + {"elastickv-test" elastickv-test + "elastickv-zset-safety-test" elastickv-zset-safety-test + "elastickv-dynamodb-test" elastickv-dynamodb-test + "elastickv-s3-test" elastickv-s3-test}) + (defn -main + "Dispatch to a named workload. Usage: + + lein run -m elastickv.jepsen-test [jepsen-subcmd] [jepsen-opts ...] + + Supported s: elastickv-test, elastickv-zset-safety-test, + elastickv-dynamodb-test, elastickv-s3-test. When the first positional + arg is not a known test name, we default to `elastickv-test` for + backward compatibility and forward ALL args to jepsen.cli/run!. + + The jepsen subcommand (`test` or `analyze`) is auto-prepended when + missing, so `lein run elastickv-zset-safety-test --nodes n1,n2` works + without the user repeating `test`." [& args] - (cli/run! (cli/single-test-cmd {:test-fn elastickv-test}) args)) + (let [[head & tail] args + [selected-fn remaining-args] (if-let [f (get test-fns head)] + [f tail] + [elastickv-test args]) + ;; jepsen.cli/run! requires a subcommand ("test" or "analyze") + ;; as the first arg. Insert "test" if absent so users don't + ;; have to type it twice. + [next-head & _] remaining-args + final-args (if (#{"test" "analyze"} next-head) + remaining-args + (cons "test" remaining-args))] + (cli/run! (cli/single-test-cmd {:test-fn selected-fn}) + final-args))) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index 549845e40..c4d71a990 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -68,6 +68,24 @@ ;; Client ;; --------------------------------------------------------------------------- +(defn- parse-double-safe + "Parse a Redis score string into a Double. Redis serializes infinite + scores as \"inf\" / \"+inf\" / \"-inf\", which Java's Double/parseDouble + does not accept (it expects \"Infinity\" / \"-Infinity\"). Handle both + encodings so the checker doesn't throw on infinite ZSET scores." + [s] + (let [raw (str s) + lower (str/lower-case raw)] + (cond + (or (= lower "inf") (= lower "+inf") (= lower "infinity") (= lower "+infinity")) + Double/POSITIVE_INFINITY + + (or (= lower "-inf") (= lower "-infinity")) + Double/NEGATIVE_INFINITY + + :else + (Double/parseDouble raw)))) + (defn- parse-withscores "Carmine returns a flat [member score member score ...] vector for ZRANGE WITHSCORES. Convert to a sorted vector of [member (double score)] @@ -77,7 +95,7 @@ (partition 2) (mapv (fn [[m s]] [(if (bytes? m) (String. ^bytes m) (str m)) - (Double/parseDouble (str s))])))) + (parse-double-safe s)])))) (defrecord ElastickvRedisZSetSafetyClient [node->port conn-spec] client/Client @@ -113,7 +131,7 @@ (let [[member delta] (:value op) new-score (car/wcar cs (car/zincrby zset-key (double delta) member))] (assoc op :type :ok - :value [member (Double/parseDouble (str new-score))])) + :value [member (parse-double-safe new-score)])) :zrem (let [member (:value op) @@ -327,33 +345,65 @@ [muts read-inv-idx read-cmp-idx] (filter #(concurrent? % read-inv-idx read-cmp-idx) muts)) +(defn- write-op? + "True iff the mutation adds/updates the member's score (i.e. would + make the member present). :zrem is NOT a write-op here." + [m] + (#{:zadd :zincrby} (:f m))) + +(defn- existence-evidence? + "A mutation proves that the member existed at some point iff it is a + write-op, or a ZREM whose :removed? flag is true. No-op ZREMs + (:removed? false) do NOT prove existence." + [m] + (case (:f m) + (:zadd :zincrby) true + :zrem (boolean (:removed? m)) + false)) + (defn- allowed-scores-for-member "Compute the set of scores considered valid for `member` by a read whose window is [read-inv-idx, read-cmp-idx], based on committed state - and any concurrent mutations. + and any concurrent/uncertain mutations. + + Linearizability demands a read observes either (a) the latest committed + state in real-time order, or (b) the effect of a write still concurrent + with the read. We therefore restrict the committed score set to + 'candidates' — committed mutations NOT strictly followed in real time + by another committed mutation (i.e. no other committed op's invoke + comes after this op's completion). Scores from strictly superseded + committed mutations are NOT admissible. Returns: - :scores - set of acceptable scores (committed + concurrent - :zadd / :ok :zincrby). - :unknown-score? - true iff any concurrent ZINCRBY's resulting score - is unknown (in-flight or :info). When set, the - caller MUST skip the strict score-membership + :scores - set of acceptable scores (from candidate + committed ops + pre-read :info + concurrent + writes with a known score). + :unknown-score? - true iff any concurrent / pre-read :info + ZINCRBY's resulting score is unknown. When set, + the caller MUST skip the strict score-membership check to stay sound. - :must-be-present? - committed state says present and no concurrent - mutation could have removed/changed it. - :any-known? - some op claims to have touched this member." + :can-be-present? - true iff the member may legitimately appear in + the read (some candidate/concurrent/pre-read + :info op is a write, OR a concurrent/pre-read + :info ZREM leaves presence uncertain). + :must-be-present? - true iff a candidate committed state says + present and nothing concurrent/pre-read :info + could have removed it." [mutations-by-m member read-inv-idx read-cmp-idx] (let [muts (get mutations-by-m member []) - ;; :ok mutations that completed strictly before the read. They - ;; may have overlapped with each other in wall-clock time, so - ;; the serialization order among them is ambiguous. - committed (->> muts + ;; :ok mutations that completed strictly before the read. + preceding (->> muts (filter #(and (= :ok (:type %)) (some? (:complete-idx %)) (< (:complete-idx %) read-inv-idx)))) + ;; Real-time "last-wins" candidate filter: a preceding mutation + ;; m is admissible iff no OTHER preceding mutation m' has + ;; m'.invoke-idx > m.complete-idx (i.e. m' strictly follows m). + ;; Equivalent: m.complete-idx >= max(invoke-idx) over preceding. + max-inv (reduce max -1 (map :invoke-idx preceding)) + candidates (filterv #(>= (:complete-idx %) max-inv) preceding) ;; :info mutations that completed before the read: they may or - ;; may not have taken effect server-side. We must account for - ;; their possible scores just like concurrent ones. + ;; may not have taken effect server-side. pre-read-info (->> muts (filter #(and (= :info (:type %)) (some? (:complete-idx %)) @@ -361,61 +411,55 @@ ;; Concurrent mutations: windows overlap the read. Include both ;; :ok and :info since either may have taken effect. concurrent (concurrent-mutations-for-member muts read-inv-idx read-cmp-idx) - ;; A conservative last-wins linearization for the must-be-present? - ;; check only. Ambiguous when committed writes overlap each other. - committed-sorted (sort-by :complete-idx committed) - committed-state (reduce apply-mutation-to-state nil committed-sorted) - committed-overlap? (boolean - (some (fn [[a b]] - (and (not (identical? a b)) - (<= (:invoke-idx a) (:complete-idx b)) - (<= (:invoke-idx b) (:complete-idx a)))) - (for [a committed, b committed] [a b]))) - ;; Union of every score that any committed / pre-read :info / - ;; concurrent op could have produced. This over-approximates the - ;; legitimate post-state set when writes overlap, keeping the - ;; checker sound at the cost of being slightly less strict on - ;; overlapping concurrent writers. + add-scores (fn [acc m] (case (:f m) :zadd (conj acc (:score m)) :zincrby (cond-> acc (some? (:score m)) (conj (:score m))) :zrem acc)) + ;; Admissible scores: candidate committed + pre-read :info + + ;; concurrent writes (with a known score). scores (as-> #{} s - (reduce add-scores s committed) + (reduce add-scores s candidates) (reduce add-scores s pre-read-info) (reduce add-scores s concurrent)) - unknown-score? (or - (some #(and (= :zincrby (:f %)) (:unknown-score? %)) - concurrent) - (some #(and (= :zincrby (:f %)) (:unknown-score? %)) - pre-read-info)) - ;; any-known? must only be true when something provides evidence - ;; the member actually existed at some point. A no-op ZREM - ;; (:removed? false) does NOT prove existence. - existence-evidence? (or (some #(case (:f %) - (:zadd :zincrby) true - :zrem (:removed? %)) - committed) - (some #(case (:f %) - (:zadd :zincrby) true - :zrem (:removed? %)) - pre-read-info) - (some #(case (:f %) - (:zadd :zincrby) true - :zrem (:removed? %)) - concurrent))] - {:scores scores - :unknown-score? (boolean unknown-score?) - ;; must-be-present? is relaxed when committed writes overlap - ;; among themselves or when any :info / concurrent mutation could - ;; have removed the member before the read. - :must-be-present? (boolean (and committed-state - (:present? committed-state) - (not committed-overlap?) + + has-unknown-incr? (fn [coll] + (some #(and (= :zincrby (:f %)) + (:unknown-score? %)) + coll)) + unknown-score? (or (has-unknown-incr? concurrent) + (has-unknown-incr? pre-read-info)) + + ;; Did any candidate commit establish presence (write, or + ;; ZREM with :removed? -- either way the member existed)? + candidate-state (reduce apply-mutation-to-state nil + (sort-by :complete-idx candidates)) + candidate-present? (boolean (:present? candidate-state)) + + any-concurrent-could-write? (or (some write-op? concurrent) + (some write-op? pre-read-info)) + any-concurrent-could-remove? (or (some #(= :zrem (:f %)) concurrent) + (some #(= :zrem (:f %)) pre-read-info)) + + can-be-present? (or candidate-present? + any-concurrent-could-write? + ;; A :zrem with :removed? true still proves + ;; existence; if a concurrent ZREM raced + ;; with an earlier write whose window is + ;; not captured as a candidate, presence is + ;; uncertain rather than forbidden. + (and (some existence-evidence? (concat concurrent + pre-read-info)) + any-concurrent-could-remove?)) + + must-be-present? (boolean (and candidate-present? (empty? pre-read-info) - (empty? concurrent))) - :any-known? (boolean existence-evidence?)})) + (empty? concurrent)))] + {:scores scores + :unknown-score? (boolean unknown-score?) + :can-be-present? (boolean can-be-present?) + :must-be-present? must-be-present?})) (defn- duplicate-members "Return the set of members that appear more than once in entries." @@ -445,13 +489,16 @@ (swap! errors conj {:kind :duplicate-members :index cmp-idx :members dupes}))) - ;; 2. For each observed (member,score): validate score and non-phantom + ;; 2. For each observed (member,score): validate presence + score. + ;; can-be-present? catches both phantoms (member never existed) + ;; and stale reads (member committed-removed before the read + ;; with no concurrent re-add). (doseq [[member score] entries] - (let [{:keys [scores any-known? unknown-score?]} + (let [{:keys [scores can-be-present? unknown-score?]} (allowed-scores-for-member mutations-by-m member inv-idx cmp-idx)] (cond - (not any-known?) - (swap! errors conj {:kind :phantom + (not can-be-present?) + (swap! errors conj {:kind :unexpected-presence :index cmp-idx :member member :score score}) @@ -504,11 +551,11 @@ :bounds bounds :member member :score score})) - (let [{:keys [scores any-known? unknown-score?]} + (let [{:keys [scores can-be-present? unknown-score?]} (allowed-scores-for-member mutations-by-m member inv-idx cmp-idx)] (cond - (not any-known?) - (swap! errors conj {:kind :phantom-range + (not can-be-present?) + (swap! errors conj {:kind :unexpected-presence-range :index cmp-idx :member member :score score}) diff --git a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj index ee7f1dc85..e4ff8da14 100644 --- a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj +++ b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj @@ -140,3 +140,79 @@ result (run-checker history)] (is (:valid? result) (str "expected :info-before-read to skip strict score check, got: " result)))) + +;; --------------------------------------------------------------------------- +;; Stale-read / phantom / superseded-committed checks (gemini HIGH) +;; --------------------------------------------------------------------------- + +(deftest phantom-member-is-flagged + ;; gemini HIGH: a read that observes a member which was never added + ;; (no ZADD/ZINCRBY/true-ZREM anywhere) must be rejected. + (let [history [{:type :invoke :process 0 :f :zrange-all :index 0} + {:type :ok :process 0 :f :zrange-all + :value [["never-added" 42.0]] :index 1}] + result (run-checker history) + kinds (set (map :kind (:first-errors result)))] + (is (not (:valid? result)) (str "expected phantom error, got: " result)) + (is (contains? kinds :unexpected-presence) + (str "expected :unexpected-presence, got kinds=" kinds)))) + +(deftest stale-read-after-committed-zrem-is-flagged + ;; gemini HIGH: once a ZADD and a subsequent real (:removed? true) ZREM + ;; have BOTH committed (with no concurrent re-add), a later read that + ;; still sees the member must be rejected as a stale read. + (let [history [;; Add then remove m1 — both committed before any read. + {:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} + {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 1} + {:type :invoke :process 0 :f :zrem :value "m1" :index 2} + {:type :ok :process 0 :f :zrem :value ["m1" true] :index 3} + ;; Stale read: m1 somehow still appears. + {:type :invoke :process 1 :f :zrange-all :index 4} + {:type :ok :process 1 :f :zrange-all + :value [["m1" 1.0]] :index 5}] + result (run-checker history) + kinds (set (map :kind (:first-errors result)))] + (is (not (:valid? result)) (str "expected stale-read error, got: " result)) + (is (contains? kinds :unexpected-presence) + (str "expected :unexpected-presence, got kinds=" kinds)))) + +(deftest superseded-committed-score-is-not-allowed + ;; gemini HIGH: a ZADD committed BEFORE another ZADD for the same + ;; member whose invoke strictly followed it should not be treated as + ;; a valid post-state score. Only the latest committed score (plus + ;; concurrent in-flight) may be observed. + (let [history [;; ZADD m1 1 commits first ... + {:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} + {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 1} + ;; ... then ZADD m1 2 is invoked strictly after, and + ;; also commits before the read. + {:type :invoke :process 0 :f :zadd :value ["m1" 2] :index 2} + {:type :ok :process 0 :f :zadd :value ["m1" 2] :index 3} + ;; Read observing the SUPERSEDED score 1.0 — invalid. + {:type :invoke :process 1 :f :zrange-all :index 4} + {:type :ok :process 1 :f :zrange-all + :value [["m1" 1.0]] :index 5}] + result (run-checker history)] + (is (not (:valid? result)) + (str "expected superseded-score mismatch, got: " result)))) + +;; --------------------------------------------------------------------------- +;; Infinity score parsing +;; --------------------------------------------------------------------------- + +(deftest parse-withscores-handles-inf-strings + ;; gemini HIGH: Redis returns "inf" / "+inf" / "-inf" for infinite + ;; ZSET scores. Double/parseDouble expects "Infinity"; the workload's + ;; parser must normalize both encodings instead of throwing. + (let [flat ["m-pos" "inf" + "m-pos2" "+inf" + "m-neg" "-inf" + "m-jvm" "Infinity" + "m-num" "3.5"] + parsed (#'workload/parse-withscores flat)] + (is (= [["m-pos" Double/POSITIVE_INFINITY] + ["m-pos2" Double/POSITIVE_INFINITY] + ["m-neg" Double/NEGATIVE_INFINITY] + ["m-jvm" Double/POSITIVE_INFINITY] + ["m-num" 3.5]] + parsed)))) From 2a194a4b0e99d950bf514ac0cecbb00c36b1acd6 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 00:46:57 +0900 Subject: [PATCH 07/28] fix(jepsen): accept linearization of concurrent ops and uncertain mutations in checker - allowed-scores-for-member: replace complete-idx-ordered candidate-present? with linearization-aware semantics. When multiple candidates share overlapping windows, presence is allowed iff some linearization keeps the member present; presence is required only when every linearization does. Uncertain ZREMs (pre-read :info / concurrent) now correctly relax must-be-present?. - check-zrange-all / check-zrangebyscore: delegate completeness to must-be-present? so :info or concurrent ZREMs don't false-positive missing-member. - check-zrangebyscore: gate completeness on a new score-definitely-in-range? helper so uncertain ZINCRBY (unknown resulting score) can't trigger false :missing-member-range. - Tests: concurrent ZADD+ZREM accepts either outcome; :info ZREM allows absent read; :info ZINCRBY with out-of-range guess does not flag completeness; sanity tests still flag truly-missing members. --- .../elastickv/redis_zset_safety_workload.clj | 156 +++++++++++------- .../redis_zset_safety_workload_test.clj | 96 +++++++++++ 2 files changed, 193 insertions(+), 59 deletions(-) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index c4d71a990..a65823c4c 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -374,6 +374,12 @@ comes after this op's completion). Scores from strictly superseded committed mutations are NOT admissible. + When multiple candidates remain (their windows overlap), they can + serialize in any real-time-consistent order: the read may legitimately + observe the outcome of any of them. Thus presence is required only + when EVERY admissible serialization leaves the member present; presence + is forbidden only when EVERY admissible serialization leaves it absent. + Returns: :scores - set of acceptable scores (from candidate committed ops + pre-read :info + concurrent @@ -382,13 +388,12 @@ ZINCRBY's resulting score is unknown. When set, the caller MUST skip the strict score-membership check to stay sound. - :can-be-present? - true iff the member may legitimately appear in - the read (some candidate/concurrent/pre-read - :info op is a write, OR a concurrent/pre-read - :info ZREM leaves presence uncertain). - :must-be-present? - true iff a candidate committed state says - present and nothing concurrent/pre-read :info - could have removed it." + :can-be-present? - true iff SOME admissible linearization leaves + the member present. + :must-be-present? - true iff EVERY admissible linearization leaves + the member present (i.e. some candidate is a + write, no candidate is a ZREM, and no uncertain + ZREM could have applied before the read)." [mutations-by-m member read-inv-idx read-cmp-idx] (let [muts (get mutations-by-m member []) ;; :ok mutations that completed strictly before the read. @@ -400,6 +405,8 @@ ;; m is admissible iff no OTHER preceding mutation m' has ;; m'.invoke-idx > m.complete-idx (i.e. m' strictly follows m). ;; Equivalent: m.complete-idx >= max(invoke-idx) over preceding. + ;; When multiple candidates remain, they have overlapping + ;; windows and may serialize in any real-time-consistent order. max-inv (reduce max -1 (map :invoke-idx preceding)) candidates (filterv #(>= (:complete-idx %) max-inv) preceding) ;; :info mutations that completed before the read: they may or @@ -411,6 +418,9 @@ ;; Concurrent mutations: windows overlap the read. Include both ;; :ok and :info since either may have taken effect. concurrent (concurrent-mutations-for-member muts read-inv-idx read-cmp-idx) + ;; Uncertain mutations: anything whose effect on the read is not + ;; fully determined by committed real-time order alone. + uncertain (concat pre-read-info concurrent) add-scores (fn [acc m] (case (:f m) @@ -421,46 +431,65 @@ ;; concurrent writes (with a known score). scores (as-> #{} s (reduce add-scores s candidates) - (reduce add-scores s pre-read-info) - (reduce add-scores s concurrent)) + (reduce add-scores s uncertain)) has-unknown-incr? (fn [coll] (some #(and (= :zincrby (:f %)) (:unknown-score? %)) coll)) - unknown-score? (or (has-unknown-incr? concurrent) - (has-unknown-incr? pre-read-info)) - - ;; Did any candidate commit establish presence (write, or - ;; ZREM with :removed? -- either way the member existed)? - candidate-state (reduce apply-mutation-to-state nil - (sort-by :complete-idx candidates)) - candidate-present? (boolean (:present? candidate-state)) - - any-concurrent-could-write? (or (some write-op? concurrent) - (some write-op? pre-read-info)) - any-concurrent-could-remove? (or (some #(= :zrem (:f %)) concurrent) - (some #(= :zrem (:f %)) pre-read-info)) - - can-be-present? (or candidate-present? - any-concurrent-could-write? - ;; A :zrem with :removed? true still proves - ;; existence; if a concurrent ZREM raced - ;; with an earlier write whose window is - ;; not captured as a candidate, presence is - ;; uncertain rather than forbidden. - (and (some existence-evidence? (concat concurrent - pre-read-info)) - any-concurrent-could-remove?)) - - must-be-present? (boolean (and candidate-present? - (empty? pre-read-info) - (empty? concurrent)))] + ;; Uncertain score set when any ZINCRBY is concurrent/uncertain + ;; (its resulting score is unknown), OR when multiple concurrent + ;; increments could yield intermediate prefix-sum scores that + ;; are not in :scores. + unknown-score? (or (has-unknown-incr? uncertain) + (some #(= :zincrby (:f %)) uncertain)) + + any-candidate-write? (some write-op? candidates) + any-candidate-zrem? (some #(= :zrem (:f %)) candidates) + any-uncertain-write? (some write-op? uncertain) + any-uncertain-zrem? (some #(= :zrem (:f %)) uncertain) + + ;; Some linearization of candidates ends with the member + ;; present. Because candidates have overlapping windows (they + ;; all share the same max-inv), any of them can serialize last. + ;; So presence is allowed iff at least one candidate is a write. + candidate-can-be-present? (boolean any-candidate-write?) + ;; Some linearization of candidates ends with the member absent. + candidate-can-be-absent? (or (empty? candidates) + (boolean any-candidate-zrem?)) + + ;; can-be-present?: at least one admissible linearization + ;; (candidates + uncertain) ends with the member present. + ;; An uncertain write (or an uncertain :zrem combined with + ;; existence evidence) can flip an otherwise-absent candidate + ;; outcome to present by reordering after a write. + can-be-present? (or candidate-can-be-present? + any-uncertain-write? + (and any-uncertain-zrem? + (some existence-evidence? uncertain))) + + ;; must-be-present?: EVERY admissible linearization ends with + ;; the member present. Requires the candidate outcome to be + ;; always-present (candidate write, no candidate zrem) AND no + ;; uncertain zrem that could reorder last to remove it. + must-be-present? (boolean (and any-candidate-write? + (not candidate-can-be-absent?) + (not any-uncertain-zrem?)))] {:scores scores :unknown-score? (boolean unknown-score?) :can-be-present? (boolean can-be-present?) :must-be-present? must-be-present?})) +(defn- score-definitely-in-range? + "True iff the member's committed score is definitively in [lo, hi] + for the purposes of completeness: every candidate score is inside the + range AND no uncertain/concurrent mutation could have produced an + unknown or out-of-range score. Used by ZRANGEBYSCORE completeness." + [scores unknown-score? lo hi] + (boolean (and (not unknown-score?) + (seq scores) + (every? #(<= lo % hi) scores)))) + (defn- duplicate-members "Return the set of members that appear more than once in entries." [entries] @@ -513,16 +542,20 @@ :observed score :allowed scores})))) ;; 3. Completeness: model-required members must appear. + ;; A member is required-present only if every admissible + ;; linearization leaves it present (must-be-present?). This + ;; correctly skips members that an :info or concurrent ZREM + ;; might have removed before the read. (let [model (model-before mutations-by-m inv-idx) observed-members (into #{} (map first) entries)] - (doseq [[member {:keys [present?]}] model] - (when (and present? (not (contains? observed-members member))) - (let [muts (get mutations-by-m member []) - concurrent (concurrent-mutations-for-member muts inv-idx cmp-idx)] - (when (empty? concurrent) - (swap! errors conj {:kind :missing-member - :index cmp-idx - :member member})))))) + (doseq [[member _] model] + (let [{:keys [must-be-present?]} + (allowed-scores-for-member mutations-by-m member inv-idx cmp-idx)] + (when (and must-be-present? + (not (contains? observed-members member))) + (swap! errors conj {:kind :missing-member + :index cmp-idx + :member member}))))) @errors)) (defn- check-zrangebyscore @@ -566,22 +599,27 @@ :member member :observed score :allowed scores})))) - ;; Completeness within bounds: any model member whose committed score - ;; is in [lo,hi] with no concurrent mutation must appear. + ;; Completeness within bounds: a model member must appear only when + ;; (a) every admissible linearization leaves it present + ;; (must-be-present?), AND + ;; (b) its score is definitively within [lo, hi] across all + ;; admissible linearizations (no uncertain ZINCRBY, every + ;; candidate score inside the bounds). + ;; Uncertain scores (concurrent/:info ZINCRBY) must NOT cause + ;; completeness failures when the resulting score is unknown. (let [model (model-before mutations-by-m inv-idx) observed-members (into #{} (map first) members)] - (doseq [[member {:keys [present? score]}] model] - (when (and present? - (<= lo score hi) - (not (contains? observed-members member))) - (let [muts (get mutations-by-m member []) - concurrent (concurrent-mutations-for-member muts inv-idx cmp-idx)] - (when (empty? concurrent) - (swap! errors conj {:kind :missing-member-range - :index cmp-idx - :bounds bounds - :member member - :expected-score score})))))) + (doseq [[member _] model] + (let [{:keys [must-be-present? scores unknown-score?]} + (allowed-scores-for-member mutations-by-m member inv-idx cmp-idx)] + (when (and must-be-present? + (score-definitely-in-range? scores unknown-score? lo hi) + (not (contains? observed-members member))) + (swap! errors conj {:kind :missing-member-range + :index cmp-idx + :bounds bounds + :member member + :expected-score (first scores)}))))) @errors)) (defn zset-safety-checker diff --git a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj index e4ff8da14..0ab9b6772 100644 --- a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj +++ b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj @@ -200,6 +200,102 @@ ;; Infinity score parsing ;; --------------------------------------------------------------------------- +;; --------------------------------------------------------------------------- +;; Linearization of concurrent ops / uncertain mutations (gemini HIGH batch 2) +;; --------------------------------------------------------------------------- + +(deftest concurrent-zadd-zrem-both-completed-accepts-either-outcome + ;; gemini HIGH: ZADD and ZREM for the same member whose invoke/complete + ;; windows overlap (both commit before the read) have ambiguous + ;; linearization. A linearizable store may serialize either one last, + ;; so the read legitimately observes EITHER [["m1" 1.0]] OR []. + ;; Windows: ZADD=[inv=0, cmp=3], ZREM=[inv=1, cmp=2] — overlap. + (let [base [{:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} + {:type :invoke :process 1 :f :zrem :value "m1" :index 1} + {:type :ok :process 1 :f :zrem :value ["m1" true] :index 2} + {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 3}] + hist-present (conj base + {:type :invoke :process 2 :f :zrange-all :index 4} + {:type :ok :process 2 :f :zrange-all + :value [["m1" 1.0]] :index 5}) + hist-absent (conj base + {:type :invoke :process 2 :f :zrange-all :index 4} + {:type :ok :process 2 :f :zrange-all + :value [] :index 5})] + (is (:valid? (run-checker hist-present)) + "expected read observing ZADD's outcome to be accepted") + (is (:valid? (run-checker hist-absent)) + "expected read observing ZREM's outcome (absent) to be accepted"))) + +(deftest info-zrem-concurrent-with-read-allows-missing-member + ;; gemini HIGH: an :info ZREM that might have applied before a read + ;; leaves the member's presence uncertain. A ZRANGE that omits the + ;; member must NOT be flagged as a completeness failure. + (let [history [;; ZADD m1 committed before the read. + {:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} + {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 1} + ;; ZREM m1 is invoked, then the read runs, then the + ;; ZREM response is lost (:info). The ZREM may or may + ;; not have applied server-side. + {:type :invoke :process 1 :f :zrem :value "m1" :index 2} + {:type :invoke :process 0 :f :zrange-all :index 3} + {:type :ok :process 0 :f :zrange-all :value [] :index 4} + {:type :info :process 1 :f :zrem :value "m1" :index 5}] + result (run-checker history)] + (is (:valid? result) + (str "expected :info ZREM to make absence acceptable, got: " result)))) + +(deftest info-zincrby-does-not-flag-zrangebyscore-completeness + ;; gemini HIGH: a pre-read :info / concurrent ZINCRBY leaves the + ;; resulting score unknown. ZRANGEBYSCORE filtering on a specific + ;; range must not flag the member as missing, because its score may + ;; have moved outside [lo, hi]. + (let [history [;; ZADD m1 at score 1 (committed well before read). + {:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} + {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 1} + ;; ZINCRBY m1 +100 — response lost (:info) BEFORE read. + {:type :invoke :process 1 :f :zincrby :value ["m1" 100] :index 2} + {:type :info :process 1 :f :zincrby :value ["m1" 100] :index 3} + ;; ZRANGEBYSCORE [0, 10] — m1's score is uncertain; it + ;; may now be 101 (outside range) or still 1. The + ;; checker must not complain about m1's absence. + {:type :invoke :process 2 :f :zrangebyscore :value [0.0 10.0] :index 4} + {:type :ok :process 2 :f :zrangebyscore + :value {:bounds [0.0 10.0] :members []} :index 5}] + result (run-checker history)] + (is (:valid? result) + (str "expected :info ZINCRBY to skip completeness, got: " result)))) + +(deftest zrangebyscore-completeness-still-detects-truly-missing-member + ;; Sanity: when NO uncertainty exists and a model member's committed + ;; score is definitively inside [lo, hi], its absence IS flagged. + (let [history [{:type :invoke :process 0 :f :zadd :value ["m1" 5] :index 0} + {:type :ok :process 0 :f :zadd :value ["m1" 5] :index 1} + {:type :invoke :process 0 :f :zrangebyscore :value [0.0 10.0] :index 2} + {:type :ok :process 0 :f :zrangebyscore + :value {:bounds [0.0 10.0] :members []} :index 3}] + result (run-checker history) + kinds (set (map :kind (:first-errors result)))] + (is (not (:valid? result)) (str "expected missing-member-range, got: " result)) + (is (contains? kinds :missing-member-range) + (str "expected :missing-member-range, got kinds=" kinds)))) + +(deftest zrange-completeness-still-detects-truly-missing-member + ;; Sanity: no uncertainty, member committed-present. Absence flagged. + (let [history [{:type :invoke :process 0 :f :zadd :value ["m1" 5] :index 0} + {:type :ok :process 0 :f :zadd :value ["m1" 5] :index 1} + {:type :invoke :process 0 :f :zrange-all :index 2} + {:type :ok :process 0 :f :zrange-all :value [] :index 3}] + result (run-checker history) + kinds (set (map :kind (:first-errors result)))] + (is (not (:valid? result)) (str "expected missing-member, got: " result)) + (is (contains? kinds :missing-member) + (str "expected :missing-member, got kinds=" kinds)))) + +;; --------------------------------------------------------------------------- +;; Infinity score parsing +;; --------------------------------------------------------------------------- + (deftest parse-withscores-handles-inf-strings ;; gemini HIGH: Redis returns "inf" / "+inf" / "-inf" for infinite ;; ZSET scores. Double/parseDouble expects "Infinity"; the workload's From 0c0efc4e32910a0be910f6c7889503b841d91f4b Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 03:38:38 +0900 Subject: [PATCH 08/28] fix(jepsen): keep strict score check when concurrent ZINCRBY score is known MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex P1: `allowed-scores-for-member` was marking `:unknown-score?` true whenever ANY uncertain mutation was `:zincrby` — including completed `:ok` ops whose resulting score is fully recovered from the server reply. That over-relaxed the check in `check-zrange-all` / `check-zrangebyscore`, letting a read concurrent with a single known ZINCRBY return an arbitrary impossible score without being flagged. Refine the relaxation rule so it only fires when the score is genuinely unrecoverable: * `:unknown-score?` is set on an individual ZINCRBY only when its own result is unknown (:info/:pending), OR * there are >=2 concurrent/uncertain ZINCRBYs whose relative serialization produces prefix-sum intermediates not present in `:scores`. With a single :ok concurrent ZINCRBY the read can observe either the pre-op score (in `:scores` via candidate committed ops) or the post-op score (also in `:scores`), so the strict score-membership check stays sound. Add two regression tests: * `single-ok-concurrent-zincrby-still-validates-scores` flags a read observing 999.0 while a single :ok ZINCRBY is concurrent. * `two-concurrent-zincrbys-relax-score-check` accepts the intermediate prefix-sum score under multi-ZINCRBY overlap. --- .../elastickv/redis_zset_safety_workload.clj | 22 +++++++--- .../redis_zset_safety_workload_test.clj | 41 +++++++++++++++++++ 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index a65823c4c..2e26cabf7 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -437,12 +437,24 @@ (some #(and (= :zincrby (:f %)) (:unknown-score? %)) coll)) - ;; Uncertain score set when any ZINCRBY is concurrent/uncertain - ;; (its resulting score is unknown), OR when multiple concurrent - ;; increments could yield intermediate prefix-sum scores that - ;; are not in :scores. + ;; Count concurrent/uncertain ZINCRBYs. The resulting score of a + ;; read relative to ZINCRBYs depends on how many of them took + ;; effect before the read observed state. + ;; * 0 uncertain ZINCRBYs: :scores is authoritative. + ;; * 1 uncertain ZINCRBY whose final score is KNOWN (:ok): + ;; the read can observe either the pre-op state (already in + ;; :scores from candidates) or the post-op state (its known + ;; final score, also added to :scores). :scores is complete. + ;; * 1 uncertain ZINCRBY with UNKNOWN score (:info/:pending): + ;; the post-op score is not recoverable from the history. + ;; We must relax the strict score check. + ;; * >=2 uncertain ZINCRBYs: their relative serialization + ;; order produces prefix-sum intermediates that are not in + ;; :scores (e.g. pre + delta1 before pre + delta1 + delta2). + ;; Relax the strict score check. + uncertain-incrs (filter #(= :zincrby (:f %)) uncertain) unknown-score? (or (has-unknown-incr? uncertain) - (some #(= :zincrby (:f %)) uncertain)) + (> (count uncertain-incrs) 1)) any-candidate-write? (some write-op? candidates) any-candidate-zrem? (some #(= :zrem (:f %)) candidates) diff --git a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj index 0ab9b6772..be957a232 100644 --- a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj +++ b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj @@ -75,6 +75,47 @@ result (run-checker history)] (is (not (:valid? result)) (str "expected mismatch, got: " result)))) +(deftest single-ok-concurrent-zincrby-still-validates-scores + ;; Codex P1: :unknown-score? must NOT be set when exactly one + ;; concurrent ZINCRBY is :ok (and therefore has a known resulting + ;; score). The read may observe either the pre-op score or the + ;; post-op score, both of which are in :scores. An arbitrary + ;; impossible score (e.g. 999.0) must still be flagged as a + ;; :score-mismatch, not waved through by `:unknown-score?`. + (let [history [{:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} + {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 1} + {:type :invoke :process 1 :f :zincrby :value ["m1" 5] :index 2} + {:type :invoke :process 0 :f :zrange-all :index 3} + ;; Read observes 999.0 — not 1.0 (pre) or 6.0 (post). + {:type :ok :process 0 :f :zrange-all + :value [["m1" 999.0]] :index 4} + ;; ZINCRBY eventually completes :ok with known score 6. + {:type :ok :process 1 :f :zincrby :value ["m1" 6.0] :index 5}] + result (run-checker history) + kinds (set (map :kind (:first-errors result)))] + (is (not (:valid? result)) + (str "expected score-mismatch to still fire, got: " result)) + (is (contains? kinds :score-mismatch) + (str "expected :score-mismatch, got kinds=" kinds)))) + +(deftest two-concurrent-zincrbys-relax-score-check + ;; Prefix-sum ordering matters: with two concurrent ZINCRBYs, the + ;; intermediate score (pre + one delta) is reachable and need not be + ;; in :scores. The checker must relax the strict score check. + (let [history [{:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} + {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 1} + {:type :invoke :process 1 :f :zincrby :value ["m1" 2] :index 2} + {:type :invoke :process 2 :f :zincrby :value ["m1" 3] :index 3} + {:type :invoke :process 0 :f :zrange-all :index 4} + ;; Intermediate 3.0 = 1 + 2 (before +3 applied). + {:type :ok :process 0 :f :zrange-all + :value [["m1" 3.0]] :index 5} + {:type :ok :process 1 :f :zincrby :value ["m1" 3.0] :index 6} + {:type :ok :process 2 :f :zincrby :value ["m1" 6.0] :index 7}] + result (run-checker history)] + (is (:valid? result) + (str "expected relaxation for >=2 concurrent ZINCRBYs, got: " result)))) + (deftest no-op-zrem-alone-does-not-false-positive ;; CI-observed false positive: a member whose only prior ops are no-op ;; ZREMs was classified as :score-mismatch with :allowed #{} instead From e5dcc349e0d52a79ec9dd05c5254965e37f6d165 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 03:54:09 +0900 Subject: [PATCH 09/28] fix(jepsen): exclude :fail completions from concurrent mutation uncertainty In Jepsen, :fail means the op definitively did NOT take effect. The workload's concurrent-mutations-for-member was including failed mutations in the uncertainty set, which made allowed-scores-for-member / must-be-present? / can-be-present? treat failed ops as possibly-visible state: * a failed concurrent :zrem suppressed a required-presence check * a failed concurrent :zadd contributed a false score to the allowed set Filter the concurrent window by :type so only :ok and :info/:pending mutations contribute uncertainty; :fail is excluded outright. The :type field is already threaded through each mutation record from the invoke/complete pair processing, so no additional plumbing was needed. Regression tests in redis_zset_safety_workload_test.clj: * failed-concurrent-zrem-does-not-relax-must-be-present: a concurrent ZREM that completes :fail no longer suppresses the :missing-member check for a committed member the read omitted. * failed-concurrent-zadd-does-not-contribute-allowed-score: a concurrent ZADD that completes :fail no longer admits its score; observing that score flags :score-mismatch. --- .../elastickv/redis_zset_safety_workload.clj | 10 +++- .../redis_zset_safety_workload_test.clj | 50 +++++++++++++++++++ 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index 2e26cabf7..2e91e5a4f 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -341,9 +341,15 @@ mutations-by-m)) (defn- concurrent-mutations-for-member - "All mutations (ok or info) that are concurrent with the read window." + "All mutations concurrent with the read window that could have taken + effect. :fail completions are excluded: in Jepsen, :fail means the op + definitively did NOT execute, so it contributes neither an allowed + score nor uncertainty about presence. :ok and :info/:pending are + included (either may be visible to the read)." [muts read-inv-idx read-cmp-idx] - (filter #(concurrent? % read-inv-idx read-cmp-idx) muts)) + (filter #(and (not= :fail (:type %)) + (concurrent? % read-inv-idx read-cmp-idx)) + muts)) (defn- write-op? "True iff the mutation adds/updates the member's score (i.e. would diff --git a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj index be957a232..b471ad2f3 100644 --- a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj +++ b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj @@ -333,6 +333,56 @@ (is (contains? kinds :missing-member) (str "expected :missing-member, got kinds=" kinds)))) +;; --------------------------------------------------------------------------- +;; Failed-concurrent mutations must not contribute to uncertainty (codex P1) +;; --------------------------------------------------------------------------- + +(deftest failed-concurrent-zrem-does-not-relax-must-be-present + ;; codex P1: a concurrent ZREM that completes with :fail did NOT take + ;; effect. Its window must NOT make the member's presence uncertain, + ;; so a read that omits the member (which was ZADDed and committed + ;; beforehand) must be flagged as :missing-member. + (let [history [;; ZADD m1 committed before the read. + {:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} + {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 1} + ;; ZREM m1 is invoked concurrently with the read but + ;; ultimately :fails -- the op definitively did NOT run. + {:type :invoke :process 1 :f :zrem :value "m1" :index 2} + {:type :invoke :process 0 :f :zrange-all :index 3} + ;; Read observes m1 ABSENT -- without the fix, the + ;; failed ZREM would admit this as "possibly removed". + {:type :ok :process 0 :f :zrange-all :value [] :index 4} + {:type :fail :process 1 :f :zrem :value "m1" :index 5}] + result (run-checker history) + kinds (set (map :kind (:first-errors result)))] + (is (not (:valid? result)) + (str "expected :missing-member despite failed ZREM, got: " result)) + (is (contains? kinds :missing-member) + (str "expected :missing-member, got kinds=" kinds)))) + +(deftest failed-concurrent-zadd-does-not-contribute-allowed-score + ;; codex P1: a concurrent ZADD that completes with :fail did NOT take + ;; effect. Its score must NOT be added to the allowed set. A read + ;; observing that score must be flagged as :score-mismatch rather than + ;; being waved through by the failed ZADD's ghost contribution. + (let [history [;; ZADD m1 at score 1 committed before the read. + {:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} + {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 1} + ;; Concurrent ZADD m1 at score 42 ultimately :fails. + {:type :invoke :process 1 :f :zadd :value ["m1" 42] :index 2} + {:type :invoke :process 0 :f :zrange-all :index 3} + ;; Read observes score 42 -- only valid if the failed + ;; ZADD is (incorrectly) admitted as a possible write. + {:type :ok :process 0 :f :zrange-all + :value [["m1" 42.0]] :index 4} + {:type :fail :process 1 :f :zadd :value ["m1" 42] :index 5}] + result (run-checker history) + kinds (set (map :kind (:first-errors result)))] + (is (not (:valid? result)) + (str "expected :score-mismatch ignoring failed ZADD, got: " result)) + (is (contains? kinds :score-mismatch) + (str "expected :score-mismatch, got kinds=" kinds)))) + ;; --------------------------------------------------------------------------- ;; Infinity score parsing ;; --------------------------------------------------------------------------- From 02a8adf7f3e4640469d121512e761b631f89b99e Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 04:04:37 +0900 Subject: [PATCH 10/28] fix(jepsen): restrict committed ZINCRBY candidates to linearization-chain tail Codex P1 on #550: allowed-scores-for-member must not admit the return value of an intermediate ZINCRBY as a valid final score once another mutation has strictly followed it and committed before the read. The existing max-invoke real-time "last-wins" filter already enforces this for ZINCRBY as well as ZADD/ZREM (a ZINCRBY whose complete-idx is less than a later committed op's invoke-idx is dropped), but the invariant was not tested or clearly documented. - Clarify the comment on the candidate filter to explain why it covers ZINCRBY chains and ZADD chain-reset semantics. - Add regression coverage for the P1 scenarios: * chained-committed-zincrby-rejects-stale-intermediate * chained-committed-zincrby-accepts-latest * concurrent-zincrby-both-admissible * zadd-resets-zincrby-chain --- .../elastickv/redis_zset_safety_workload.clj | 26 ++++- .../redis_zset_safety_workload_test.clj | 104 ++++++++++++++++++ 2 files changed, 124 insertions(+), 6 deletions(-) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index 2e91e5a4f..9d96e0f94 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -407,12 +407,26 @@ (filter #(and (= :ok (:type %)) (some? (:complete-idx %)) (< (:complete-idx %) read-inv-idx)))) - ;; Real-time "last-wins" candidate filter: a preceding mutation - ;; m is admissible iff no OTHER preceding mutation m' has - ;; m'.invoke-idx > m.complete-idx (i.e. m' strictly follows m). - ;; Equivalent: m.complete-idx >= max(invoke-idx) over preceding. - ;; When multiple candidates remain, they have overlapping - ;; windows and may serialize in any real-time-consistent order. + ;; Real-time "last-wins" / chain-tail candidate filter: a + ;; preceding mutation m is admissible iff no OTHER preceding + ;; mutation m' has m'.invoke-idx > m.complete-idx (i.e. m' + ;; strictly follows m in real time). Equivalent: + ;; m.complete-idx >= max(invoke-idx) over preceding. + ;; + ;; Importantly this applies to :zincrby as well: a sequentially + ;; committed ZINCRBY chain has a forced linearization (each + ;; :ok :zincrby pins the pre-op and post-op states), so only + ;; the latest chain tail's return value is a valid final score + ;; for a post-chain read. An intermediate ZINCRBY's return + ;; value is NOT admissible once another mutation strictly + ;; follows it and commits before the read. A ZADD that strictly + ;; follows a ZINCRBY likewise resets the chain (the ZADD's + ;; absolute score becomes the only candidate). + ;; + ;; When multiple candidates remain (their invoke/complete + ;; windows overlap), they may serialize in any real-time- + ;; consistent order and any of their return values is a valid + ;; final state. max-inv (reduce max -1 (map :invoke-idx preceding)) candidates (filterv #(>= (:complete-idx %) max-inv) preceding) ;; :info mutations that completed before the read: they may or diff --git a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj index b471ad2f3..38a0d8952 100644 --- a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj +++ b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj @@ -383,6 +383,110 @@ (is (contains? kinds :score-mismatch) (str "expected :score-mismatch, got kinds=" kinds)))) +;; --------------------------------------------------------------------------- +;; Chained committed ZINCRBYs: only the linearization-chain tail is a +;; valid final score. Earlier intermediate return values are stale. (codex P1) +;; --------------------------------------------------------------------------- + +(deftest chained-committed-zincrby-rejects-stale-intermediate + ;; codex P1: sequential committed ZINCRBYs form a forced linearization + ;; chain. The first ZINCRBY's return value is an intermediate that no + ;; post-chain read may observe. Expect :score-mismatch on the stale + ;; intermediate. + (let [history [;; Start with score 1. + {:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} + {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 1} + ;; ZINCRBY +2 -> ok=3 (committed). + {:type :invoke :process 0 :f :zincrby :value ["m1" 2] :index 2} + {:type :ok :process 0 :f :zincrby :value ["m1" 3.0] :index 3} + ;; ZINCRBY +3 -> ok=6 (committed). Strictly follows the + ;; previous ZINCRBY in real time (invoke 4 > complete 3). + {:type :invoke :process 0 :f :zincrby :value ["m1" 3] :index 4} + {:type :ok :process 0 :f :zincrby :value ["m1" 6.0] :index 5} + ;; Read AFTER the whole chain observes the stale + ;; intermediate 3.0 -- not admissible under any + ;; linearization. + {:type :invoke :process 1 :f :zrange-all :index 6} + {:type :ok :process 1 :f :zrange-all + :value [["m1" 3.0]] :index 7}] + result (run-checker history) + kinds (set (map :kind (:first-errors result)))] + (is (not (:valid? result)) + (str "expected stale-intermediate to be flagged, got: " result)) + (is (contains? kinds :score-mismatch) + (str "expected :score-mismatch, got kinds=" kinds)))) + +(deftest chained-committed-zincrby-accepts-latest + ;; codex P1: same history but the read observes the LATEST chain tail + ;; (6.0) -- accept as valid. + (let [history [{:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} + {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 1} + {:type :invoke :process 0 :f :zincrby :value ["m1" 2] :index 2} + {:type :ok :process 0 :f :zincrby :value ["m1" 3.0] :index 3} + {:type :invoke :process 0 :f :zincrby :value ["m1" 3] :index 4} + {:type :ok :process 0 :f :zincrby :value ["m1" 6.0] :index 5} + {:type :invoke :process 1 :f :zrange-all :index 6} + {:type :ok :process 1 :f :zrange-all + :value [["m1" 6.0]] :index 7}] + result (run-checker history)] + (is (:valid? result) + (str "expected chain-tail score to be accepted, got: " result)))) + +(deftest concurrent-zincrby-both-admissible + ;; codex P1: two overlapping-in-real-time ZINCRBYs whose returned + ;; scores are BOTH candidate final states under some linearization. + ;; Read observing either value must be accepted. + ;; Overlap: A=[inv=2, cmp=5], B=[inv=3, cmp=4]. + (let [base [{:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} + {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 1} + {:type :invoke :process 1 :f :zincrby :value ["m1" 2] :index 2} + {:type :invoke :process 2 :f :zincrby :value ["m1" 3] :index 3} + ;; B completes first with ok=4 (delta applied to score 1). + {:type :ok :process 2 :f :zincrby :value ["m1" 4.0] :index 4} + ;; A completes with ok=6 (delta applied after B). + {:type :ok :process 1 :f :zincrby :value ["m1" 6.0] :index 5}] + read-a (conj base + {:type :invoke :process 3 :f :zrange-all :index 6} + {:type :ok :process 3 :f :zrange-all + :value [["m1" 4.0]] :index 7}) + read-b (conj base + {:type :invoke :process 3 :f :zrange-all :index 6} + {:type :ok :process 3 :f :zrange-all + :value [["m1" 6.0]] :index 7})] + (is (:valid? (run-checker read-a)) + "expected B's return value (4.0) admissible under overlap") + (is (:valid? (run-checker read-b)) + "expected A's return value (6.0) admissible under overlap"))) + +(deftest zadd-resets-zincrby-chain + ;; codex P1: a committed ZADD between ZINCRBYs resets the chain -- + ;; subsequent ZINCRBYs operate on the new ZADD'd value. The pre-reset + ;; ZINCRBY score is NOT a valid read after the chain completes. + (let [base [;; ZADD m1 1 + {:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} + {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 1} + ;; ZINCRBY +2 -> 3 + {:type :invoke :process 0 :f :zincrby :value ["m1" 2] :index 2} + {:type :ok :process 0 :f :zincrby :value ["m1" 3.0] :index 3} + ;; ZADD m1 10 -- chain reset to absolute value. + {:type :invoke :process 0 :f :zadd :value ["m1" 10] :index 4} + {:type :ok :process 0 :f :zadd :value ["m1" 10] :index 5} + ;; ZINCRBY +1 -> 11 + {:type :invoke :process 0 :f :zincrby :value ["m1" 1] :index 6} + {:type :ok :process 0 :f :zincrby :value ["m1" 11.0] :index 7}] + read-ok (conj base + {:type :invoke :process 1 :f :zrange-all :index 8} + {:type :ok :process 1 :f :zrange-all + :value [["m1" 11.0]] :index 9}) + read-bad (conj base + {:type :invoke :process 1 :f :zrange-all :index 8} + {:type :ok :process 1 :f :zrange-all + :value [["m1" 3.0]] :index 9})] + (is (:valid? (run-checker read-ok)) + "expected post-reset chain tail (11.0) to be accepted") + (is (not (:valid? (run-checker read-bad))) + "expected pre-reset intermediate (3.0) to be flagged"))) + ;; --------------------------------------------------------------------------- ;; Infinity score parsing ;; --------------------------------------------------------------------------- From da8556033917811b16540571bb13e46064a83f39 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 18:55:15 +0900 Subject: [PATCH 11/28] fix(jepsen): restrict unknown-score? to :info zincrby, not any concurrent multi The prior rule flipped `unknown-score?` to true whenever 2+ concurrent ZINCRBYs overlapped a read, even when all of them had :ok completions with known return values. That silently disabled the strict score check for the read, producing false negatives (e.g. an impossible score 999 after `+2 -> 3` and `+3 -> 6` ok completions was accepted). Refine the gate: `unknown-score?` is true iff at least one uncertain zincrby has an unknown post-op score (:info/:pending). When every uncertain zincrby is :ok with a recorded return value, those return values pin the linearization and the admissible score set built from candidates + uncertain ok scores is already complete. The max-inv chain-tail filter added in 02a8adf7 handles the preceding chain; this change lets the strict score check run for fully-committed concurrent windows. Add three tests exercising the scenarios: - two-ok-concurrent-zincrbys-reject-impossible-score - two-ok-concurrent-zincrbys-accept-known-chain-tail - info-plus-ok-concurrent-zincrby-stays-unknown --- .../elastickv/redis_zset_safety_workload.clj | 41 +++++----- .../redis_zset_safety_workload_test.clj | 79 +++++++++++++++++++ 2 files changed, 102 insertions(+), 18 deletions(-) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index 9d96e0f94..f2daad948 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -457,24 +457,29 @@ (some #(and (= :zincrby (:f %)) (:unknown-score? %)) coll)) - ;; Count concurrent/uncertain ZINCRBYs. The resulting score of a - ;; read relative to ZINCRBYs depends on how many of them took - ;; effect before the read observed state. - ;; * 0 uncertain ZINCRBYs: :scores is authoritative. - ;; * 1 uncertain ZINCRBY whose final score is KNOWN (:ok): - ;; the read can observe either the pre-op state (already in - ;; :scores from candidates) or the post-op state (its known - ;; final score, also added to :scores). :scores is complete. - ;; * 1 uncertain ZINCRBY with UNKNOWN score (:info/:pending): - ;; the post-op score is not recoverable from the history. - ;; We must relax the strict score check. - ;; * >=2 uncertain ZINCRBYs: their relative serialization - ;; order produces prefix-sum intermediates that are not in - ;; :scores (e.g. pre + delta1 before pre + delta1 + delta2). - ;; Relax the strict score check. - uncertain-incrs (filter #(= :zincrby (:f %)) uncertain) - unknown-score? (or (has-unknown-incr? uncertain) - (> (count uncertain-incrs) 1)) + ;; Classify uncertain ZINCRBYs by whether their resulting score + ;; is known. The resulting score of a read relative to ZINCRBYs + ;; depends only on which of them took effect before the read + ;; observed state AND whether each such ZINCRBY's return value + ;; is recorded. + ;; * Any uncertain ZINCRBY with UNKNOWN score (:info/:pending): + ;; the post-op score is not recoverable from the history, so + ;; we must relax the strict score check -- any numeric score + ;; is admissible. + ;; * All uncertain ZINCRBYs :ok with known return values: + ;; every recorded return pins the ZINCRBY's post-op state + ;; and (because ZINCRBY reads-then-writes atomically) its + ;; pre-op state. Any real-time consistent linearization + ;; therefore ends on one of those known return values (or + ;; on a candidate's score). :scores already contains all + ;; of them via the add-scores reduction over `uncertain`, + ;; so the strict score check is sound. Intermediate + ;; "prefix-sum" values (pre + delta_i for just one of + ;; several concurrent zincrbys) are NOT admissible final + ;; states: the return values constrain the serialization + ;; order, and no legitimate read can observe a partial sum + ;; that doesn't match any recorded post-op score. + unknown-score? (has-unknown-incr? uncertain) any-candidate-write? (some write-op? candidates) any-candidate-zrem? (some #(= :zrem (:f %)) candidates) diff --git a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj index 38a0d8952..44c3bd668 100644 --- a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj +++ b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj @@ -487,6 +487,85 @@ (is (not (:valid? (run-checker read-bad))) "expected pre-reset intermediate (3.0) to be flagged"))) +;; --------------------------------------------------------------------------- +;; unknown-score? gate: restricted to :info ZINCRBYs only. Two concurrent +;; :ok ZINCRBYs with known return values do NOT make the score check +;; unknown -- their return values pin the linearization and the +;; admissible score set is constrained by :scores (candidates + uncertain +;; ok return values). (codex P1) +;; --------------------------------------------------------------------------- + +(deftest two-ok-concurrent-zincrbys-reject-impossible-score + ;; codex P1: two overlapping :ok ZINCRBYs with known return values + ;; (3 and 6) constrain the admissible post-chain read set to {1,3,6}. + ;; A read of 999 is impossible under any linearization; the checker + ;; must flag it as :score-mismatch (no longer swallowed by the old + ;; "2+ uncertain zincrbys -> unknown-score?" shortcut). + (let [history [{:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} + {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 1} + ;; Two concurrent ZINCRBYs. Windows overlap the read. + {:type :invoke :process 1 :f :zincrby :value ["m1" 2] :index 2} + {:type :invoke :process 2 :f :zincrby :value ["m1" 3] :index 3} + {:type :ok :process 1 :f :zincrby :value ["m1" 3.0] :index 4} + {:type :ok :process 2 :f :zincrby :value ["m1" 6.0] :index 5} + ;; Read observes an impossible score. + {:type :invoke :process 3 :f :zrange-all :index 6} + {:type :ok :process 3 :f :zrange-all + :value [["m1" 999.0]] :index 7}] + result (run-checker history) + kinds (set (map :kind (:first-errors result)))] + (is (not (:valid? result)) + (str "expected impossible score to be flagged, got: " result)) + (is (contains? kinds :score-mismatch) + (str "expected :score-mismatch, got kinds=" kinds)))) + +(deftest two-ok-concurrent-zincrbys-accept-known-chain-tail + ;; codex P1: same concurrent :ok ZINCRBY history, but the read + ;; observes one of the recorded return values. Both 3.0 (linearization + ;; where +3 ran first, then +2) and 6.0 (the other order) must be + ;; accepted as valid. + (let [base [{:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} + {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 1} + {:type :invoke :process 1 :f :zincrby :value ["m1" 2] :index 2} + {:type :invoke :process 2 :f :zincrby :value ["m1" 3] :index 3} + {:type :ok :process 1 :f :zincrby :value ["m1" 3.0] :index 4} + {:type :ok :process 2 :f :zincrby :value ["m1" 6.0] :index 5}] + read-6 (conj base + {:type :invoke :process 3 :f :zrange-all :index 6} + {:type :ok :process 3 :f :zrange-all + :value [["m1" 6.0]] :index 7}) + read-3 (conj base + {:type :invoke :process 3 :f :zrange-all :index 6} + {:type :ok :process 3 :f :zrange-all + :value [["m1" 3.0]] :index 7})] + (is (:valid? (run-checker read-6)) + "expected 6.0 (one linearization) to be accepted") + (is (:valid? (run-checker read-3)) + "expected 3.0 (other linearization) to be accepted"))) + +(deftest info-plus-ok-concurrent-zincrby-stays-unknown + ;; codex P1: when at least one concurrent ZINCRBY is :info (unknown + ;; post-op score), the strict score check must be relaxed regardless + ;; of how many other :ok ZINCRBYs are concurrent. Any numeric score + ;; must be accepted for this read. + (let [history [{:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} + {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 1} + ;; One :info ZINCRBY (unknown outcome). + {:type :invoke :process 1 :f :zincrby :value ["m1" 2] :index 2} + ;; One :ok ZINCRBY with known return value. + {:type :invoke :process 2 :f :zincrby :value ["m1" 3] :index 3} + {:type :ok :process 2 :f :zincrby :value ["m1" 4.0] :index 4} + {:type :info :process 1 :f :zincrby :value ["m1" 2] + :error "conn reset" :index 5} + ;; Read observes an "arbitrary" score -- admissible + ;; because the :info ZINCRBY could have produced any + ;; post-op state visible to the read. + {:type :invoke :process 3 :f :zrange-all :index 6} + {:type :ok :process 3 :f :zrange-all + :value [["m1" 42.0]] :index 7}]] + (is (:valid? (run-checker history)) + "expected any score accepted when :info ZINCRBY is concurrent"))) + ;; --------------------------------------------------------------------------- ;; Infinity score parsing ;; --------------------------------------------------------------------------- From 502f64a346738ffa2e86f18b0955e8d70daf5c96 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 19:05:22 +0900 Subject: [PATCH 12/28] fix(jepsen): guard setup! and ZINCRBY response parsing against nil/missing - setup! now loudly warns (rather than silently proceeding) when open! failed to populate :conn-spec, so repeated failures can't mask stale data carrying over between runs. Catches Throwable on the DEL so a transiently unreachable Redis doesn't abort the whole run. - ZINCRBY response goes through an explicit coerce-zincrby-score: nil / Throwable / non-string/non-number replies become :info with a structured :error instead of throwing NumberFormatException out of (str nil) -> "nil" -> parse-double-safe. - Introduces a private zincrby! fn indirection so tests can stub the Redis call without going through the car/wcar macro. - Adds tests for missing :conn-spec, unreachable Redis, and nil / unexpected / numeric ZINCRBY replies. --- .../elastickv/redis_zset_safety_workload.clj | 78 +++++++++++++++++-- .../redis_zset_safety_workload_test.clj | 66 ++++++++++++++++ 2 files changed, 137 insertions(+), 7 deletions(-) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index f2daad948..c1487d115 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -86,6 +86,41 @@ :else (Double/parseDouble raw)))) +(defn- coerce-zincrby-score + "Carmine's ZINCRBY reply is normally a score string, but under error / + timeout / protocol edge cases it may be nil, a numeric value, or + something else entirely. Stringifying nil produces \"nil\", which + parse-double-safe would then hand to Double/parseDouble and throw. + Explicitly classify the response so the invoke! path can record + :unknown-response as :info instead of masking it in a catch-all. + + Returns one of: + [:ok (double score)] + [:nil] ; nil response + [:error ] ; Carmine error reply + [:unexpected ] ; anything else" + [response] + (cond + (nil? response) + [:nil] + + (number? response) + [:ok (double response)] + + (string? response) + (try + [:ok (parse-double-safe response)] + (catch NumberFormatException _ + [:unexpected response])) + + ;; Carmine surfaces Redis error replies as exceptions by default, + ;; but some codepaths wrap them in an ex-info / Throwable value. + (instance? Throwable response) + [:error (.getMessage ^Throwable response)] + + :else + [:unexpected response])) + (defn- parse-withscores "Carmine returns a flat [member score member score ...] vector for ZRANGE WITHSCORES. Convert to a sorted vector of [member (double score)] @@ -97,6 +132,13 @@ [(if (bytes? m) (String. ^bytes m) (str m)) (parse-double-safe s)])))) +(defn- zincrby! + "Executes a ZINCRBY against conn-spec and returns Carmine's raw reply + (normally a score string). Extracted so tests can stub the Redis call + without going through the `car/wcar` macro." + [conn-spec key delta member] + (car/wcar conn-spec (car/zincrby key (double delta) member))) + (defrecord ElastickvRedisZSetSafetyClient [node->port conn-spec] client/Client @@ -110,10 +152,20 @@ (close! [this _test] this) (setup! [this _test] - (when-let [cs (:conn-spec this)] - (try (car/wcar cs (car/del zset-key)) - (catch Exception e - (warn "ZSet safety setup DEL failed:" (.getMessage e))))) + (if-let [cs (:conn-spec this)] + (try + (car/wcar cs (car/del zset-key)) + (catch Throwable t + ;; Do NOT swallow silently: repeated setup! failures across + ;; runs would leave stale data under zset-key and could + ;; produce false-positive safety failures. Log loudly so + ;; operators notice. + (warn "ZSet safety setup! DEL failed -- stale data may survive" + "into this run:" (.getMessage t)))) + ;; open! failed to populate :conn-spec (e.g. unresolvable host); + ;; flag it rather than silently proceeding with a no-op setup. + (warn "ZSet safety setup! skipped: missing :conn-spec on client;" + "prior state under" zset-key "may survive into this run")) this) (teardown! [this _test] this) @@ -129,9 +181,21 @@ :zincrby (let [[member delta] (:value op) - new-score (car/wcar cs (car/zincrby zset-key (double delta) member))] - (assoc op :type :ok - :value [member (parse-double-safe new-score)])) + new-score (zincrby! cs zset-key delta member) + [tag v] (coerce-zincrby-score new-score)] + (case tag + :ok (assoc op :type :ok :value [member v]) + :nil (do (warn "ZSet safety ZINCRBY returned nil for" member) + (assoc op :type :info + :error :nil-response)) + :error (do (warn "ZSet safety ZINCRBY returned error reply:" v) + (assoc op :type :info + :error {:kind :error-response + :message v})) + :unexpected (do (warn "ZSet safety ZINCRBY returned unexpected reply:" (pr-str v)) + (assoc op :type :info + :error {:kind :unexpected-response + :value (pr-str v)})))) :zrem (let [member (:value op) diff --git a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj index 44c3bd668..9bcb0dc4d 100644 --- a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj +++ b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj @@ -3,6 +3,7 @@ the model-based checker's edge cases (no-op ZREM, :info ZINCRBY)." (:require [clojure.test :refer :all] [jepsen.checker :as checker] + [jepsen.client :as client] [elastickv.redis-zset-safety-workload :as workload])) ;; --------------------------------------------------------------------------- @@ -570,6 +571,71 @@ ;; Infinity score parsing ;; --------------------------------------------------------------------------- +;; --------------------------------------------------------------------------- +;; Client setup! / invoke! robustness (gemini MEDIUM) +;; --------------------------------------------------------------------------- + +(deftest setup-bang-tolerates-missing-conn-spec + ;; gemini MEDIUM: if open! failed to populate :conn-spec (unresolvable + ;; host, etc.), setup! must NOT throw. Otherwise stale data from a + ;; prior run stays under zset-key and produces false-positive safety + ;; failures. The fix logs the skip loudly but returns normally. + (let [client (workload/->ElastickvRedisZSetSafetyClient {} nil)] + (is (= client (client/setup! client {})) + "setup! must return the client (not throw) when :conn-spec is nil"))) + +(deftest setup-bang-tolerates-unreachable-redis + ;; gemini MEDIUM: a Redis that can't be reached must surface as a + ;; logged warn, not an uncaught throw that aborts the whole run. The + ;; fix catches Throwable in setup!. + (let [client (workload/->ElastickvRedisZSetSafetyClient + {} {:pool {} :spec {:host "127.0.0.1" + :port 1 ; guaranteed unreachable + :timeout-ms 100}})] + (is (= client (client/setup! client {})) + "setup! must swallow connection errors and keep the run going"))) + +(deftest zincrby-invoke-handles-nil-response + ;; gemini MEDIUM: if car/wcar for ZINCRBY returns nil (error reply + ;; coerced, unexpected protocol edge), the op must complete as :info + ;; with a structured :error, not throw NumberFormatException from + ;; parse-double-safe swallowing (str nil) -> "nil". + (let [client (workload/->ElastickvRedisZSetSafetyClient + {} {:pool {} :spec {:host "localhost" :port 6379 + :timeout-ms 100}}) + op {:type :invoke :f :zincrby :value ["m1" 1.0] :process 0 :index 0}] + (with-redefs [workload/zincrby! (fn [& _] nil)] + (let [result (client/invoke! client {} op)] + (is (= :info (:type result)) + (str "expected :info on nil ZINCRBY reply, got: " result)) + (is (some? (:error result)) + (str "expected :error to be populated, got: " result)))))) + +(deftest zincrby-invoke-handles-unexpected-response + ;; gemini MEDIUM: same guard, but for a non-string / non-number reply. + ;; Must complete :info rather than propagate a parse failure. + (let [client (workload/->ElastickvRedisZSetSafetyClient + {} {:pool {} :spec {:host "localhost" :port 6379 + :timeout-ms 100}}) + op {:type :invoke :f :zincrby :value ["m1" 1.0] :process 0 :index 0}] + (with-redefs [workload/zincrby! (fn [& _] {:unexpected :map})] + (let [result (client/invoke! client {} op)] + (is (= :info (:type result)) + (str "expected :info on unexpected ZINCRBY reply, got: " result)))))) + +(deftest zincrby-invoke-accepts-numeric-response + ;; Sanity: some Carmine versions coerce integer scores to longs. + ;; Must parse cleanly to a Double and complete :ok. + (let [client (workload/->ElastickvRedisZSetSafetyClient + {} {:pool {} :spec {:host "localhost" :port 6379 + :timeout-ms 100}}) + op {:type :invoke :f :zincrby :value ["m1" 1.0] :process 0 :index 0}] + (with-redefs [workload/zincrby! (fn [& _] 7)] + (let [result (client/invoke! client {} op)] + (is (= :ok (:type result)) + (str "expected :ok on numeric reply, got: " result)) + (is (= ["m1" 7.0] (:value result))))))) + (deftest parse-withscores-handles-inf-strings ;; gemini HIGH: Redis returns "inf" / "+inf" / "-inf" for infinite ;; ZSET scores. Double/parseDouble expects "Infinity"; the workload's From 33d59c576424af30c59bac32ea972cfadea6b8b6 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 19:15:09 +0900 Subject: [PATCH 13/28] fix(jepsen): restrict can-be-present? existence evidence to ZADD/ZINCRBY ZREM -- even :ok with :removed? true, and especially :info where :removed? is defaulted to true only for uncertainty accounting -- must never contribute existence evidence to can-be-present?. Since setup! clears the key at test start, every legitimately observed member must trace back to a successful (or in-flight) ZADD / ZINCRBY. Without this, a phantom member touched only by an :info ZREM could slip past the :unexpected-presence check. Drop the existence-evidence? helper and simplify can-be-present? to a disjunction over ZADD/ZINCRBY sources (candidate writes + uncertain writes). ZREM only ever contributes to must-not-be-present. Add regression test phantom-from-info-zrem-still-flagged. --- .../elastickv/redis_zset_safety_workload.clj | 25 ++++++----------- .../redis_zset_safety_workload_test.clj | 28 +++++++++++++++++-- 2 files changed, 35 insertions(+), 18 deletions(-) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index c1487d115..caa9eb3fd 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -421,16 +421,6 @@ [m] (#{:zadd :zincrby} (:f m))) -(defn- existence-evidence? - "A mutation proves that the member existed at some point iff it is a - write-op, or a ZREM whose :removed? flag is true. No-op ZREMs - (:removed? false) do NOT prove existence." - [m] - (case (:f m) - (:zadd :zincrby) true - :zrem (boolean (:removed? m)) - false)) - (defn- allowed-scores-for-member "Compute the set of scores considered valid for `member` by a read whose window is [read-inv-idx, read-cmp-idx], based on committed state @@ -561,13 +551,16 @@ ;; can-be-present?: at least one admissible linearization ;; (candidates + uncertain) ends with the member present. - ;; An uncertain write (or an uncertain :zrem combined with - ;; existence evidence) can flip an otherwise-absent candidate - ;; outcome to present by reordering after a write. + ;; Presence REQUIRES a write-op (ZADD / ZINCRBY) somewhere in + ;; the admissible set -- either a candidate committed write or + ;; an uncertain concurrent/pre-read :info write. ZREM never + ;; contributes existence evidence: since `setup!` clears the + ;; key at test start, an observed member that never had a ZADD + ;; or ZINCRBY touch it must be a phantom regardless of any + ;; ZREM's :removed? flag (which may be defaulted to true on + ;; :info for uncertainty accounting only). can-be-present? (or candidate-can-be-present? - any-uncertain-write? - (and any-uncertain-zrem? - (some existence-evidence? uncertain))) + any-uncertain-write?) ;; must-be-present?: EVERY admissible linearization ends with ;; the member present. Requires the candidate outcome to be diff --git a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj index 9bcb0dc4d..d52a3094a 100644 --- a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj +++ b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj @@ -121,8 +121,7 @@ ;; CI-observed false positive: a member whose only prior ops are no-op ;; ZREMs was classified as :score-mismatch with :allowed #{} instead ;; of treated as never-existed (:phantom candidate, empty read -> OK). - ;; After the existence-evidence? fix, a read that observes NO such - ;; member must be accepted as valid. + ;; A read that observes NO such member must be accepted as valid. (let [history [{:type :invoke :process 0 :f :zrem :value "never-added" :index 0} {:type :invoke :process 1 :f :zrange-all :index 1} {:type :ok :process 1 :f :zrange-all :value [] :index 2} @@ -199,6 +198,31 @@ (is (contains? kinds :unexpected-presence) (str "expected :unexpected-presence, got kinds=" kinds)))) +(deftest phantom-from-info-zrem-still-flagged + ;; gemini HIGH (round 2): an :info ZREM is the ONLY history contact + ;; with a member (no ZADD/ZINCRBY ever). Because completed-mutation- + ;; window defaults :removed? to true on :info ZREMs (for uncertainty + ;; accounting), the checker must NOT treat ZREM as proof the member + ;; ever existed. A read observing the member present must be flagged + ;; as :unexpected-presence. Since setup! clears the key at test + ;; start, every observed member must trace back to a successful (or + ;; in-flight) ZADD/ZINCRBY -- never to a ZREM. + (let [history [;; ZREM of a member that was never added. Invoked + ;; concurrently with the read, response eventually + ;; lost (:info). No ZADD/ZINCRBY anywhere in history. + {:type :invoke :process 0 :f :zrem :value "phantom" :index 0} + {:type :invoke :process 1 :f :zrange-all :index 1} + ;; Read observes the phantom present at some score. + {:type :ok :process 1 :f :zrange-all + :value [["phantom" 7.0]] :index 2} + {:type :info :process 0 :f :zrem :value "phantom" :index 3}] + result (run-checker history) + kinds (set (map :kind (:first-errors result)))] + (is (not (:valid? result)) + (str "expected :unexpected-presence for phantom, got: " result)) + (is (contains? kinds :unexpected-presence) + (str "expected :unexpected-presence, got kinds=" kinds)))) + (deftest stale-read-after-committed-zrem-is-flagged ;; gemini HIGH: once a ZADD and a subsequent real (:removed? true) ZREM ;; have BOTH committed (with no concurrent re-add), a later read that From 9535ff3b4044a756cb1c4d22a6ca1a354eb74c02 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 19:15:54 +0900 Subject: [PATCH 14/28] fix(jepsen): correct clojure.tools.logging/warn call style in zset workload clojure.tools.logging/warn takes (warn msg) or (warn throwable msg) -- NOT multiple string args. Passing additional arguments positionally silently drops all but the first string on some backends. - setup! DEL failure: pass throwable first, concatenate the message. - setup! missing :conn-spec: concatenate into a single message. - ZINCRBY :nil / :error / :unexpected: concatenate into single message. - invoke! catch: pass throwable first, concatenate the :f context. No behavioral change beyond log output; existing tests still pass. --- .../elastickv/redis_zset_safety_workload.clj | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index caa9eb3fd..15dd38e15 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -159,13 +159,13 @@ ;; Do NOT swallow silently: repeated setup! failures across ;; runs would leave stale data under zset-key and could ;; produce false-positive safety failures. Log loudly so - ;; operators notice. - (warn "ZSet safety setup! DEL failed -- stale data may survive" - "into this run:" (.getMessage t)))) + ;; operators notice. clojure.tools.logging/warn expects + ;; (warn msg) or (warn throwable msg) -- NOT multiple strings. + (warn t "ZSet safety setup! DEL failed -- stale data may survive into this run"))) ;; open! failed to populate :conn-spec (e.g. unresolvable host); ;; flag it rather than silently proceeding with a no-op setup. - (warn "ZSet safety setup! skipped: missing :conn-spec on client;" - "prior state under" zset-key "may survive into this run")) + (warn (str "ZSet safety setup! skipped: missing :conn-spec on client;" + " prior state under " zset-key " may survive into this run"))) this) (teardown! [this _test] this) @@ -185,14 +185,14 @@ [tag v] (coerce-zincrby-score new-score)] (case tag :ok (assoc op :type :ok :value [member v]) - :nil (do (warn "ZSet safety ZINCRBY returned nil for" member) + :nil (do (warn (str "ZSet safety ZINCRBY returned nil for " member)) (assoc op :type :info :error :nil-response)) - :error (do (warn "ZSet safety ZINCRBY returned error reply:" v) + :error (do (warn (str "ZSet safety ZINCRBY returned error reply: " v)) (assoc op :type :info :error {:kind :error-response :message v})) - :unexpected (do (warn "ZSet safety ZINCRBY returned unexpected reply:" (pr-str v)) + :unexpected (do (warn (str "ZSet safety ZINCRBY returned unexpected reply: " (pr-str v))) (assoc op :type :info :error {:kind :unexpected-response :value (pr-str v)})))) @@ -215,7 +215,7 @@ (assoc op :type :ok :value {:bounds [lo hi] :members (parse-withscores flat)}))) (catch Exception e - (warn "ZSet safety op failed:" (:f op) (.getMessage e)) + (warn e (str "ZSet safety op failed: " (:f op))) (assoc op :type :info :error (.getMessage e))))))) ;; --------------------------------------------------------------------------- From fde116c7574cafede231dd83f321460aa64c4eed Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 19:16:03 +0900 Subject: [PATCH 15/28] fix(jepsen): decode Redis ZSET member bytes as UTF-8 explicitly String.(^bytes m) uses the JVM default charset, which is not guaranteed to be UTF-8 across runtimes. Redis stores member names as arbitrary bytes; the workload only ever produces UTF-8 member strings, so decode with "UTF-8" explicitly to stay stable regardless of JVM defaults. --- jepsen/src/elastickv/redis_zset_safety_workload.clj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index 15dd38e15..110449bc6 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -129,7 +129,7 @@ (->> flat (partition 2) (mapv (fn [[m s]] - [(if (bytes? m) (String. ^bytes m) (str m)) + [(if (bytes? m) (String. ^bytes m "UTF-8") (str m)) (parse-double-safe s)])))) (defn- zincrby! From 0c948a2b54019124fdc46d1e1e9c1fc60d19e953 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 19:31:05 +0900 Subject: [PATCH 16/28] fix(jepsen-zset): hard-fail setup! when :conn-spec is missing gemini HIGH (PR #550): if open! fails to populate :conn-spec (e.g. unresolvable host), the previous "loud warn and proceed" behavior let the run continue against whatever stale ZSet state survived from an earlier run, yielding false-positive checker verdicts. Throw an ex-info instead so Jepsen aborts and the configuration problem surfaces immediately. Update the matching unit test: setup! is now required to throw on a nil :conn-spec rather than return the client. --- .../elastickv/redis_zset_safety_workload.clj | 20 +++++++++++++------ .../redis_zset_safety_workload_test.clj | 17 +++++++++------- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index 110449bc6..29fd882e8 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -152,7 +152,19 @@ (close! [this _test] this) (setup! [this _test] - (if-let [cs (:conn-spec this)] + ;; Hard-fail when :conn-spec is missing after open!. Silently (or + ;; even loudly) proceeding would leave stale data from a previous + ;; run under zset-key and risk false-positive checker results from + ;; that dirty state. Better to abort the run and surface the + ;; configuration problem. + (let [cs (or (:conn-spec this) + (throw (ex-info + (str "ZSet safety setup! cannot clear prior state:" + " :conn-spec is missing on client (open! did" + " not populate it). Aborting to avoid running" + " against stale data under " zset-key ".") + {:type ::missing-conn-spec + :zset-key zset-key})))] (try (car/wcar cs (car/del zset-key)) (catch Throwable t @@ -161,11 +173,7 @@ ;; produce false-positive safety failures. Log loudly so ;; operators notice. clojure.tools.logging/warn expects ;; (warn msg) or (warn throwable msg) -- NOT multiple strings. - (warn t "ZSet safety setup! DEL failed -- stale data may survive into this run"))) - ;; open! failed to populate :conn-spec (e.g. unresolvable host); - ;; flag it rather than silently proceeding with a no-op setup. - (warn (str "ZSet safety setup! skipped: missing :conn-spec on client;" - " prior state under " zset-key " may survive into this run"))) + (warn t "ZSet safety setup! DEL failed -- stale data may survive into this run")))) this) (teardown! [this _test] this) diff --git a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj index d52a3094a..440231fd6 100644 --- a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj +++ b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj @@ -599,14 +599,17 @@ ;; Client setup! / invoke! robustness (gemini MEDIUM) ;; --------------------------------------------------------------------------- -(deftest setup-bang-tolerates-missing-conn-spec - ;; gemini MEDIUM: if open! failed to populate :conn-spec (unresolvable - ;; host, etc.), setup! must NOT throw. Otherwise stale data from a - ;; prior run stays under zset-key and produces false-positive safety - ;; failures. The fix logs the skip loudly but returns normally. +(deftest setup-bang-hard-fails-when-conn-spec-missing + ;; gemini HIGH: if open! failed to populate :conn-spec (unresolvable + ;; host, etc.), setup! MUST throw rather than silently proceed. + ;; Continuing with a no-op setup would leave stale data from a prior + ;; run under zset-key and risk false-positive checker verdicts from + ;; that dirty state. We want Jepsen to surface the failure. (let [client (workload/->ElastickvRedisZSetSafetyClient {} nil)] - (is (= client (client/setup! client {})) - "setup! must return the client (not throw) when :conn-spec is nil"))) + (is (thrown-with-msg? clojure.lang.ExceptionInfo + #":conn-spec is missing" + (client/setup! client {})) + "setup! must throw ex-info when :conn-spec is nil"))) (deftest setup-bang-tolerates-unreachable-redis ;; gemini MEDIUM: a Redis that can't be reached must surface as a From 84989f1885e0f2b8d7740fe833df9642475267bf Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 19:31:21 +0900 Subject: [PATCH 17/28] fix(jepsen-zset): prepend test subcommand only when absent or an option gemini MEDIUM (PR #550): the previous heuristic hard-coded "test" and "analyze" as the only recognised jepsen.cli subcommands and prepended "test" for anything else -- including future subcommands like "serve". Flip the logic: prepend "test" iff remaining-args is empty OR the first token starts with "-" (i.e. an option). Otherwise treat the first token as a subcommand and let jepsen.cli/run! decide (it also produces a better error message for unknown subcommands than we can). --- jepsen/src/elastickv/jepsen_test.clj | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/jepsen/src/elastickv/jepsen_test.clj b/jepsen/src/elastickv/jepsen_test.clj index 3fc9b5e21..7999c89c0 100644 --- a/jepsen/src/elastickv/jepsen_test.clj +++ b/jepsen/src/elastickv/jepsen_test.clj @@ -47,11 +47,21 @@ [f tail] [elastickv-test args]) ;; jepsen.cli/run! requires a subcommand ("test" or "analyze") - ;; as the first arg. Insert "test" if absent so users don't - ;; have to type it twice. + ;; as the first arg. Insert "test" only when the user clearly + ;; did NOT supply a subcommand: + ;; - remaining-args is empty, OR + ;; - the first token is an option (starts with "-") + ;; If the first token looks like a subcommand (any non-option + ;; word, e.g. "test", "analyze", "serve", or a future jepsen.cli + ;; subcommand we don't hard-code), leave it alone and let + ;; jepsen.cli/run! handle it (including producing a better + ;; error message for unknown subcommands than we could here). [next-head & _] remaining-args - final-args (if (#{"test" "analyze"} next-head) - remaining-args - (cons "test" remaining-args))] + prepend-test? (or (empty? remaining-args) + (and (string? next-head) + (.startsWith ^String next-head "-"))) + final-args (if prepend-test? + (cons "test" remaining-args) + remaining-args)] (cli/run! (cli/single-test-cmd {:test-fn selected-fn}) final-args))) From 7a7a2185418061d2379158c3c28609620ebb42b7 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 19:31:37 +0900 Subject: [PATCH 18/28] fix(jepsen-zset): document why :final-generator is overridden to nil gemini MEDIUM (PR #550): elastickv-zset-safety-test unconditionally overrides the workload's :final-generator to nil, shadowing the (gen/once {:f :zrange-all}) defined in elastickv-zset-safety-workload. Gemini flagged this as suspicious and asked whether it was intentional. It is intentional and matches the project-wide pattern in redis_workload.clj / s3_workload.clj / dynamodb_workload.clj, all of which set :final-generator nil with the comment "Jepsen 0.3.x can't fressian-serialize some combined final gens; skip." (introduced in commit a27267ca0e). Add the same one-line comment here so future readers (and future Gemini reviews) understand the override is a deliberate workaround for a Jepsen 0.3.x limitation, not a bug. --- jepsen/src/elastickv/redis_zset_safety_workload.clj | 1 + 1 file changed, 1 insertion(+) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index 29fd882e8..12610c806 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -816,6 +816,7 @@ (:ssh opts)) :remote control/ssh :nemesis (if nemesis-p (:nemesis nemesis-p) nemesis/noop) + ;; Jepsen 0.3.x can't fressian-serialize some combined final gens; skip. :final-generator nil :concurrency (or (:concurrency opts) 5) :generator (->> (:generator workload) From 62198314885c6bd749163b6a81f63645f4d825a1 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 19:47:15 +0900 Subject: [PATCH 19/28] fix(jepsen-zset): guard ZREM nil reply to avoid NPE in invoke! If `car/wcar` for ZREM returns nil (protocol edge, closed connection, Redis error path), `(long nil)` throws NPE. The NPE was previously caught by the general Exception handler and the op was logged as a generic failure, masking the real signal. Extract a `zrem!` helper paralleling `zincrby!` and wrap the reply in `(or removed 0)` so a nil reply resolves cleanly as `:ok [member false]`. Covers tests for both the nil-guard and the normal numeric reply. Addresses gemini MEDIUM review on PR #550. --- .../elastickv/redis_zset_safety_workload.clj | 17 +++++++++-- .../redis_zset_safety_workload_test.clj | 28 +++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index 12610c806..e48ea965d 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -139,6 +139,13 @@ [conn-spec key delta member] (car/wcar conn-spec (car/zincrby key (double delta) member))) +(defn- zrem! + "Executes a ZREM against conn-spec and returns Carmine's raw reply + (normally an integer count of removed members). Extracted so tests + can stub the Redis call without going through the `car/wcar` macro." + [conn-spec key member] + (car/wcar conn-spec (car/zrem key member))) + (defrecord ElastickvRedisZSetSafetyClient [node->port conn-spec] client/Client @@ -207,8 +214,14 @@ :zrem (let [member (:value op) - removed (car/wcar cs (car/zrem zset-key member))] - (assoc op :type :ok :value [member (pos? (long removed))])) + ;; Carmine normally returns an integer count. Guard + ;; against nil / missing reply (protocol edge, closed + ;; connection, etc.) so `(long removed)` doesn't throw + ;; NPE -- that would otherwise fall through to the + ;; general Exception handler and be logged as a generic + ;; op failure, obscuring the actual signal. + removed (zrem! cs zset-key member)] + (assoc op :type :ok :value [member (pos? (long (or removed 0)))])) :zrange-all (let [flat (car/wcar cs (car/zrange zset-key 0 -1 "WITHSCORES"))] diff --git a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj index 440231fd6..1b3508791 100644 --- a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj +++ b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj @@ -663,6 +663,34 @@ (str "expected :ok on numeric reply, got: " result)) (is (= ["m1" 7.0] (:value result))))))) +(deftest zrem-invoke-handles-nil-response + ;; gemini MEDIUM: if car/wcar for ZREM returns nil (protocol edge, + ;; closed connection, etc.), `(long nil)` would throw NPE and the + ;; op would be logged as a generic failure via the general Exception + ;; handler. Guard with `(or removed 0)` so the op resolves cleanly + ;; as :ok [member false]. + (let [client (workload/->ElastickvRedisZSetSafetyClient + {} {:pool {} :spec {:host "localhost" :port 6379 + :timeout-ms 100}}) + op {:type :invoke :f :zrem :value "ghost" :process 0 :index 0}] + (with-redefs [workload/zrem! (fn [& _] nil)] + (let [result (client/invoke! client {} op)] + (is (= :ok (:type result)) + (str "expected :ok on nil ZREM reply, got: " result)) + (is (= ["ghost" false] (:value result)) + (str "expected removed? false on nil reply, got: " result)))))) + +(deftest zrem-invoke-handles-numeric-response + ;; Sanity: ZREM's normal reply is an integer count. + (let [client (workload/->ElastickvRedisZSetSafetyClient + {} {:pool {} :spec {:host "localhost" :port 6379 + :timeout-ms 100}}) + op {:type :invoke :f :zrem :value "m1" :process 0 :index 0}] + (with-redefs [workload/zrem! (fn [& _] 1)] + (let [result (client/invoke! client {} op)] + (is (= :ok (:type result))) + (is (= ["m1" true] (:value result))))))) + (deftest parse-withscores-handles-inf-strings ;; gemini HIGH: Redis returns "inf" / "+inf" / "-inf" for infinite ;; ZSET scores. Double/parseDouble expects "Infinity"; the workload's From e67d29f157fd18b8554f3e66a8d3a0b1dc4b714d Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 19:48:13 +0900 Subject: [PATCH 20/28] fix(jepsen-zset): return :valid? :unknown when no successful reads If every operation in a run is downgraded to :info (Redis unreachable, every read timed out, etc.), read-pairs is empty and all-errors is empty, so the checker previously returned :valid? true -- a false-green that hides the fact that no safety evidence was gathered. Emit :valid? :unknown plus a diagnostic :reason string when the history contains zero successful :zrange-all / :zrangebyscore reads. The cli's fail-on-invalid! treats anything other than true as a failure, so the run now surfaces the missing signal. Regression tests cover the empty history, the all-:info history, and the single-:ok-read positive case. Addresses codex P1 review on PR #550. --- .../elastickv/redis_zset_safety_workload.clj | 34 +++++++++++---- .../redis_zset_safety_workload_test.clj | 41 +++++++++++++++++++ 2 files changed, 68 insertions(+), 7 deletions(-) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index e48ea965d..89aa03dd4 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -760,13 +760,33 @@ :zrangebyscore (check-zrangebyscore mutations-by-m pair)))) [] read-pairs) - by-kind (group-by :kind all-errors)] - {:valid? (empty? all-errors) - :reads (count read-pairs) - :mutations (count mutations) - :error-count (count all-errors) - :errors-by-kind (into {} (map (fn [[k v]] [k (count v)]) by-kind)) - :first-errors (take 20 all-errors)})))) + by-kind (group-by :kind all-errors) + ;; Vacuous-pass guard (codex P1): if the run produced zero + ;; successful reads, we have no evidence that the system + ;; under test actually satisfies ZSet safety -- every op + ;; may have been downgraded to :info because Redis was + ;; unreachable or every read timed out. Returning + ;; `:valid? true` in that case would be a false-green. + ;; Emit `:valid? :unknown` with a diagnostic reason; the + ;; cli's `fail-on-invalid!` treats anything other than + ;; `true` as a failure (see elastickv.cli/fail-on-invalid!). + no-successful-reads? (zero? (count read-pairs)) + valid? (cond + (seq all-errors) false + no-successful-reads? :unknown + :else true)] + (cond-> {:valid? valid? + :reads (count read-pairs) + :mutations (count mutations) + :error-count (count all-errors) + :errors-by-kind (into {} (map (fn [[k v]] [k (count v)]) by-kind)) + :first-errors (take 20 all-errors)} + no-successful-reads? + (assoc :reason + (str "No successful :zrange-all / :zrangebyscore reads" + " completed -- cannot assert ZSet safety. Likely" + " Redis was unreachable or every read timed out;" + " re-run against a healthy cluster."))))))) ;; --------------------------------------------------------------------------- ;; Workload diff --git a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj index 1b3508791..778a8feb3 100644 --- a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj +++ b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj @@ -663,6 +663,47 @@ (str "expected :ok on numeric reply, got: " result)) (is (= ["m1" 7.0] (:value result))))))) +;; --------------------------------------------------------------------------- +;; Vacuous-pass guard (codex P1) +;; --------------------------------------------------------------------------- + +(deftest empty-history-is-unknown-not-valid + ;; codex P1: an empty history (e.g. Redis unreachable, all ops + ;; downgraded to :info) produces zero successful reads. The checker + ;; MUST NOT return :valid? true in that case -- that would be a + ;; false-green. Expect :valid? :unknown plus a diagnostic :reason. + (let [result (run-checker [])] + (is (= :unknown (:valid? result)) + (str "expected :unknown on empty history, got: " result)) + (is (string? (:reason result)) + (str "expected :reason to be populated, got: " result)) + (is (zero? (:reads result))))) + +(deftest all-info-history-is-unknown-not-valid + ;; codex P1: a run where every operation was downgraded to :info + ;; (Redis unreachable / every read timed out) still has read-pairs + ;; filtered down to zero :ok reads. Must surface as :valid? :unknown. + (let [history [{:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} + {:type :info :process 0 :f :zadd :value ["m1" 1] :index 1 + :error "conn refused"} + {:type :invoke :process 0 :f :zrange-all :index 2} + {:type :info :process 0 :f :zrange-all :index 3 + :error "conn refused"}] + result (run-checker history)] + (is (= :unknown (:valid? result)) + (str "expected :unknown when all ops are :info, got: " result)) + (is (string? (:reason result))))) + +(deftest one-successful-read-is-enough-to-validate + ;; Sanity: the vacuous-pass guard must only kick in when there are + ;; ZERO successful reads. A single :ok read with no errors is a + ;; legitimate :valid? true. + (let [history [{:type :invoke :process 0 :f :zrange-all :index 0} + {:type :ok :process 0 :f :zrange-all :value [] :index 1}] + result (run-checker history)] + (is (true? (:valid? result)) + (str "expected :valid? true with one :ok read, got: " result)))) + (deftest zrem-invoke-handles-nil-response ;; gemini MEDIUM: if car/wcar for ZREM returns nil (protocol edge, ;; closed connection, etc.), `(long nil)` would throw NPE and the From 623d5c22157eb517d0895405f608f04679dcfed2 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 19:49:00 +0900 Subject: [PATCH 21/28] fix(jepsen-zset): hard-fail setup! when cleanup DEL errors Previously setup! caught Throwable around the cleanup (car/wcar cs (car/del zset-key)) and only logged a warn, then returned the client so the run continued. If the DEL failed (connection refused, Redis error, timeout, etc.) stale data from a previous run would survive under zset-key and could produce false-positive safety verdicts in the checker. Re-throw an ex-info wrapping the original cause so Jepsen aborts the run. The warn is retained for log visibility. Update the existing "tolerates-unreachable-redis" regression test to the new expectation: setup! MUST propagate cleanup failures, not swallow them. Addresses gemini MEDIUM review on PR #550. --- .../elastickv/redis_zset_safety_workload.clj | 20 +++++++++++++------ .../redis_zset_safety_workload_test.clj | 15 ++++++++------ 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index 89aa03dd4..b92a5ff5d 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -172,15 +172,23 @@ " against stale data under " zset-key ".") {:type ::missing-conn-spec :zset-key zset-key})))] + ;; The cleanup DEL MUST succeed. If it fails (connection refused, + ;; Redis error reply, timeout, whatever), stale data from a prior + ;; run survives under zset-key and can produce false-positive + ;; safety verdicts in the checker. Log loudly AND re-throw so + ;; Jepsen aborts the run instead of silently running against + ;; dirty state. (gemini MEDIUM) (try (car/wcar cs (car/del zset-key)) (catch Throwable t - ;; Do NOT swallow silently: repeated setup! failures across - ;; runs would leave stale data under zset-key and could - ;; produce false-positive safety failures. Log loudly so - ;; operators notice. clojure.tools.logging/warn expects - ;; (warn msg) or (warn throwable msg) -- NOT multiple strings. - (warn t "ZSet safety setup! DEL failed -- stale data may survive into this run")))) + (warn t "ZSet safety setup! DEL failed -- aborting to avoid stale data") + (throw (ex-info + (str "ZSet safety setup! failed to clear prior state at " + zset-key ": " (.getMessage t) + ". Refusing to run against potentially stale data.") + {:type ::cleanup-failed + :zset-key zset-key} + t))))) this) (teardown! [this _test] this) diff --git a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj index 778a8feb3..1eca986b5 100644 --- a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj +++ b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj @@ -611,16 +611,19 @@ (client/setup! client {})) "setup! must throw ex-info when :conn-spec is nil"))) -(deftest setup-bang-tolerates-unreachable-redis - ;; gemini MEDIUM: a Redis that can't be reached must surface as a - ;; logged warn, not an uncaught throw that aborts the whole run. The - ;; fix catches Throwable in setup!. +(deftest setup-bang-hard-fails-when-cleanup-del-errors + ;; gemini MEDIUM: even when :conn-spec is populated, if the actual + ;; cleanup (DEL zset-key) fails or errors, setup! must NOT silently + ;; proceed. Stale data surviving from a prior run under zset-key + ;; would cause false-positive safety verdicts. Propagate the + ;; exception so Jepsen aborts the run. (let [client (workload/->ElastickvRedisZSetSafetyClient {} {:pool {} :spec {:host "127.0.0.1" :port 1 ; guaranteed unreachable :timeout-ms 100}})] - (is (= client (client/setup! client {})) - "setup! must swallow connection errors and keep the run going"))) + (is (thrown? Throwable + (client/setup! client {})) + "setup! must propagate cleanup failures, not swallow them"))) (deftest zincrby-invoke-handles-nil-response ;; gemini MEDIUM: if car/wcar for ZINCRBY returns nil (error reply From 22e41e1127560ab177096453fe1e0d891a3770bb Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 20:09:38 +0900 Subject: [PATCH 22/28] fix(jepsen-zset): guard nil .getMessage on exception :error fields Throwable.getMessage() can return nil when the exception was constructed without a detail message. Without a guard, the op would record :error nil -- an unhelpful signal that hides the real failure mode. Wrap every .getMessage call feeding :error (or an error string) with (or (.getMessage t) (str t)) so the diagnostic falls back to the exception's class + inner state when no message is attached. Applies to three sites in redis_zset_safety_workload.clj: - coerce-zincrby-score's Throwable branch - setup! cleanup-failed ex-info message - invoke!'s catch-all :info error recorder --- jepsen/src/elastickv/redis_zset_safety_workload.clj | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index b92a5ff5d..9cea08acc 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -116,7 +116,7 @@ ;; Carmine surfaces Redis error replies as exceptions by default, ;; but some codepaths wrap them in an ex-info / Throwable value. (instance? Throwable response) - [:error (.getMessage ^Throwable response)] + [:error (or (.getMessage ^Throwable response) (str response))] :else [:unexpected response])) @@ -184,7 +184,7 @@ (warn t "ZSet safety setup! DEL failed -- aborting to avoid stale data") (throw (ex-info (str "ZSet safety setup! failed to clear prior state at " - zset-key ": " (.getMessage t) + zset-key ": " (or (.getMessage t) (str t)) ". Refusing to run against potentially stale data.") {:type ::cleanup-failed :zset-key zset-key} @@ -245,7 +245,7 @@ :members (parse-withscores flat)}))) (catch Exception e (warn e (str "ZSet safety op failed: " (:f op))) - (assoc op :type :info :error (.getMessage e))))))) + (assoc op :type :info :error (or (.getMessage e) (str e)))))))) ;; --------------------------------------------------------------------------- ;; Generator From 9f5e958d7c2029f61e356bffc9811c5819a395b2 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 20:11:00 +0900 Subject: [PATCH 23/28] docs(jepsen-zset): strip LLM reviewer artifact markers from comments Scrub "(gemini MEDIUM)", "(codex P1)", "gemini HIGH", "CodeRabbit finding", etc. from source and test comments. The substantive explanations remain; only the review-trail jargon is removed so the code reads as intentional design rather than a paper trail of iterative bot feedback. Files: - jepsen/src/elastickv/redis_zset_safety_workload.clj (2 sites) - jepsen/test/elastickv/redis_zset_safety_workload_test.clj (many) --- .../elastickv/redis_zset_safety_workload.clj | 4 +- .../redis_zset_safety_workload_test.clj | 80 +++++++++---------- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index 9cea08acc..4bc4a289d 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -177,7 +177,7 @@ ;; run survives under zset-key and can produce false-positive ;; safety verdicts in the checker. Log loudly AND re-throw so ;; Jepsen aborts the run instead of silently running against - ;; dirty state. (gemini MEDIUM) + ;; dirty state. (try (car/wcar cs (car/del zset-key)) (catch Throwable t @@ -769,7 +769,7 @@ [] read-pairs) by-kind (group-by :kind all-errors) - ;; Vacuous-pass guard (codex P1): if the run produced zero + ;; Vacuous-pass guard: if the run produced zero ;; successful reads, we have no evidence that the system ;; under test actually satisfies ZSet safety -- every op ;; may have been downgraded to :info because Redis was diff --git a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj index 1eca986b5..0641d7efc 100644 --- a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj +++ b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj @@ -77,12 +77,12 @@ (is (not (:valid? result)) (str "expected mismatch, got: " result)))) (deftest single-ok-concurrent-zincrby-still-validates-scores - ;; Codex P1: :unknown-score? must NOT be set when exactly one - ;; concurrent ZINCRBY is :ok (and therefore has a known resulting - ;; score). The read may observe either the pre-op score or the - ;; post-op score, both of which are in :scores. An arbitrary - ;; impossible score (e.g. 999.0) must still be flagged as a - ;; :score-mismatch, not waved through by `:unknown-score?`. + ;; :unknown-score? must NOT be set when exactly one concurrent + ;; ZINCRBY is :ok (and therefore has a known resulting score). The + ;; read may observe either the pre-op score or the post-op score, + ;; both of which are in :scores. An arbitrary impossible score + ;; (e.g. 999.0) must still be flagged as a :score-mismatch, not + ;; waved through by `:unknown-score?`. (let [history [{:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 1} {:type :invoke :process 1 :f :zincrby :value ["m1" 5] :index 2} @@ -130,7 +130,7 @@ (is (:valid? result) (str "expected valid, got: " result)))) (deftest duplicate-members-are-flagged - ;; CodeRabbit finding: ZRANGE must not return the same member twice. + ;; ZRANGE must not return the same member twice. ;; With a hypothetical committed + concurrent score for the same ;; member, a duplicate could sneak past sort + score-membership ;; checks. Enforce distinctness explicitly. @@ -143,7 +143,7 @@ (is (not (:valid? result)) (str "expected duplicate-members error, got: " result)))) (deftest overlapping-committed-zadds-allow-either-score - ;; CodeRabbit finding: two :ok ZADDs for the same member whose + ;; Two :ok ZADDs for the same member whose ;; invoke/complete windows overlap have ambiguous serialization ;; order. Either's resulting score is a valid post-state; the checker ;; must not pin to the higher :complete-idx value only. @@ -163,7 +163,7 @@ (str "expected valid under overlapping-writes relaxation, got: " result)))) (deftest info-before-read-is-considered-uncertain - ;; CodeRabbit finding: an :info mutation that completed before a + ;; An :info mutation that completed before a ;; later read may have taken effect. It must be considered a possible ;; source of state for that read, rather than being ignored by both ;; model-before and the concurrent window. @@ -183,11 +183,11 @@ (str "expected :info-before-read to skip strict score check, got: " result)))) ;; --------------------------------------------------------------------------- -;; Stale-read / phantom / superseded-committed checks (gemini HIGH) +;; Stale-read / phantom / superseded-committed checks ;; --------------------------------------------------------------------------- (deftest phantom-member-is-flagged - ;; gemini HIGH: a read that observes a member which was never added + ;; A read that observes a member which was never added ;; (no ZADD/ZINCRBY/true-ZREM anywhere) must be rejected. (let [history [{:type :invoke :process 0 :f :zrange-all :index 0} {:type :ok :process 0 :f :zrange-all @@ -199,7 +199,7 @@ (str "expected :unexpected-presence, got kinds=" kinds)))) (deftest phantom-from-info-zrem-still-flagged - ;; gemini HIGH (round 2): an :info ZREM is the ONLY history contact + ;; An :info ZREM is the ONLY history contact ;; with a member (no ZADD/ZINCRBY ever). Because completed-mutation- ;; window defaults :removed? to true on :info ZREMs (for uncertainty ;; accounting), the checker must NOT treat ZREM as proof the member @@ -224,7 +224,7 @@ (str "expected :unexpected-presence, got kinds=" kinds)))) (deftest stale-read-after-committed-zrem-is-flagged - ;; gemini HIGH: once a ZADD and a subsequent real (:removed? true) ZREM + ;; Once a ZADD and a subsequent real (:removed? true) ZREM ;; have BOTH committed (with no concurrent re-add), a later read that ;; still sees the member must be rejected as a stale read. (let [history [;; Add then remove m1 — both committed before any read. @@ -243,7 +243,7 @@ (str "expected :unexpected-presence, got kinds=" kinds)))) (deftest superseded-committed-score-is-not-allowed - ;; gemini HIGH: a ZADD committed BEFORE another ZADD for the same + ;; A ZADD committed BEFORE another ZADD for the same ;; member whose invoke strictly followed it should not be treated as ;; a valid post-state score. Only the latest committed score (plus ;; concurrent in-flight) may be observed. @@ -267,11 +267,11 @@ ;; --------------------------------------------------------------------------- ;; --------------------------------------------------------------------------- -;; Linearization of concurrent ops / uncertain mutations (gemini HIGH batch 2) +;; Linearization of concurrent ops / uncertain mutations ;; --------------------------------------------------------------------------- (deftest concurrent-zadd-zrem-both-completed-accepts-either-outcome - ;; gemini HIGH: ZADD and ZREM for the same member whose invoke/complete + ;; ZADD and ZREM for the same member whose invoke/complete ;; windows overlap (both commit before the read) have ambiguous ;; linearization. A linearizable store may serialize either one last, ;; so the read legitimately observes EITHER [["m1" 1.0]] OR []. @@ -294,7 +294,7 @@ "expected read observing ZREM's outcome (absent) to be accepted"))) (deftest info-zrem-concurrent-with-read-allows-missing-member - ;; gemini HIGH: an :info ZREM that might have applied before a read + ;; An :info ZREM that might have applied before a read ;; leaves the member's presence uncertain. A ZRANGE that omits the ;; member must NOT be flagged as a completeness failure. (let [history [;; ZADD m1 committed before the read. @@ -312,7 +312,7 @@ (str "expected :info ZREM to make absence acceptable, got: " result)))) (deftest info-zincrby-does-not-flag-zrangebyscore-completeness - ;; gemini HIGH: a pre-read :info / concurrent ZINCRBY leaves the + ;; A pre-read :info / concurrent ZINCRBY leaves the ;; resulting score unknown. ZRANGEBYSCORE filtering on a specific ;; range must not flag the member as missing, because its score may ;; have moved outside [lo, hi]. @@ -359,11 +359,11 @@ (str "expected :missing-member, got kinds=" kinds)))) ;; --------------------------------------------------------------------------- -;; Failed-concurrent mutations must not contribute to uncertainty (codex P1) +;; Failed-concurrent mutations must not contribute to uncertainty ;; --------------------------------------------------------------------------- (deftest failed-concurrent-zrem-does-not-relax-must-be-present - ;; codex P1: a concurrent ZREM that completes with :fail did NOT take + ;; A concurrent ZREM that completes with :fail did NOT take ;; effect. Its window must NOT make the member's presence uncertain, ;; so a read that omits the member (which was ZADDed and committed ;; beforehand) must be flagged as :missing-member. @@ -386,7 +386,7 @@ (str "expected :missing-member, got kinds=" kinds)))) (deftest failed-concurrent-zadd-does-not-contribute-allowed-score - ;; codex P1: a concurrent ZADD that completes with :fail did NOT take + ;; A concurrent ZADD that completes with :fail did NOT take ;; effect. Its score must NOT be added to the allowed set. A read ;; observing that score must be flagged as :score-mismatch rather than ;; being waved through by the failed ZADD's ghost contribution. @@ -410,11 +410,11 @@ ;; --------------------------------------------------------------------------- ;; Chained committed ZINCRBYs: only the linearization-chain tail is a -;; valid final score. Earlier intermediate return values are stale. (codex P1) +;; valid final score. Earlier intermediate return values are stale. ;; --------------------------------------------------------------------------- (deftest chained-committed-zincrby-rejects-stale-intermediate - ;; codex P1: sequential committed ZINCRBYs form a forced linearization + ;; Sequential committed ZINCRBYs form a forced linearization ;; chain. The first ZINCRBY's return value is an intermediate that no ;; post-chain read may observe. Expect :score-mismatch on the stale ;; intermediate. @@ -442,7 +442,7 @@ (str "expected :score-mismatch, got kinds=" kinds)))) (deftest chained-committed-zincrby-accepts-latest - ;; codex P1: same history but the read observes the LATEST chain tail + ;; Same history but the read observes the LATEST chain tail ;; (6.0) -- accept as valid. (let [history [{:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} {:type :ok :process 0 :f :zadd :value ["m1" 1] :index 1} @@ -458,7 +458,7 @@ (str "expected chain-tail score to be accepted, got: " result)))) (deftest concurrent-zincrby-both-admissible - ;; codex P1: two overlapping-in-real-time ZINCRBYs whose returned + ;; Two overlapping-in-real-time ZINCRBYs whose returned ;; scores are BOTH candidate final states under some linearization. ;; Read observing either value must be accepted. ;; Overlap: A=[inv=2, cmp=5], B=[inv=3, cmp=4]. @@ -484,7 +484,7 @@ "expected A's return value (6.0) admissible under overlap"))) (deftest zadd-resets-zincrby-chain - ;; codex P1: a committed ZADD between ZINCRBYs resets the chain -- + ;; A committed ZADD between ZINCRBYs resets the chain -- ;; subsequent ZINCRBYs operate on the new ZADD'd value. The pre-reset ;; ZINCRBY score is NOT a valid read after the chain completes. (let [base [;; ZADD m1 1 @@ -517,11 +517,11 @@ ;; :ok ZINCRBYs with known return values do NOT make the score check ;; unknown -- their return values pin the linearization and the ;; admissible score set is constrained by :scores (candidates + uncertain -;; ok return values). (codex P1) +;; ok return values). ;; --------------------------------------------------------------------------- (deftest two-ok-concurrent-zincrbys-reject-impossible-score - ;; codex P1: two overlapping :ok ZINCRBYs with known return values + ;; Two overlapping :ok ZINCRBYs with known return values ;; (3 and 6) constrain the admissible post-chain read set to {1,3,6}. ;; A read of 999 is impossible under any linearization; the checker ;; must flag it as :score-mismatch (no longer swallowed by the old @@ -545,7 +545,7 @@ (str "expected :score-mismatch, got kinds=" kinds)))) (deftest two-ok-concurrent-zincrbys-accept-known-chain-tail - ;; codex P1: same concurrent :ok ZINCRBY history, but the read + ;; Same concurrent :ok ZINCRBY history, but the read ;; observes one of the recorded return values. Both 3.0 (linearization ;; where +3 ran first, then +2) and 6.0 (the other order) must be ;; accepted as valid. @@ -569,7 +569,7 @@ "expected 3.0 (other linearization) to be accepted"))) (deftest info-plus-ok-concurrent-zincrby-stays-unknown - ;; codex P1: when at least one concurrent ZINCRBY is :info (unknown + ;; When at least one concurrent ZINCRBY is :info (unknown ;; post-op score), the strict score check must be relaxed regardless ;; of how many other :ok ZINCRBYs are concurrent. Any numeric score ;; must be accepted for this read. @@ -596,11 +596,11 @@ ;; --------------------------------------------------------------------------- ;; --------------------------------------------------------------------------- -;; Client setup! / invoke! robustness (gemini MEDIUM) +;; Client setup! / invoke! robustness ;; --------------------------------------------------------------------------- (deftest setup-bang-hard-fails-when-conn-spec-missing - ;; gemini HIGH: if open! failed to populate :conn-spec (unresolvable + ;; If open! failed to populate :conn-spec (unresolvable ;; host, etc.), setup! MUST throw rather than silently proceed. ;; Continuing with a no-op setup would leave stale data from a prior ;; run under zset-key and risk false-positive checker verdicts from @@ -612,7 +612,7 @@ "setup! must throw ex-info when :conn-spec is nil"))) (deftest setup-bang-hard-fails-when-cleanup-del-errors - ;; gemini MEDIUM: even when :conn-spec is populated, if the actual + ;; Even when :conn-spec is populated, if the actual ;; cleanup (DEL zset-key) fails or errors, setup! must NOT silently ;; proceed. Stale data surviving from a prior run under zset-key ;; would cause false-positive safety verdicts. Propagate the @@ -626,7 +626,7 @@ "setup! must propagate cleanup failures, not swallow them"))) (deftest zincrby-invoke-handles-nil-response - ;; gemini MEDIUM: if car/wcar for ZINCRBY returns nil (error reply + ;; If car/wcar for ZINCRBY returns nil (error reply ;; coerced, unexpected protocol edge), the op must complete as :info ;; with a structured :error, not throw NumberFormatException from ;; parse-double-safe swallowing (str nil) -> "nil". @@ -642,7 +642,7 @@ (str "expected :error to be populated, got: " result)))))) (deftest zincrby-invoke-handles-unexpected-response - ;; gemini MEDIUM: same guard, but for a non-string / non-number reply. + ;; Same guard, but for a non-string / non-number reply. ;; Must complete :info rather than propagate a parse failure. (let [client (workload/->ElastickvRedisZSetSafetyClient {} {:pool {} :spec {:host "localhost" :port 6379 @@ -667,11 +667,11 @@ (is (= ["m1" 7.0] (:value result))))))) ;; --------------------------------------------------------------------------- -;; Vacuous-pass guard (codex P1) +;; Vacuous-pass guard ;; --------------------------------------------------------------------------- (deftest empty-history-is-unknown-not-valid - ;; codex P1: an empty history (e.g. Redis unreachable, all ops + ;; An empty history (e.g. Redis unreachable, all ops ;; downgraded to :info) produces zero successful reads. The checker ;; MUST NOT return :valid? true in that case -- that would be a ;; false-green. Expect :valid? :unknown plus a diagnostic :reason. @@ -683,7 +683,7 @@ (is (zero? (:reads result))))) (deftest all-info-history-is-unknown-not-valid - ;; codex P1: a run where every operation was downgraded to :info + ;; A run where every operation was downgraded to :info ;; (Redis unreachable / every read timed out) still has read-pairs ;; filtered down to zero :ok reads. Must surface as :valid? :unknown. (let [history [{:type :invoke :process 0 :f :zadd :value ["m1" 1] :index 0} @@ -708,7 +708,7 @@ (str "expected :valid? true with one :ok read, got: " result)))) (deftest zrem-invoke-handles-nil-response - ;; gemini MEDIUM: if car/wcar for ZREM returns nil (protocol edge, + ;; If car/wcar for ZREM returns nil (protocol edge, ;; closed connection, etc.), `(long nil)` would throw NPE and the ;; op would be logged as a generic failure via the general Exception ;; handler. Guard with `(or removed 0)` so the op resolves cleanly @@ -736,7 +736,7 @@ (is (= ["m1" true] (:value result))))))) (deftest parse-withscores-handles-inf-strings - ;; gemini HIGH: Redis returns "inf" / "+inf" / "-inf" for infinite + ;; Redis returns "inf" / "+inf" / "-inf" for infinite ;; ZSET scores. Double/parseDouble expects "Infinity"; the workload's ;; parser must normalize both encodings instead of throwing. (let [flat ["m-pos" "inf" From d0c8a03f06f2660b58cd74fd9944e5dc376c7b21 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 20:11:32 +0900 Subject: [PATCH 24/28] fix(jepsen-zset): let inner workload's :final-generator pass through elastickv-zset-safety-test was unconditionally overriding the workload map's :final-generator to nil, citing "Jepsen 0.3.x can't fressian- serialize some combined final gens." That blanket cargo-culted the convention introduced in a27267ca0e for OTHER workloads (redis, s3, dynamodb) that compose more complex final gens. This workload's inner :final-generator is literally (gen/once {:f :zrange-all}) -- a single Limit defrecord wrapping a 1-key Clojure map, which round-trips through Jepsen 0.3.x's Fressian store cleanly (verified at 86 bytes via a REPL serialization test). Drop the override so the workload's one-shot final read survives, and update the comment to explain why this workload doesn't need the convention workaround the other workloads do. Note: elastickv.cli / jepsen_test.clj don't currently invoke :final-generator at the runner level, so this is cosmetic today but future-proofs the workload against a runner that would honor it. --- jepsen/src/elastickv/redis_zset_safety_workload.clj | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index 4bc4a289d..c596886b2 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -857,8 +857,11 @@ (:ssh opts)) :remote control/ssh :nemesis (if nemesis-p (:nemesis nemesis-p) nemesis/noop) - ;; Jepsen 0.3.x can't fressian-serialize some combined final gens; skip. - :final-generator nil + ;; The inner workload's :final-generator is the trivially- + ;; serializable (gen/once {:f :zrange-all}) -- a single + ;; Limit defrecord wrapping a plain map. It round-trips + ;; through Jepsen 0.3.x's Fressian store cleanly + ;; (verified at 86 bytes), so we don't override it here. :concurrency (or (:concurrency opts) 5) :generator (->> (:generator workload) (gen/nemesis nemesis-gen) From 559e83d4f07bce63882fd3faa776d82dc1eeeb0f Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 20:14:02 +0900 Subject: [PATCH 25/28] fix(jepsen-zset): reject odd-length WITHSCORES replies Addresses codex P2 on 623d5c22. parse-withscores used `(partition 2)` which silently drops a trailing unpaired element. For a safety-focused workload this is exactly the wrong failure mode: a WITHSCORES reply with a dangling member is a protocol violation, and if the checker truncates the evidence rather than surfacing it the anomaly is laundered into a false green/unknown verdict. Fix: validate `(odd? (count flat))` up front and throw `ex-info` with the payload attached for debugging. The try/catch in invoke! routes the throw into an `:info` op so the history records the violation rather than dropping it. Regression test `parse-withscores-rejects-odd-length-payload` confirms a 3-element input throws with `"odd element count"`. Tests: 41 tests / 67 assertions, 0 failures. --- jepsen/src/elastickv/redis_zset_safety_workload.clj | 10 +++++++++- .../test/elastickv/redis_zset_safety_workload_test.clj | 9 +++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index c596886b2..5f27fc386 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -124,8 +124,16 @@ (defn- parse-withscores "Carmine returns a flat [member score member score ...] vector for ZRANGE WITHSCORES. Convert to a sorted vector of [member (double score)] - preserving server-returned order (score ascending, then member)." + preserving server-returned order (score ascending, then member). + + Throws on odd-length payloads: a WITHSCORES reply with a dangling member + is a protocol violation and this workload is meant to surface exactly + that kind of anomaly, not silently drop evidence." [flat] + (when (odd? (count flat)) + (throw (ex-info "WITHSCORES reply has odd element count" + {:count (count flat) + :payload flat}))) (->> flat (partition 2) (mapv (fn [[m s]] diff --git a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj index 0641d7efc..e7e9f984e 100644 --- a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj +++ b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj @@ -751,3 +751,12 @@ ["m-jvm" Double/POSITIVE_INFINITY] ["m-num" 3.5]] parsed)))) + +(deftest parse-withscores-rejects-odd-length-payload + ;; A WITHSCORES reply with a dangling member (odd element count) is a + ;; protocol violation. The checker must surface it rather than let + ;; `(partition 2)` silently drop evidence of the anomaly. + (is (thrown-with-msg? + clojure.lang.ExceptionInfo + #"odd element count" + (#'workload/parse-withscores ["m1" "1.0" "m2-dangling"])))) From 1a9370f42a38fcd64094ac022cd959093c0acbad Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 20:21:34 +0900 Subject: [PATCH 26/28] fix(jepsen-zset): coerce ZREM count across Long / string / bytes Some Carmine versions / RESP3 codepaths surface ZREM's reply as a numeric string ("1") or raw bytes rather than a Long. The previous (long (or removed 0)) guard handled nil but still threw ClassCastException on string/bytes, falling through to the general Exception handler and masking the real signal. Add coerce-zrem-count helper (parallel to coerce-zincrby-score) that accepts Number / String / bytes / nil / other and returns a non-negative long, with unparseable values treated as 0 so the op stays :ok. Covers regression with tests exercising string "1", string "0", bytes, and unexpected keyword replies. --- .../elastickv/redis_zset_safety_workload.clj | 48 ++++++++++++++--- .../redis_zset_safety_workload_test.clj | 52 +++++++++++++++++++ 2 files changed, 94 insertions(+), 6 deletions(-) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index 5f27fc386..e3f121f6b 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -121,6 +121,39 @@ :else [:unexpected response])) +(defn- coerce-zrem-count + "Carmine's ZREM reply is normally a Long (count of removed members), + but under protocol edge cases / Carmine versions / RESP2 vs RESP3 + differences it can also arrive as a numeric string (\"1\") or raw + bytes. Blindly calling `(long reply)` on those forms throws + ClassCastException, which would fall through to the general exception + handler and mask the real signal. + + Returns a non-negative long count. Unparseable or unexpected values + are treated as 0 (i.e. \"nothing removed\") so the op still resolves + as :ok -- matching the existing nil-guard behaviour. + " + [response] + (cond + (nil? response) + 0 + + (number? response) + (long response) + + (string? response) + (try + (Long/parseLong ^String response) + (catch NumberFormatException _ 0)) + + (bytes? response) + (try + (Long/parseLong (String. ^bytes response "UTF-8")) + (catch NumberFormatException _ 0)) + + :else + 0)) + (defn- parse-withscores "Carmine returns a flat [member score member score ...] vector for ZRANGE WITHSCORES. Convert to a sorted vector of [member (double score)] @@ -232,12 +265,15 @@ (let [member (:value op) ;; Carmine normally returns an integer count. Guard ;; against nil / missing reply (protocol edge, closed - ;; connection, etc.) so `(long removed)` doesn't throw - ;; NPE -- that would otherwise fall through to the - ;; general Exception handler and be logged as a generic - ;; op failure, obscuring the actual signal. - removed (zrem! cs zset-key member)] - (assoc op :type :ok :value [member (pos? (long (or removed 0)))])) + ;; connection, etc.) AND against non-numeric shapes + ;; (string "1", raw bytes) that some Carmine versions + ;; or RESP3 codepaths surface. A naked `(long reply)` + ;; would NPE on nil and ClassCastException on + ;; string/bytes, falling through to the general + ;; Exception handler and masking the real signal. + removed (zrem! cs zset-key member) + n (coerce-zrem-count removed)] + (assoc op :type :ok :value [member (pos? n)])) :zrange-all (let [flat (car/wcar cs (car/zrange zset-key 0 -1 "WITHSCORES"))] diff --git a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj index e7e9f984e..2da22fcba 100644 --- a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj +++ b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj @@ -735,6 +735,58 @@ (is (= :ok (:type result))) (is (= ["m1" true] (:value result))))))) +(deftest zrem-invoke-handles-string-response + ;; Some Carmine versions / RESP3 codepaths surface ZREM's count as a + ;; numeric string rather than a Long. `(long \"1\")` would throw + ;; ClassCastException; the coerce-zrem-count helper must parse the + ;; string and the op must still resolve as :ok with removed? true. + (let [client (workload/->ElastickvRedisZSetSafetyClient + {} {:pool {} :spec {:host "localhost" :port 6379 + :timeout-ms 100}}) + op {:type :invoke :f :zrem :value "m1" :process 0 :index 0}] + (with-redefs [workload/zrem! (fn [& _] "1")] + (let [result (client/invoke! client {} op)] + (is (= :ok (:type result)) + (str "expected :ok on string ZREM reply, got: " result)) + (is (= ["m1" true] (:value result)) + (str "expected removed? true on string \"1\", got: " result)))))) + +(deftest zrem-invoke-handles-string-zero-response + ;; String "0" must be parsed as removed? false (not truthy because it + ;; is a non-empty string). + (let [client (workload/->ElastickvRedisZSetSafetyClient + {} {:pool {} :spec {:host "localhost" :port 6379 + :timeout-ms 100}}) + op {:type :invoke :f :zrem :value "ghost" :process 0 :index 0}] + (with-redefs [workload/zrem! (fn [& _] "0")] + (let [result (client/invoke! client {} op)] + (is (= :ok (:type result))) + (is (= ["ghost" false] (:value result))))))) + +(deftest zrem-invoke-handles-bytes-response + ;; Raw-bytes numeric reply (RESP binary-safe path) must be decoded as + ;; UTF-8 and parsed. "1" => removed? true. + (let [client (workload/->ElastickvRedisZSetSafetyClient + {} {:pool {} :spec {:host "localhost" :port 6379 + :timeout-ms 100}}) + op {:type :invoke :f :zrem :value "m1" :process 0 :index 0}] + (with-redefs [workload/zrem! (fn [& _] (.getBytes "1" "UTF-8"))] + (let [result (client/invoke! client {} op)] + (is (= :ok (:type result))) + (is (= ["m1" true] (:value result))))))) + +(deftest zrem-invoke-handles-unparseable-response + ;; Totally unexpected reply shape: treat as 0 (nothing removed) rather + ;; than throw. Keeps the op :ok and records removed? false. + (let [client (workload/->ElastickvRedisZSetSafetyClient + {} {:pool {} :spec {:host "localhost" :port 6379 + :timeout-ms 100}}) + op {:type :invoke :f :zrem :value "ghost" :process 0 :index 0}] + (with-redefs [workload/zrem! (fn [& _] :weird)] + (let [result (client/invoke! client {} op)] + (is (= :ok (:type result))) + (is (= ["ghost" false] (:value result))))))) + (deftest parse-withscores-handles-inf-strings ;; Redis returns "inf" / "+inf" / "-inf" for infinite ;; ZSET scores. Double/parseDouble expects "Infinity"; the workload's From ad9079c60c03d203b6fd0b1a9c56feaf58a1c855 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 20:22:26 +0900 Subject: [PATCH 27/28] fix(jepsen-zset): include full :allowed set in missing-member-range :missing-member-range previously reported :expected-score (first scores), which is misleading when concurrent writers leave multiple admissible scores -- picking an arbitrary element hides the other valid linearizations from consumers reading the error map. Add :allowed scores to the error map (matching the sibling :score-mismatch-range convention). :expected-score is retained for backward compatibility but is only populated when the admissible set has exactly one element; with >1 admissible scores it is nil, forcing consumers to look at :allowed instead of silently reading a half-truth. Covers the new error shape with tests for both the multi-score (several concurrent writers) and single-score (sanity / backcompat) cases. --- .../elastickv/redis_zset_safety_workload.clj | 13 ++++- .../redis_zset_safety_workload_test.clj | 50 +++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index e3f121f6b..4e44338e6 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -782,11 +782,22 @@ (when (and must-be-present? (score-definitely-in-range? scores unknown-score? lo hi) (not (contains? observed-members member))) + ;; Report the full set of admissible scores (:allowed), not + ;; just an arbitrary first element -- picking `(first + ;; scores)` on a multi-element set is misleading when + ;; concurrent writers leave several linearizations valid. + ;; :allowed matches the convention used by the sibling + ;; :score-mismatch-range error above. :expected-score is + ;; retained (as `(first scores)` for a single-element set, + ;; nil otherwise) for backward compatibility with any + ;; out-of-tree consumers. (swap! errors conj {:kind :missing-member-range :index cmp-idx :bounds bounds :member member - :expected-score (first scores)}))))) + :allowed scores + :expected-score (when (= 1 (count scores)) + (first scores))}))))) @errors)) (defn zset-safety-checker diff --git a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj index 2da22fcba..a6eae2ae9 100644 --- a/jepsen/test/elastickv/redis_zset_safety_workload_test.clj +++ b/jepsen/test/elastickv/redis_zset_safety_workload_test.clj @@ -346,6 +346,56 @@ (is (contains? kinds :missing-member-range) (str "expected :missing-member-range, got kinds=" kinds)))) +(deftest missing-member-range-error-reports-full-allowed-score-set + ;; When a member is missing from ZRANGEBYSCORE and multiple + ;; concurrent writers make several scores admissible, the error map + ;; must surface the FULL admissible set under :allowed (matching + ;; :score-mismatch-range convention) rather than pick an arbitrary + ;; single :expected-score. + (let [history [;; Two concurrent ZADDs for m1, both committed before + ;; the read. Either score (5 or 6) is admissible, both + ;; fall inside [0, 10]. + {:type :invoke :process 0 :f :zadd :value ["m1" 5] :index 0} + {:type :invoke :process 1 :f :zadd :value ["m1" 6] :index 1} + {:type :ok :process 0 :f :zadd :value ["m1" 5] :index 2} + {:type :ok :process 1 :f :zadd :value ["m1" 6] :index 3} + ;; Read sees nothing -- m1 must appear under any + ;; admissible linearization, so :missing-member-range + ;; fires. + {:type :invoke :process 2 :f :zrangebyscore :value [0.0 10.0] :index 4} + {:type :ok :process 2 :f :zrangebyscore + :value {:bounds [0.0 10.0] :members []} :index 5}] + result (run-checker history) + miss (first (filter #(= :missing-member-range (:kind %)) + (:first-errors result)))] + (is (not (:valid? result))) + (is (some? miss) + (str "expected a :missing-member-range error, got: " (:first-errors result))) + (is (contains? miss :allowed) + (str "error map must include :allowed, got: " miss)) + (is (= #{5.0 6.0} (set (:allowed miss))) + (str "expected :allowed to contain both admissible scores, got: " miss)) + ;; :expected-score is retained for backcompat but MUST be nil when + ;; there is more than one admissible score, to avoid misleading + ;; consumers that read it. + (is (nil? (:expected-score miss)) + (str "expected :expected-score nil for multi-score set, got: " miss)))) + +(deftest missing-member-range-error-keeps-expected-score-when-single + ;; Backcompat: when the admissible set has exactly one score, + ;; :expected-score matches it. + (let [history [{:type :invoke :process 0 :f :zadd :value ["m1" 5] :index 0} + {:type :ok :process 0 :f :zadd :value ["m1" 5] :index 1} + {:type :invoke :process 0 :f :zrangebyscore :value [0.0 10.0] :index 2} + {:type :ok :process 0 :f :zrangebyscore + :value {:bounds [0.0 10.0] :members []} :index 3}] + result (run-checker history) + miss (first (filter #(= :missing-member-range (:kind %)) + (:first-errors result)))] + (is (some? miss)) + (is (= #{5.0} (set (:allowed miss)))) + (is (= 5.0 (:expected-score miss))))) + (deftest zrange-completeness-still-detects-truly-missing-member ;; Sanity: no uncertainty, member committed-present. Absence flagged. (let [history [{:type :invoke :process 0 :f :zadd :value ["m1" 5] :index 0} From 69db24e1cc15704585acb2571b1c2e3fc105bdef Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 20:24:51 +0900 Subject: [PATCH 28/28] fix(jepsen-zset): catch Throwable in invoke! so Errors don't crash workers Addresses gemini medium on 559e83d4. The `invoke!` `(catch Exception e)` won't catch `java.lang.Error` subclasses (NoClassDefFoundError, OutOfMemoryError, LinkageError, ...). In Jepsen those normally propagate, crash the worker thread, and can hang the run or leave history gaps that mask the underlying failure. Widen to `(catch Throwable t)`, matching the pattern already used one call site up in zincrby's coerce branch at line 224. The op is still recorded as :info with a descriptive :error field, so the history shows exactly which operation hit the Error and why. Tests: 47 tests / 83 assertions, 0 failures. --- jepsen/src/elastickv/redis_zset_safety_workload.clj | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jepsen/src/elastickv/redis_zset_safety_workload.clj b/jepsen/src/elastickv/redis_zset_safety_workload.clj index 4e44338e6..0478967ec 100644 --- a/jepsen/src/elastickv/redis_zset_safety_workload.clj +++ b/jepsen/src/elastickv/redis_zset_safety_workload.clj @@ -287,9 +287,9 @@ "WITHSCORES"))] (assoc op :type :ok :value {:bounds [lo hi] :members (parse-withscores flat)}))) - (catch Exception e - (warn e (str "ZSet safety op failed: " (:f op))) - (assoc op :type :info :error (or (.getMessage e) (str e)))))))) + (catch Throwable t + (warn t (str "ZSet safety op failed: " (:f op))) + (assoc op :type :info :error (or (.getMessage ^Throwable t) (str t)))))))) ;; --------------------------------------------------------------------------- ;; Generator