From 1fad5aa9ff0fd63edd168c94787672b40857e778 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Mon, 18 May 2026 10:32:34 +0300
Subject: [PATCH 1/4] =?UTF-8?q?perf(fse):=20donor=20FSE=5FbuildCTable=5Fwk?=
 =?UTF-8?q?sp=20parity=20=E2=80=94=20drop=20per-symbol=20Vec<State>?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

#110. Replace O(num_symbols × table_size) builder loop + BTreeSet
dedup with donor `FSE_buildCTable_wksp` (cumul + tableSymbol spread +
sorted-by-symbol sweep). Production no longer materializes any
per-state Vec; `nextStateTable` (`state_table_flat`) + `symbolTT`
are everything the hot path needs. `start_state` and `max_num_bits`
are precomputed once per symbol via donor 16.16 arithmetic and held
in plain arrays — no per-symbol scan on read.

Test parity check `fse::check_tables` rewritten to enumerate
`next_state` over every (symbol, input_state) pair (was iterating
the per-symbol Vec). Same invariant, no live storage required.

Speed (small-4k-log-lines L2_dfast pure_rust): 83.7 µs → 47.4 µs
(−43%, +77% throughput). z000033 L3 unchanged (entropy build was
not the bottleneck there). Ratio preserved across the matrix.

501/501 lib tests, clippy clean.
---
 zstd/src/fse/fse_encoder.rs | 292 +++++++++++++++++-------------------
 zstd/src/fse/mod.rs         |  45 ++++--
 2 files changed, 175 insertions(+), 162 deletions(-)
diff --git a/zstd/src/fse/fse_encoder.rs b/zstd/src/fse/fse_encoder.rs
index 8d87c287..ed236716 100644
--- a/zstd/src/fse/fse_encoder.rs
+++ b/zstd/src/fse/fse_encoder.rs
@@ -1,5 +1,4 @@
 use crate::bit_io::BitWriter;
-use alloc::collections::BTreeSet;
 use alloc::vec::Vec;
 
 pub(crate) struct FSEEncoder<'output, V: AsMut<Vec<u8>>> {
@@ -225,11 +224,20 @@ impl FSETable {
     /// return shape so callers can store `State` directly without
     /// juggling lifetimes.
     pub(crate) fn start_state(&self, symbol: u8) -> State {
-        let states = &self.states[symbol as usize];
-        let slot = states
-            .start_state_slot
+        let index = self.states[symbol as usize]
+            .start_state
             .expect("symbol must be present in the FSE table");
-        states.states[slot]
+        // Callers consume only `index` (audited across the encoder + sequence
+        // emit paths). Donor `FSE_initCState2` likewise stores just the
+        // start state value; `num_bits` / `baseline` are properties of
+        // transitions, not of the initial state, so they have no
+        // meaningful values here and are zeroed.
+        State {
+            num_bits: 0,
+            baseline: 0,
+            last_index: 0,
+            index,
+        }
     }
 
     pub fn acc_log(&self) -> u8 {
@@ -242,11 +250,7 @@ impl FSETable {
     }
 
     pub(crate) fn max_num_bits_for_symbol(&self, symbol: u8) -> Option<u8> {
-        let states = &self.states[symbol as usize];
-        if states.probability == 0 {
-            return None;
-        }
-        states.states.iter().map(|state| state.num_bits).max()
+        self.states[symbol as usize].max_num_bits
     }
 
     /// Compute the exact serialized size (in bits) of the FSE table header,
@@ -359,18 +363,35 @@ impl FSETable {
     }
 }
 
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Default)]
 pub(super) struct SymbolStates {
-    /// Sorted by baseline to allow easy lookup using an index
-    pub(super) states: Vec<State>,
+    /// Donor-encoder start-state slot (`FSE_initCState2` result, in
+    /// `0..table_size`). `None` when `probability == 0`. Callers
+    /// consume only the `index` field of the [`State`] returned from
+    /// [`FSETable::start_state`] (see line-by-line audit on the
+    /// `encode` / `encode_interleaved` / compressed-block sequence
+    /// paths) — so the other [`State`] fields stay zeroed on the
+    /// returned value.
+    pub(super) start_state: Option<usize>,
+    /// Probability assigned to this symbol (`0` absent, `-1` less-than-one).
     pub(super) probability: i32,
-    start_state_slot: Option<usize>,
+    /// Max `num_bits` emitted by [`FSETable::next_state`] across all input
+    /// states for this symbol. `None` when `probability == 0`. Computed via
+    /// donor arithmetic at build time (`(2*table_size-1 + delta_nb_bits) >> 16`)
+    /// so [`FSETable::max_num_bits_for_symbol`] is a single array load
+    /// instead of a `Vec<State>` scan.
+    pub(super) max_num_bits: Option<u8>,
 }
 
 // SymbolStates::get (the old linear-scan next-state lookup) was
 // replaced by [`FSETable::next_state`]'s O(1) donor arithmetic in
-// #164. The Vec<State> storage remains for `start_state`,
-// `symbol_probability`, `max_num_bits_for_symbol`, and `write_table`.
+// #164. The legacy per-symbol `Vec<State>` storage was dropped in
+// #110: production no longer materializes any per-state vector; donor
+// `FSE_buildCTable_wksp` only ever stores `nextStateTable` (here
+// [`FSETable::state_table_flat`]) + `symbolTT` (here
+// [`FSETable::symbol_tt`]). Everything else — start state,
+// max-nb-bits, probability — is precomputed once per symbol via the
+// donor arithmetic and held in [`SymbolStates`].
 
 #[derive(Debug, Clone, Copy)]
 pub(crate) struct State {
@@ -626,181 +647,146 @@ fn donor_normalize_m2(
 }
 
 pub(super) fn build_table_from_probabilities(probs: &[i32], acc_log: u8) -> FSETable {
-    let mut states = core::array::from_fn::<SymbolStates, 256, _>(|_| SymbolStates {
-        states: Vec::new(),
-        probability: 0,
-        start_state_slot: None,
-    });
-    let mut symbol_positions = core::array::from_fn::<Vec<usize>, 256, _>(|_| Vec::new());
-
-    // distribute -1 symbols
-    let mut negative_idx = (1 << acc_log) - 1;
-    for (symbol, _prob) in probs
-        .iter()
-        .copied()
-        .enumerate()
-        .filter(|prob| prob.1 == -1)
-    {
-        states[symbol].states.push(State {
-            num_bits: acc_log,
-            baseline: 0,
-            last_index: (1 << acc_log) - 1,
-            index: negative_idx,
-        });
-        symbol_positions[symbol].push(negative_idx);
-        states[symbol].probability = -1;
-        negative_idx -= 1;
+    let table_size: usize = 1 << acc_log;
+    let mut symbol_states: [SymbolStates; 256] = core::array::from_fn(|_| SymbolStates::default());
+
+    // Donor `FSE_buildCTable_wksp` (lib/compress/fse_compress.c) — build
+    // `nextStateTable` (== `state_table_flat`) once via cumul + spread +
+    // sorted-by-symbol sweep, without ever materializing per-symbol
+    // `Vec<State>`. Previous implementation paid an O(num_symbols ×
+    // table_size) enumeration with a BTreeSet dedup per symbol —
+    // ~18% self time on small-input encode profiles. Drop it; donor
+    // only ever needs symbolTT + nextStateTable, and we precompute
+    // `start_state` / `max_num_bits` for callers via the same
+    // arithmetic [`FSETable::next_state`] uses on the hot path.
+    //
+    // Phase 1 — distribute `-1` (low-probability) symbols at the top
+    // of the table; bump the high-threshold cursor down. Build the
+    // `cumul` prefix-sum table that maps each symbol to its first
+    // `nextStateTable` slot in sorted-by-symbol layout.
+    let mut table_symbol = alloc::vec![0u8; table_size];
+    let mut high_threshold = (table_size - 1) as isize;
+    let mut cumul = [0u16; 257];
+    for (symbol, &prob) in probs.iter().enumerate() {
+        let bump = match prob {
+            -1 => {
+                table_symbol[high_threshold as usize] = symbol as u8;
+                high_threshold -= 1;
+                1
+            }
+            p if p > 0 => p as u16,
+            _ => 0,
+        };
+        cumul[symbol + 1] = cumul[symbol] + bump;
     }
 
-    // distribute other symbols
-
-    // Setup all needed states per symbol with their respective index
-    let mut idx = 0;
-    for (symbol, prob) in probs.iter().copied().enumerate() {
+    // Phase 2 — spread positive-probability symbols across the
+    // remaining slots, donor `step`-walk with low-prob area skip.
+    let step = (table_size >> 1) + (table_size >> 3) + 3;
+    let table_mask = table_size - 1;
+    let mut position: usize = 0;
+    for (symbol, &prob) in probs.iter().enumerate() {
         if prob <= 0 {
             continue;
         }
-        states[symbol].probability = prob;
-        let states = &mut states[symbol].states;
-        let positions = &mut symbol_positions[symbol];
         for _ in 0..prob {
-            states.push(State {
-                num_bits: 0,
-                baseline: 0,
-                last_index: 0,
-                index: idx,
-            });
-            positions.push(idx);
-
-            idx = next_position(idx, 1 << acc_log);
-            while idx > negative_idx {
-                idx = next_position(idx, 1 << acc_log);
+            table_symbol[position] = symbol as u8;
+            position = (position + step) & table_mask;
+            while (position as isize) > high_threshold {
+                position = (position + step) & table_mask;
             }
         }
-        assert_eq!(states.len(), prob as usize);
-    }
-
-    // Materialize the C `stateTable`: symbols are grouped by symbol and, within
-    // each symbol, ordered by table position.
-    let mut state_table = Vec::with_capacity(1 << acc_log);
-    for positions in &mut symbol_positions {
-        positions.sort_unstable();
-        state_table.extend(positions.iter().copied());
     }
+    debug_assert_eq!(
+        position, 0,
+        "FSE spread must cycle exactly once through tableSize positions"
+    );
 
-    // Donor `FSE_encodeSymbol` driver tables (`fse_compress.c`). Built
-    // alongside the legacy `Vec<State>` storage. The flat
-    // `state_table_flat` is the donor `nextStateTable` (u16 entries
-    // for direct memory parity); `symbol_tt[s]` holds the per-symbol
-    // `{delta_nb_bits, delta_find_state}`. Once populated these drive
-    // the O(1) `FSETable::next_state` arithmetic; the `Vec<State>` per
-    // symbol stays alive for `start_state` / `symbol_probability` /
-    // `max_num_bits_for_symbol` / `write_table` and the existing test
-    // suite, but is no longer touched on the encode hot path. See #164.
-    let mut state_table_flat: alloc::vec::Vec<u16> = alloc::vec::Vec::with_capacity(1 << acc_log);
-    for &slot in &state_table {
-        // `slot` originates from `idx` values bounded by
-        // `(1 << acc_log) - 1` (see the negative_idx / next_position
-        // distribution loops above), so `u16` is wide enough for
-        // every supported `acc_log` (max 12 → table_size 4096).
-        state_table_flat.push(slot as u16);
+    // Phase 3 — emit `nextStateTable` ordered by (symbol, slot). For
+    // each table slot `u`, look up its symbol and write `table_size + u`
+    // into the symbol's running cumul cursor. Donor stores the value
+    // pre-shifted by `table_size` so `FSE_encodeSymbol` can index
+    // directly without an extra add on the hot path.
+    let mut state_table_flat: alloc::vec::Vec<u16> = alloc::vec![0u16; table_size];
+    let mut cursor = cumul;
+    for (u, &symbol_at_slot) in table_symbol.iter().enumerate() {
+        let s = symbol_at_slot as usize;
+        // The Rust convention here keeps `state_table_flat[i]` as the
+        // raw slot (`0..table_size`); donor stores `table_size + u`
+        // and subtracts on read. `next_state` arithmetic ([`FSETable::next_state`])
+        // matches the Rust convention — store the slot directly.
+        state_table_flat[cursor[s] as usize] = u as u16;
+        cursor[s] += 1;
     }
     let state_table_flat: alloc::boxed::Box<[u16]> = state_table_flat.into_boxed_slice();
-    let mut symbol_tt = [SymbolTT::default(); 256];
 
-    // Build encoder transitions directly from C `FSE_encodeSymbol()` formulas.
-    let mut symbol_transform_total = 0usize;
-    for (symbol, probability) in probs.iter().copied().enumerate() {
-        if probability == 0 {
+    // Phase 4 — `symbolTT[]` (delta_nb_bits, delta_find_state) plus
+    // precomputed `start_state` and `max_num_bits` per symbol. All via
+    // donor 16.16 fixed-point arithmetic; no per-state enumeration.
+    let mut symbol_tt = [SymbolTT::default(); 256];
+    let mut total: usize = 0;
+    for (symbol, &prob) in probs.iter().enumerate() {
+        symbol_states[symbol].probability = prob;
+        if prob == 0 {
+            // Donor fills `symbolTT` for prob==0 too, so `FSE_getMaxNbBits`
+            // still works (returns `acc_log + 1` for absent symbols).
+            // We don't expose that path, but mirror the value for parity.
+            symbol_tt[symbol] = SymbolTT {
+                delta_nb_bits: ((acc_log as u32 + 1) << 16).saturating_sub(1u32 << acc_log),
+                delta_find_state: 0,
+            };
             continue;
         }
-        let probability_abs = probability.unsigned_abs() as usize;
-        // Donor 16.16 fixed-point arithmetic — performed in `u32` so a
-        // 16-bit `usize` target (AVR / MSP430 / no-atomic Cortex-M0)
-        // can't silently overflow the `<<16` shift. Donor
-        // `fse_compress.c` keeps the same width.
-        let (delta_nb_bits, delta_find_state): (u32, isize) = match probability {
+        let (delta_nb_bits, delta_find_state): (u32, isize) = match prob {
             -1 | 1 => (
                 ((acc_log as u32) << 16).saturating_sub(1u32 << acc_log),
-                symbol_transform_total as isize - 1,
+                total as isize - 1,
             ),
-            probability if probability > 1 => {
-                let probability = probability as u32;
-                let max_bits_out = (acc_log as u32) - (probability - 1).ilog2();
-                let min_state_plus = probability << max_bits_out;
+            p if p > 1 => {
+                let p_u32 = p as u32;
+                let max_bits_out = (acc_log as u32) - (p_u32 - 1).ilog2();
+                let min_state_plus = p_u32 << max_bits_out;
                 (
                     (max_bits_out << 16).saturating_sub(min_state_plus),
-                    symbol_transform_total as isize - probability as isize,
+                    total as isize - p_u32 as isize,
                 )
             }
-            _ => unreachable!(),
+            _ => unreachable!("probability is one of {{-1, 1+}} after the prob==0 gate above"),
         };
         symbol_tt[symbol] = SymbolTT {
             delta_nb_bits,
             delta_find_state,
         };
-        let state = &mut states[symbol];
+        total += prob.unsigned_abs() as usize;
+
+        // Donor `FSE_initCState2`: start_state =
+        //   stateTable[(((nbBitsOut<<16) - deltaNbBits) >> nbBitsOut) + deltaFindState]
+        // where `nbBitsOut = (deltaNbBits + (1<<15)) >> 16`.
         let init_nb_bits_out = (delta_nb_bits + (1 << 15)) >> 16;
         let init_value = (init_nb_bits_out << 16).saturating_sub(delta_nb_bits);
         let state_table_index = (init_value >> init_nb_bits_out) as isize + delta_find_state;
-        let start_index = state_table[state_table_index as usize];
-        symbol_transform_total += probability_abs;
-        state.states = Vec::with_capacity(probability_abs.max(1));
-        // Dedup via `BTreeSet<(baseline, last_index, index, num_bits)>` instead
-        // of the previous `state.states.iter().any(...)` linear scan. With
-        // `acc_log` up to 12 (table_size = 4096) the inner scan was O(N²) per
-        // symbol and dominated encoder FSE table construction (~6% exclusive
-        // on level22 profile). BTreeSet keeps insertion order on the
-        // accepted-entries Vec untouched — important because the downstream
-        // `start_state_slot = states.iter().position(|e| e.index == start_index)`
-        // lookup depends on which duplicate-keyed entry shows up first.
-        let mut seen: BTreeSet<(usize, usize, usize, u8)> = BTreeSet::new();
-        for current_index in 0..(1usize << acc_log) {
-            // Same 16.16 fixed-point arithmetic as `next_state` —
-            // keep in `u32` for 16-bit-target safety, then cast back
-            // to `usize` at indexing sites.
-            let current_value = (1u32 << acc_log) + (current_index as u32);
-            let num_bits = ((current_value + delta_nb_bits) >> 16) as usize;
-            let next_state_idx = (current_value >> num_bits) as isize + delta_find_state;
-            let next_index = state_table[next_state_idx as usize];
-            let mask = (1usize << num_bits) - 1;
-            let baseline = current_index & !mask;
-            let last_index = baseline + mask;
-            if !seen.insert((baseline, last_index, next_index, num_bits as u8)) {
-                continue;
-            }
-            state.states.push(State {
-                num_bits: num_bits as u8,
-                baseline,
-                last_index,
-                index: next_index,
-            });
-        }
-
-        // For encoding we use the states ordered by the indexes they target
-        state.states.sort_by_key(|l| l.baseline);
-        state.start_state_slot = state
-            .states
-            .iter()
-            .position(|entry| entry.index == start_index);
+        let start_index = state_table_flat[state_table_index as usize] as usize;
+
+        // Max nb_bits across all input states `0..table_size`. Donor
+        // `next_state` arithmetic: `nb_bits = (value + delta_nb_bits) >> 16`
+        // with `value = table_size + idx`, `idx ∈ 0..table_size`. The
+        // maximum is at `idx = table_size - 1`. Single op vs the prior
+        // `Vec<State>::iter().map(|s|s.num_bits).max()` linear scan.
+        let max_value = (2 * table_size as u32 - 1) + delta_nb_bits;
+        let max_num_bits = (max_value >> 16) as u8;
+
+        symbol_states[symbol].start_state = Some(start_index);
+        symbol_states[symbol].max_num_bits = Some(max_num_bits);
     }
 
     FSETable {
-        table_size: 1 << acc_log,
-        states,
+        table_size,
+        states: symbol_states,
         state_table_flat,
         symbol_tt,
     }
 }
 
-/// Calculate the position of the next entry of the table given the current
-/// position and size of the table.
-fn next_position(mut p: usize, table_size: usize) -> usize {
-    p += (table_size >> 1) + (table_size >> 3) + 3;
-    p &= table_size - 1;
-    p
-}
-
 const ML_DIST: &[i32] = &[
     1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1,
diff --git a/zstd/src/fse/mod.rs b/zstd/src/fse/mod.rs
index 55013884..c5a302f4 100644
--- a/zstd/src/fse/mod.rs
+++ b/zstd/src/fse/mod.rs
@@ -60,15 +60,42 @@ fn tables_equal() {
 
 #[cfg(any(test, feature = "fuzz_exports"))]
 fn check_tables(dec_table: &fse_decoder::FSETable, enc_table: &fse_encoder::FSETable) {
-    for (idx, dec_state) in dec_table.decode.iter().enumerate() {
-        let enc_states = &enc_table.states[dec_state.symbol as usize];
-        let enc_state = enc_states
-            .states
-            .iter()
-            .find(|state| state.index == idx)
-            .unwrap();
-        assert_eq!(enc_state.baseline, dec_state.new_state as usize);
-        assert_eq!(enc_state.num_bits, dec_state.num_bits);
+    // Per-symbol `Vec<State>` storage was dropped in #110 — the encoder now
+    // holds only `nextStateTable` (donor parity) and per-symbol
+    // `symbolTT`. Recover the (symbol, input_state) → next_state mapping
+    // by enumerating `next_state` over every input state value: each
+    // decode-table slot must be reached by exactly one (symbol, prev)
+    // pair whose `next.baseline` / `next.num_bits` match the decoder.
+    let table_size = enc_table.table_size;
+    let mut hit = alloc::vec![false; table_size];
+    for symbol_u in 0..=255u16 {
+        let symbol = symbol_u as u8;
+        if enc_table.symbol_probability(symbol) == 0 {
+            continue;
+        }
+        for input_state in 0..table_size {
+            let next = enc_table.next_state(symbol, input_state);
+            let dec_state = &dec_table.decode[next.index];
+            if dec_state.symbol == symbol {
+                assert_eq!(
+                    next.baseline, dec_state.new_state as usize,
+                    "decode/encode baseline mismatch at slot {} (symbol {symbol})",
+                    next.index
+                );
+                assert_eq!(
+                    next.num_bits, dec_state.num_bits,
+                    "decode/encode num_bits mismatch at slot {} (symbol {symbol})",
+                    next.index
+                );
+                hit[next.index] = true;
+            }
+        }
+    }
+    for (idx, was_hit) in hit.iter().enumerate() {
+        assert!(
+            *was_hit,
+            "decoder slot {idx} not reached by any (symbol, prev_state) encoder transition"
+        );
     }
 }
 

From 9b1b778845315faef5427d873119e156b4afb143 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Mon, 18 May 2026 10:42:31 +0300
Subject: [PATCH 2/4] test(fse): tighten decoder/encoder parity check; clarify
 Phase 3 doc comment

- `fse::check_tables`: assert every `(symbol, input_state)` transition
  lands on a decoder slot OWNED by that symbol (was silently skipping
  cross-symbol transitions, which would mask a routing bug).
- `build_table_from_probabilities` Phase 3 doc: rewrite to describe
  the raw-slot Rust convention without misclaiming the code stores
  `table_size + u` (donor representation); cross-reference
  `FSETable::next_state` arithmetic that depends on the raw-slot form.
---
 zstd/src/fse/fse_encoder.rs | 14 +++++++-----
 zstd/src/fse/mod.rs         | 45 +++++++++++++++++++++++--------------
 2 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/zstd/src/fse/fse_encoder.rs b/zstd/src/fse/fse_encoder.rs
index ed236716..ee3e9b65 100644
--- a/zstd/src/fse/fse_encoder.rs
+++ b/zstd/src/fse/fse_encoder.rs
@@ -702,11 +702,15 @@ pub(super) fn build_table_from_probabilities(probs: &[i32], acc_log: u8) -> FSET
         "FSE spread must cycle exactly once through tableSize positions"
     );
 
-    // Phase 3 — emit `nextStateTable` ordered by (symbol, slot). For
-    // each table slot `u`, look up its symbol and write `table_size + u`
-    // into the symbol's running cumul cursor. Donor stores the value
-    // pre-shifted by `table_size` so `FSE_encodeSymbol` can index
-    // directly without an extra add on the hot path.
+    // Phase 3 — emit `state_table_flat` (donor `nextStateTable`)
+    // ordered by `(symbol, slot)`. Walk every table slot `u`, look up
+    // its owning symbol via `table_symbol[u]`, and write the raw slot
+    // `u` into that symbol's running cumul cursor. The Rust convention
+    // stores raw slots (`0..table_size`); donor stores `table_size + u`
+    // pre-shifted and recovers `u` on read by subtracting `table_size`.
+    // Both representations encode the same `(symbol → next_slot)`
+    // mapping; [`FSETable::next_state`] is written against the raw-slot
+    // convention so the pre-shift is intentionally skipped here.
     let mut state_table_flat: alloc::vec::Vec<u16> = alloc::vec![0u16; table_size];
     let mut cursor = cumul;
     for (u, &symbol_at_slot) in table_symbol.iter().enumerate() {
diff --git a/zstd/src/fse/mod.rs b/zstd/src/fse/mod.rs
index c5a302f4..873665c9 100644
--- a/zstd/src/fse/mod.rs
+++ b/zstd/src/fse/mod.rs
@@ -62,10 +62,16 @@ fn tables_equal() {
 fn check_tables(dec_table: &fse_decoder::FSETable, enc_table: &fse_encoder::FSETable) {
     // Per-symbol `Vec<State>` storage was dropped in #110 — the encoder now
     // holds only `nextStateTable` (donor parity) and per-symbol
-    // `symbolTT`. Recover the (symbol, input_state) → next_state mapping
-    // by enumerating `next_state` over every input state value: each
-    // decode-table slot must be reached by exactly one (symbol, prev)
-    // pair whose `next.baseline` / `next.num_bits` match the decoder.
+    // `symbolTT`. Verify decoder/encoder parity by enumerating every
+    // valid `(symbol, input_state)` transition and asserting that:
+    //   (a) `next.index` lands on a decoder slot owned by `symbol`
+    //       (the transition reaches a state that decodes back to the
+    //       same symbol — without this, a broken encoder routing
+    //       symbol A into symbol B's slot would slip through), and
+    //   (b) `baseline` / `num_bits` from the encoder match what the
+    //       decoder records for that slot.
+    // After enumerating, every decoder slot must have been hit at
+    // least once (full coverage).
     let table_size = enc_table.table_size;
     let mut hit = alloc::vec![false; table_size];
     for symbol_u in 0..=255u16 {
@@ -76,19 +82,24 @@ fn check_tables(dec_table: &fse_decoder::FSETable, enc_table: &fse_encoder::FSET
         for input_state in 0..table_size {
             let next = enc_table.next_state(symbol, input_state);
             let dec_state = &dec_table.decode[next.index];
-            if dec_state.symbol == symbol {
-                assert_eq!(
-                    next.baseline, dec_state.new_state as usize,
-                    "decode/encode baseline mismatch at slot {} (symbol {symbol})",
-                    next.index
-                );
-                assert_eq!(
-                    next.num_bits, dec_state.num_bits,
-                    "decode/encode num_bits mismatch at slot {} (symbol {symbol})",
-                    next.index
-                );
-                hit[next.index] = true;
-            }
+            assert_eq!(
+                dec_state.symbol, symbol,
+                "encoder transition for symbol {symbol} from state {input_state} \
+                 lands on decoder slot {} which decodes to symbol {} \
+                 (encoder/decoder routing mismatch)",
+                next.index, dec_state.symbol
+            );
+            assert_eq!(
+                next.baseline, dec_state.new_state as usize,
+                "decode/encode baseline mismatch at slot {} (symbol {symbol})",
+                next.index
+            );
+            assert_eq!(
+                next.num_bits, dec_state.num_bits,
+                "decode/encode num_bits mismatch at slot {} (symbol {symbol})",
+                next.index
+            );
+            hit[next.index] = true;
         }
     }
     for (idx, was_hit) in hit.iter().enumerate() {

From 594e769946f2c82377517a70efe1bd3316ec3715 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Mon, 18 May 2026 10:52:22 +0300
Subject: [PATCH 3/4] fix(fse): widen cumul prefix-sum to u32 for acc_log=16
 round-trip
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`fse_decoder::FSETable::to_encoder_table` can round-trip through
`build_table_from_probabilities` with `accuracy_log` up to
`ENTRY_MAX_ACCURACY_LOG = 16`, where the prefix sum reaches
`table_size = 65 536` — one past `u16::MAX`. The Phase-1 `cumul`
array (and the Phase-3 `cursor` snapshot) must be `u32` so the
cumulative count is representable for every valid `acc_log`. Slot
indices written into `state_table_flat` stay in `0..table_size-1`
(≤ `u16::MAX`) and remain `u16`.
---
 zstd/src/fse/fse_encoder.rs | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/zstd/src/fse/fse_encoder.rs b/zstd/src/fse/fse_encoder.rs
index ee3e9b65..8b88b12e 100644
--- a/zstd/src/fse/fse_encoder.rs
+++ b/zstd/src/fse/fse_encoder.rs
@@ -666,15 +666,24 @@ pub(super) fn build_table_from_probabilities(probs: &[i32], acc_log: u8) -> FSET
     // `nextStateTable` slot in sorted-by-symbol layout.
     let mut table_symbol = alloc::vec![0u8; table_size];
     let mut high_threshold = (table_size - 1) as isize;
-    let mut cumul = [0u16; 257];
+    // `cumul` / running prefix-sum holds slot counts up to `table_size`.
+    // Decoder accepts `accuracy_log` up to `ENTRY_MAX_ACCURACY_LOG = 16`
+    // and `fse_decoder::FSETable::to_encoder_table` round-trips through
+    // this builder; at `acc_log == 16` the prefix sum reaches 65 536
+    // which overflows `u16` (max 65 535). Keep `cumul` / `cursor` at
+    // `u32` so the cumulative count is representable for every valid
+    // `acc_log`. Slot indices written into `state_table_flat` stay in
+    // `0..table_size-1` (≤ u16::MAX) and remain `u16` — only the
+    // running cursor needs the wider type.
+    let mut cumul = [0u32; 257];
     for (symbol, &prob) in probs.iter().enumerate() {
-        let bump = match prob {
+        let bump: u32 = match prob {
             -1 => {
                 table_symbol[high_threshold as usize] = symbol as u8;
                 high_threshold -= 1;
                 1
             }
-            p if p > 0 => p as u16,
+            p if p > 0 => p as u32,
             _ => 0,
         };
         cumul[symbol + 1] = cumul[symbol] + bump;

From a8c5e93e697bc7fe3e790f45a986ebf657a834f4 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov <mail@polaz.com>
Date: Mon, 18 May 2026 11:02:37 +0300
Subject: [PATCH 4/4] fix(fse): debug_assert non-negative start_state index
 before usize cast
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Donor `FSE_initCState2` arithmetic guarantees `state_table_index ≥ 0`
by construction, but the `isize → usize` cast at the indexing site
would silently wrap on regression. Add a `debug_assert!` so a future
arithmetic bug surfaces in dev builds instead of an out-of-bounds
panic or wrong slot read.
---
 zstd/src/fse/fse_encoder.rs | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/zstd/src/fse/fse_encoder.rs b/zstd/src/fse/fse_encoder.rs
index 8b88b12e..0fb21f60 100644
--- a/zstd/src/fse/fse_encoder.rs
+++ b/zstd/src/fse/fse_encoder.rs
@@ -778,6 +778,17 @@ pub(super) fn build_table_from_probabilities(probs: &[i32], acc_log: u8) -> FSET
         let init_nb_bits_out = (delta_nb_bits + (1 << 15)) >> 16;
         let init_value = (init_nb_bits_out << 16).saturating_sub(delta_nb_bits);
         let state_table_index = (init_value >> init_nb_bits_out) as isize + delta_find_state;
+        // Donor `FSE_initCState2` guarantees this index is in
+        // `0..table_size` by construction (`delta_find_state` is bounded
+        // by `total - probability`, and `(value >> nb_bits_out)` is
+        // bounded by `2 * probability - 1`). The `debug_assert` makes
+        // the invariant explicit so a future regression in the donor
+        // arithmetic surfaces in dev builds before the silent
+        // `as usize` wraparound.
+        debug_assert!(
+            state_table_index >= 0,
+            "FSE start_state index must be non-negative (got {state_table_index} for symbol {symbol})"
+        );
         let start_index = state_table_flat[state_table_index as usize] as usize;
 
         // Max nb_bits across all input states `0..table_size`. Donor