leanEthereum · Barnadrot · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026
diff --git a/crates/backend/air/src/lib.rs b/crates/backend/air/src/lib.rs
@@ -14,6 +14,19 @@ pub trait Air: Send + Sync + 'static {
 
     fn degree_air(&self) -> usize;
 
+    /// True max constraint degree along a fold line. Defaults to `degree_air()`.
+    /// Override when `degree_air` overstates the true degree: the prover skips
+    /// the redundant top eval pass. Wire format unchanged (sized by `degree_air`).
+    fn degree_z(&self) -> usize {
+        self.degree_air()
+    }
+
+    /// Whether the C2 per-pair constraint cache pays for itself. Both paths
+    /// are bit-identical — this flag only selects the faster one per table.
+    fn c2_table_profitable(&self) -> bool {
+        true
+    }
+
     fn n_columns(&self) -> usize;
 
     fn n_constraints(&self) -> usize;
@@ -24,6 +37,13 @@ pub trait Air: Send + Sync + 'static {
     fn n_shift_columns(&self) -> usize;
 
     fn eval<AB: AirBuilder>(&self, builder: &mut AB, extra_data: &Self::ExtraData);
+
+    /// Emit only bus constraints (for the C2 seed round). On valid rows all
+    /// non-bus constraints vanish, so the bus-only accumulator equals the full
+    /// one. Default = full eval (bit-identical, just slower).
+    fn eval_bus_only<AB: AirBuilder>(&self, builder: &mut AB, extra_data: &Self::ExtraData) {
+        self.eval(builder, extra_data);
+    }
 }
 
 pub trait AirBuilder: Sized {

diff --git a/crates/backend/air/src/symbolic.rs b/crates/backend/air/src/symbolic.rs
@@ -259,6 +259,7 @@ struct SymbolicAirBuilder<F: Field> {
     constraints: Vec<SymbolicExpression<F>>,
     bus_multiplicity_value: Option<SymbolicExpression<F>>,
     bus_data_values: Option<Vec<SymbolicExpression<F>>>,
+    extra_declared_groups: Vec<Vec<SymbolicExpression<F>>>,
 }
 
 impl<F: Field> SymbolicAirBuilder<F> {
@@ -276,6 +277,7 @@ impl<F: Field> SymbolicAirBuilder<F> {
             constraints: Vec::new(),
             bus_multiplicity_value: None,
             bus_data_values: None,
+            extra_declared_groups: Vec::new(),
         }
     }
 
@@ -309,9 +311,10 @@ impl<F: Field> AirBuilder for SymbolicAirBuilder<F> {
         if self.bus_multiplicity_value.is_none() {
             assert_eq!(values.len(), 1);
             self.bus_multiplicity_value = Some(values[0]);
-        } else {
-            assert!(self.bus_data_values.is_none());
+        } else if self.bus_data_values.is_none() {
             self.bus_data_values = Some(values.to_vec());
+        } else {
+            self.extra_declared_groups.push(values.to_vec());
         }
     }
 }
@@ -320,6 +323,7 @@ pub type SymbolicAirData<F> = (
     Vec<SymbolicExpression<F>>,
     SymbolicExpression<F>,
     Vec<SymbolicExpression<F>>,
+    Vec<Vec<SymbolicExpression<F>>>,
 );
 
 pub fn get_symbolic_constraints_and_bus_data_values<F: Field, A: Air>(air: &A) -> SymbolicAirData<F>
@@ -332,5 +336,6 @@ where
         builder.constraints(),
         builder.bus_multiplicity_value.unwrap(),
         builder.bus_data_values.unwrap(),
+        builder.extra_declared_groups,
     )
 }
diff --git a/crates/backend/field/src/field.rs b/crates/backend/field/src/field.rs
@@ -384,6 +384,66 @@ pub trait PrimeCharacteristicRing:
     fn zero_vec(len: usize) -> Vec<Self> {
         vec![Self::ZERO; len]
     }
+
+    /// Deferred multiply-accumulate protocol: unreduced product of two ring elements.
+    ///
+    /// The four slots of the `[Self; 4]` accumulator values used by `unreduced_mul` /
+    /// `lazy_acc_*` are an implementation-defined unreduced representation; only the
+    /// protocol methods of the SAME type may create or inspect them. The default
+    /// implementations are eager (slot 0 carries a canonical running value, slots 1-3
+    /// stay `ZERO`), so for types that do not override the protocol it is exactly
+    /// equivalent to `acc + a * b` / `acc - a * b` chains — pure sugar, no behavioral
+    /// change. Overrides (e.g. 64-bit prime fields with widening multipliers) may keep
+    /// products unreduced across many accumulation steps and reduce once in
+    /// [`Self::lazy_acc_finish`].
+    ///
+    /// # Contract
+    /// - At most `2^20` accumulation calls (each adding/subtracting one
+    ///   [`Self::unreduced_mul`] product) between [`Self::lazy_acc_zero`] and
+    ///   [`Self::lazy_acc_finish`]; callers chunk longer loops.
+    /// - `n_sub` passed to [`Self::lazy_acc_finish`] must equal the number of
+    ///   [`Self::lazy_acc_sub`] calls since [`Self::lazy_acc_zero`]; implementations
+    ///   whose subtraction is exact ignore it.
+    /// - Accumulator values must not be mixed across types or fed to ordinary
+    ///   ring arithmetic.
+    #[must_use]
+    #[inline]
+    fn unreduced_mul(a: Self, b: Self) -> [Self; 4] {
+        [a * b, Self::ZERO, Self::ZERO, Self::ZERO]
+    }
+
+    /// Fresh accumulator representing zero. See [`Self::unreduced_mul`] for the protocol.
+    #[must_use]
+    #[inline]
+    fn lazy_acc_zero() -> [Self; 4] {
+        [Self::ZERO; 4]
+    }
+
+    /// Accumulate `+t` (a value produced by [`Self::unreduced_mul`]) into `acc`.
+    /// See [`Self::unreduced_mul`] for the protocol.
+    #[must_use]
+    #[inline]
+    fn lazy_acc_add(acc: [Self; 4], t: [Self; 4]) -> [Self; 4] {
+        [acc[0] + t[0], Self::ZERO, Self::ZERO, Self::ZERO]
+    }
+
+    /// Accumulate `-t` (a value produced by [`Self::unreduced_mul`]) into `acc`.
+    /// See [`Self::unreduced_mul`] for the protocol.
+    #[must_use]
+    #[inline]
+    fn lazy_acc_sub(acc: [Self; 4], t: [Self; 4]) -> [Self; 4] {
+        [acc[0] - t[0], Self::ZERO, Self::ZERO, Self::ZERO]
+    }
+
+    /// Collapse an accumulator to a canonical ring element. `n_sub` must equal the
+    /// number of [`Self::lazy_acc_sub`] calls on this accumulator (protocol docs at
+    /// [`Self::unreduced_mul`]).
+    #[must_use]
+    #[inline]
+    fn lazy_acc_finish(acc: [Self; 4], n_sub: u64) -> Self {
+        let _ = n_sub;
+        acc[0]
+    }
 }
 
 /// A vector space `V` over `F` with a fixed basis. Fixing the basis allows elements of `V` to be

diff --git a/crates/backend/goldilocks/src/goldilocks.rs b/crates/backend/goldilocks/src/goldilocks.rs
@@ -285,6 +285,74 @@ impl PrimeCharacteristicRing for Goldilocks {
         // SAFETY: `#[repr(transparent)]` means `Goldilocks` and `u64` share layout.
         unsafe { flatten_to_base(vec![0u64; len]) }
     }
+
+    // Deferred multiply-accumulate protocol.
+    //
+    // Accumulator layout: 192-bit little-endian limbs in slots 0..3:
+    //   slot 0 = l0 (bits 0..64), slot 1 = l1 (bits 64..128), slot 2 = l2 (bits 128..192),
+    //   slot 3 unused. Seeded with OFFSET192 = P * 2^126 (a multiple of P, ~2^190) so
+    //   mixed-sign accumulation never wraps the 192-bit domain:
+    //   each term is < 2^128, the contract caps terms at 5 * 2^20 < 2^23, and
+    //   2^23 * 2^128 = 2^151 << 2^190 (headroom above: 2^192 - 2^190 = 3 * 2^190).
+    //   The limb values are raw u64 bit patterns wrapped in `Goldilocks` (same trick as
+    //   `zero_vec`'s transparent layout); they are never used as field elements.
+    //
+    // Same family as the `dot_product` override above (goldilocks.rs:248) and the
+    // poseidon1.rs:85-113 scalar MDS u128 accumulation (read-only precedent).
+
+    #[inline]
+    fn unreduced_mul(a: Self, b: Self) -> [Self; 4] {
+        let wide = (a.value as u128) * (b.value as u128);
+        [
+            Self::new(wide as u64),
+            Self::new((wide >> 64) as u64),
+            Self::ZERO,
+            Self::ZERO,
+        ]
+    }
+
+    #[inline]
+    fn lazy_acc_zero() -> [Self; 4] {
+        // OFFSET192 = P << 126 = (P >> 2) * 2^128 + (P & 0b11 = 1) * 2^126:
+        //   l0 = 0, l1 = 1 << 62, l2 = P >> 2.
+        [Self::ZERO, Self::new(1u64 << 62), Self::new(P >> 2), Self::ZERO]
+    }
+
+    #[inline]
+    fn lazy_acc_add(acc: [Self; 4], t: [Self; 4]) -> [Self; 4] {
+        let (l0, c0) = acc[0].value.overflowing_add(t[0].value);
+        let (l1a, c1a) = acc[1].value.overflowing_add(t[1].value);
+        let (l1, c1b) = l1a.overflowing_add(c0 as u64);
+        let l2 = acc[2].value.wrapping_add(c1a as u64).wrapping_add(c1b as u64);
+        [Self::new(l0), Self::new(l1), Self::new(l2), Self::ZERO]
+    }
+
+    #[inline]
+    fn lazy_acc_sub(acc: [Self; 4], t: [Self; 4]) -> [Self; 4] {
+        let (l0, b0) = acc[0].value.overflowing_sub(t[0].value);
+        let (l1a, b1a) = acc[1].value.overflowing_sub(t[1].value);
+        let (l1, b1b) = l1a.overflowing_sub(b0 as u64);
+        let l2 = acc[2].value.wrapping_sub(b1a as u64).wrapping_sub(b1b as u64);
+        [Self::new(l0), Self::new(l1), Self::new(l2), Self::ZERO]
+    }
+
+    #[inline]
+    fn lazy_acc_finish(acc: [Self; 4], n_sub: u64) -> Self {
+        // Subtraction above is exact (borrow chains); no NOT-deficit to repay.
+        let _ = n_sub;
+        let l0 = acc[0].value;
+        let l1 = acc[1].value;
+        let l2 = acc[2].value;
+        // l2 must stay below 2^63 (deferred reduction invariant).
+        debug_assert!(l2 < (1u64 << 63), "lazy accumulator l2 overflow");
+        // V = l2*2^128 + l1*2^64 + l0 ≡ l0 + l1*eps - l2*2^32 (mod P)
+        // (2^64 ≡ eps, 2^128 ≡ -2^32). Add P*2^33 (multiple of P, ~2^97 > l2*2^32 < 2^95)
+        // to keep the u128 arithmetic borrow-free, then one reduce128:
+        // r < 2^64 + 2^96 + 2^97 < 2^98.
+        const P_SHL33: u128 = (P as u128) << 33;
+        let r = (l0 as u128) + (l1 as u128) * (Self::NEG_ORDER as u128) + P_SHL33 - (l2 as u128) * (1u128 << 32);
+        reduce128(r)
+    }
 }
 
 /// `p - 1 = 2^32 * 3 * 5 * 17 * 257 * 65537`. The smallest `D` with `gcd(p - 1, D) = 1` is 7.

diff --git a/crates/backend/goldilocks/src/x86_64_avx512/packing.rs b/crates/backend/goldilocks/src/x86_64_avx512/packing.rs
@@ -126,6 +126,136 @@ impl PrimeCharacteristicRing for PackedGoldilocksAVX512 {
         // SAFETY: this is a repr(transparent) wrapper around an array.
         unsafe { reconstitute_from_base(Goldilocks::zero_vec(len * WIDTH)) }
     }
+
+    // Deferred multiply-accumulate protocol.
+    //
+    // Accumulator layout: [L, H, W, K], one zmm each:
+    //   L = wrapping sum of term lows;  K = count of L-wraps (each worth 2^64)
+    //   H = wrapping sum of term highs; W = count of H-wraps (each worth 2^128)
+    // so the represented value is V = L + (H + K)*2^64 + W*2^128.
+    //
+    // Subtraction is NOT-based: adding (~t_hi, ~t_lo) contributes
+    // 2^128 - 1 - t = -t - (2^32 + 1) (mod P), since 2^128 = -2^32. Each sub
+    // therefore leaves a constant deficit of (2^32 + 1), repaid once at finish
+    // via `n_sub` (T1 protocol contract).
+    //
+    // The separate K counter is mandatory: folding the lo-carry into t_hi via a
+    // masked +1 is only safe for t_hi <= 2^64 - 2 (true for raw products) but NOT
+    // for NOT-ed terms, where ~t_hi = 2^64 - 1 whenever t_hi = 0 — and zero
+    // products are reachable in real eval tables
+    // cleverness).
+    //
+    // Finish: merge K into H (carry into W), then one folding step
+    //   V = L + H'_lo*eps - (H'_hi + W*2^32)   (mod P)
+    // using 2^64 = eps, 2^96 = -1, 2^128 = -2^32, with the same
+    // sub_no_double/add_no_double tail as `reduce128` below. Bounds:
+    // the T1 contract caps accumulation at 5*2^20 terms, so W, K < 2^23 and
+    // s = H'_hi + W*2^32 < 2^32 + 2^55 < P (tail precondition).
+
+    #[inline]
+    fn unreduced_mul(a: Self, b: Self) -> [Self; 4] {
+        let (hi, lo) = mul64_64(a.to_vector(), b.to_vector());
+        [Self::from_vector(lo), Self::from_vector(hi), Self::ZERO, Self::ZERO]
+    }
+
+    #[inline]
+    fn lazy_acc_zero() -> [Self; 4] {
+        [Self::ZERO; 4]
+    }
+
+    #[inline]
+    fn lazy_acc_add(acc: [Self; 4], t: [Self; 4]) -> [Self; 4] {
+        unsafe {
+            let (l, h, w, k) = (
+                acc[0].to_vector(),
+                acc[1].to_vector(),
+                acc[2].to_vector(),
+                acc[3].to_vector(),
+            );
+            let (t_lo, t_hi) = (t[0].to_vector(), t[1].to_vector());
+            let one = _mm512_set1_epi64(1);
+
+            let l2 = _mm512_add_epi64(l, t_lo);
+            let carry_l = _mm512_cmplt_epu64_mask(l2, t_lo);
+            let k2 = _mm512_mask_add_epi64(k, carry_l, k, one);
+
+            let h2 = _mm512_add_epi64(h, t_hi);
+            let carry_h = _mm512_cmplt_epu64_mask(h2, t_hi);
+            let w2 = _mm512_mask_add_epi64(w, carry_h, w, one);
+
+            [
+                Self::from_vector(l2),
+                Self::from_vector(h2),
+                Self::from_vector(w2),
+                Self::from_vector(k2),
+            ]
+        }
+    }
+
+    #[inline]
+    fn lazy_acc_sub(acc: [Self; 4], t: [Self; 4]) -> [Self; 4] {
+        unsafe {
+            // NOT both halves (vpternlogq imm 0x55 = NOT a), then add as usual.
+            let t_lo = t[0].to_vector();
+            let t_hi = t[1].to_vector();
+            let nt_lo = _mm512_ternarylogic_epi64::<0x55>(t_lo, t_lo, t_lo);
+            let nt_hi = _mm512_ternarylogic_epi64::<0x55>(t_hi, t_hi, t_hi);
+            Self::lazy_acc_add(
+                acc,
+                [
+                    Self::from_vector(nt_lo),
+                    Self::from_vector(nt_hi),
+                    Self::ZERO,
+                    Self::ZERO,
+                ],
+            )
+        }
+    }
+
+    #[inline]
+    fn lazy_acc_finish(acc: [Self; 4], n_sub: u64) -> Self {
+        unsafe {
+            let (l, h, w, k) = (
+                acc[0].to_vector(),
+                acc[1].to_vector(),
+                acc[2].to_vector(),
+                acc[3].to_vector(),
+            );
+            let one = _mm512_set1_epi64(1);
+
+            // Merge the lo-wrap counter into H, carrying into W.
+            let h2 = _mm512_add_epi64(h, k);
+            let carry = _mm512_cmplt_epu64_mask(h2, k);
+            let w2 = _mm512_mask_add_epi64(w, carry, w, one);
+
+            #[cfg(debug_assertions)]
+            {
+                let w_arr: [u64; WIDTH] = transmute(w2);
+                let k_arr: [u64; WIDTH] = transmute(k);
+                for i in 0..WIDTH {
+                    debug_assert!(
+                        w_arr[i] < (1 << 23) && k_arr[i] < (1 << 23),
+                        "lazy accumulator wrap counters overflow"
+                    );
+                }
+            }
+
+            // One folding step: V = L + H'_lo*eps - (H'_hi + W*2^32)  (mod P).
+            let e = _mm512_mul_epu32(h2, EPSILON);
+            let s = _mm512_add_epi64(_mm512_srli_epi64::<32>(h2), _mm512_slli_epi64::<32>(w2));
+            let t0 = sub_no_double_overflow_64_64(l, s);
+            let r = add_no_double_overflow_64_64(t0, e);
+            let r = Self::from_vector(r);
+
+            if n_sub == 0 {
+                r
+            } else {
+                // Repay the NOT deficit: n_sub * (2^32 + 1), n_sub < 2^23 so no overflow.
+                let deficit = Goldilocks::new(n_sub * ((1u64 << 32) + 1));
+                r + Self::broadcast(deficit)
+            }
+        }
+    }
 }
 
 impl_add_base_field!(PackedGoldilocksAVX512, Goldilocks);
@@ -390,9 +520,7 @@ unsafe fn mds_output<const I: usize>(s: &[__m512i; 8], s_hi: &[__m512i; 8]) -> _
 /// `vpmuludq`, while the `vpaddq` it replaces was happily dual-issuing on the
 /// add ports. Kept the `vpmuludq + vpaddq` form.
 #[inline(always)]
-pub(crate) fn mds_mul_simd(
-    state: [PackedGoldilocksAVX512; POSEIDON1_WIDTH],
-) -> [PackedGoldilocksAVX512; POSEIDON1_WIDTH] {
+pub fn mds_mul_simd(state: [PackedGoldilocksAVX512; POSEIDON1_WIDTH]) -> [PackedGoldilocksAVX512; POSEIDON1_WIDTH] {
     unsafe {
         let s: [__m512i; 8] = [
             state[0].to_vector(),