Skip to content
Open
20 changes: 20 additions & 0 deletions crates/backend/air/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,19 @@ pub trait Air: Send + Sync + 'static {

fn degree_air(&self) -> usize;

/// True max constraint degree along a fold line. Defaults to `degree_air()`.
/// Override when `degree_air` overstates the true degree: the prover skips
/// the redundant top eval pass. Wire format unchanged (sized by `degree_air`).
fn degree_z(&self) -> usize {
self.degree_air()
}

/// Whether the C2 per-pair constraint cache pays for itself. Both paths
/// are bit-identical — this flag only selects the faster one per table.
fn c2_table_profitable(&self) -> bool {
true
}

fn n_columns(&self) -> usize;

fn n_constraints(&self) -> usize;
Expand All @@ -24,6 +37,13 @@ pub trait Air: Send + Sync + 'static {
fn n_shift_columns(&self) -> usize;

fn eval<AB: AirBuilder>(&self, builder: &mut AB, extra_data: &Self::ExtraData);

/// Emit only bus constraints (for the C2 seed round). On valid rows all
/// non-bus constraints vanish, so the bus-only accumulator equals the full
/// one. Default = full eval (bit-identical, just slower).
fn eval_bus_only<AB: AirBuilder>(&self, builder: &mut AB, extra_data: &Self::ExtraData) {
self.eval(builder, extra_data);
}
}

pub trait AirBuilder: Sized {
Expand Down
9 changes: 7 additions & 2 deletions crates/backend/air/src/symbolic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ struct SymbolicAirBuilder<F: Field> {
constraints: Vec<SymbolicExpression<F>>,
bus_multiplicity_value: Option<SymbolicExpression<F>>,
bus_data_values: Option<Vec<SymbolicExpression<F>>>,
extra_declared_groups: Vec<Vec<SymbolicExpression<F>>>,
}

impl<F: Field> SymbolicAirBuilder<F> {
Expand All @@ -276,6 +277,7 @@ impl<F: Field> SymbolicAirBuilder<F> {
constraints: Vec::new(),
bus_multiplicity_value: None,
bus_data_values: None,
extra_declared_groups: Vec::new(),
}
}

Expand Down Expand Up @@ -309,9 +311,10 @@ impl<F: Field> AirBuilder for SymbolicAirBuilder<F> {
if self.bus_multiplicity_value.is_none() {
assert_eq!(values.len(), 1);
self.bus_multiplicity_value = Some(values[0]);
} else {
assert!(self.bus_data_values.is_none());
} else if self.bus_data_values.is_none() {
self.bus_data_values = Some(values.to_vec());
} else {
self.extra_declared_groups.push(values.to_vec());
}
}
}
Expand All @@ -320,6 +323,7 @@ pub type SymbolicAirData<F> = (
Vec<SymbolicExpression<F>>,
SymbolicExpression<F>,
Vec<SymbolicExpression<F>>,
Vec<Vec<SymbolicExpression<F>>>,
);

pub fn get_symbolic_constraints_and_bus_data_values<F: Field, A: Air>(air: &A) -> SymbolicAirData<F>
Expand All @@ -332,5 +336,6 @@ where
builder.constraints(),
builder.bus_multiplicity_value.unwrap(),
builder.bus_data_values.unwrap(),
builder.extra_declared_groups,
)
}
60 changes: 60 additions & 0 deletions crates/backend/field/src/field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,66 @@ pub trait PrimeCharacteristicRing:
fn zero_vec(len: usize) -> Vec<Self> {
vec![Self::ZERO; len]
}

/// Deferred multiply-accumulate protocol: unreduced product of two ring elements.
///
/// The four slots of the `[Self; 4]` accumulator values used by `unreduced_mul` /
/// `lazy_acc_*` are an implementation-defined unreduced representation; only the
/// protocol methods of the SAME type may create or inspect them. The default
/// implementations are eager (slot 0 carries a canonical running value, slots 1-3
/// stay `ZERO`), so for types that do not override the protocol it is exactly
/// equivalent to `acc + a * b` / `acc - a * b` chains — pure sugar, no behavioral
/// change. Overrides (e.g. 64-bit prime fields with widening multipliers) may keep
/// products unreduced across many accumulation steps and reduce once in
/// [`Self::lazy_acc_finish`].
///
/// # Contract
/// - At most `2^20` accumulation calls (each adding/subtracting one
/// [`Self::unreduced_mul`] product) between [`Self::lazy_acc_zero`] and
/// [`Self::lazy_acc_finish`]; callers chunk longer loops.
/// - `n_sub` passed to [`Self::lazy_acc_finish`] must equal the number of
/// [`Self::lazy_acc_sub`] calls since [`Self::lazy_acc_zero`]; implementations
/// whose subtraction is exact ignore it.
/// - Accumulator values must not be mixed across types or fed to ordinary
/// ring arithmetic.
#[must_use]
#[inline]
fn unreduced_mul(a: Self, b: Self) -> [Self; 4] {
[a * b, Self::ZERO, Self::ZERO, Self::ZERO]
}

/// Fresh accumulator representing zero. See [`Self::unreduced_mul`] for the protocol.
#[must_use]
#[inline]
fn lazy_acc_zero() -> [Self; 4] {
[Self::ZERO; 4]
}

/// Accumulate `+t` (a value produced by [`Self::unreduced_mul`]) into `acc`.
/// See [`Self::unreduced_mul`] for the protocol.
#[must_use]
#[inline]
fn lazy_acc_add(acc: [Self; 4], t: [Self; 4]) -> [Self; 4] {
[acc[0] + t[0], Self::ZERO, Self::ZERO, Self::ZERO]
}

/// Accumulate `-t` (a value produced by [`Self::unreduced_mul`]) into `acc`.
/// See [`Self::unreduced_mul`] for the protocol.
#[must_use]
#[inline]
fn lazy_acc_sub(acc: [Self; 4], t: [Self; 4]) -> [Self; 4] {
[acc[0] - t[0], Self::ZERO, Self::ZERO, Self::ZERO]
}

/// Collapse an accumulator to a canonical ring element. `n_sub` must equal the
/// number of [`Self::lazy_acc_sub`] calls on this accumulator (protocol docs at
/// [`Self::unreduced_mul`]).
#[must_use]
#[inline]
fn lazy_acc_finish(acc: [Self; 4], n_sub: u64) -> Self {
let _ = n_sub;
acc[0]
}
}

/// A vector space `V` over `F` with a fixed basis. Fixing the basis allows elements of `V` to be
Expand Down
68 changes: 68 additions & 0 deletions crates/backend/goldilocks/src/goldilocks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,74 @@ impl PrimeCharacteristicRing for Goldilocks {
// SAFETY: `#[repr(transparent)]` means `Goldilocks` and `u64` share layout.
unsafe { flatten_to_base(vec![0u64; len]) }
}

// Deferred multiply-accumulate protocol.
//
// Accumulator layout: 192-bit little-endian limbs in slots 0..3:
// slot 0 = l0 (bits 0..64), slot 1 = l1 (bits 64..128), slot 2 = l2 (bits 128..192),
// slot 3 unused. Seeded with OFFSET192 = P * 2^126 (a multiple of P, ~2^190) so
// mixed-sign accumulation never wraps the 192-bit domain:
// each term is < 2^128, the contract caps terms at 5 * 2^20 < 2^23, and
// 2^23 * 2^128 = 2^151 << 2^190 (headroom above: 2^192 - 2^190 = 3 * 2^190).
// The limb values are raw u64 bit patterns wrapped in `Goldilocks` (same trick as
// `zero_vec`'s transparent layout); they are never used as field elements.
//
// Same family as the `dot_product` override above (goldilocks.rs:248) and the
// poseidon1.rs:85-113 scalar MDS u128 accumulation (read-only precedent).

#[inline]
fn unreduced_mul(a: Self, b: Self) -> [Self; 4] {
let wide = (a.value as u128) * (b.value as u128);
[
Self::new(wide as u64),
Self::new((wide >> 64) as u64),
Self::ZERO,
Self::ZERO,
]
}

#[inline]
fn lazy_acc_zero() -> [Self; 4] {
// OFFSET192 = P << 126 = (P >> 2) * 2^128 + (P & 0b11 = 1) * 2^126:
// l0 = 0, l1 = 1 << 62, l2 = P >> 2.
[Self::ZERO, Self::new(1u64 << 62), Self::new(P >> 2), Self::ZERO]
}

#[inline]
fn lazy_acc_add(acc: [Self; 4], t: [Self; 4]) -> [Self; 4] {
let (l0, c0) = acc[0].value.overflowing_add(t[0].value);
let (l1a, c1a) = acc[1].value.overflowing_add(t[1].value);
let (l1, c1b) = l1a.overflowing_add(c0 as u64);
let l2 = acc[2].value.wrapping_add(c1a as u64).wrapping_add(c1b as u64);
[Self::new(l0), Self::new(l1), Self::new(l2), Self::ZERO]
}

#[inline]
fn lazy_acc_sub(acc: [Self; 4], t: [Self; 4]) -> [Self; 4] {
let (l0, b0) = acc[0].value.overflowing_sub(t[0].value);
let (l1a, b1a) = acc[1].value.overflowing_sub(t[1].value);
let (l1, b1b) = l1a.overflowing_sub(b0 as u64);
let l2 = acc[2].value.wrapping_sub(b1a as u64).wrapping_sub(b1b as u64);
[Self::new(l0), Self::new(l1), Self::new(l2), Self::ZERO]
}

#[inline]
fn lazy_acc_finish(acc: [Self; 4], n_sub: u64) -> Self {
// Subtraction above is exact (borrow chains); no NOT-deficit to repay.
let _ = n_sub;
let l0 = acc[0].value;
let l1 = acc[1].value;
let l2 = acc[2].value;
// l2 must stay below 2^63 (deferred reduction invariant).
debug_assert!(l2 < (1u64 << 63), "lazy accumulator l2 overflow");
// V = l2*2^128 + l1*2^64 + l0 ≡ l0 + l1*eps - l2*2^32 (mod P)
// (2^64 ≡ eps, 2^128 ≡ -2^32). Add P*2^33 (multiple of P, ~2^97 > l2*2^32 < 2^95)
// to keep the u128 arithmetic borrow-free, then one reduce128:
// r < 2^64 + 2^96 + 2^97 < 2^98.
const P_SHL33: u128 = (P as u128) << 33;
let r = (l0 as u128) + (l1 as u128) * (Self::NEG_ORDER as u128) + P_SHL33 - (l2 as u128) * (1u128 << 32);
reduce128(r)
}
}

/// `p - 1 = 2^32 * 3 * 5 * 17 * 257 * 65537`. The smallest `D` with `gcd(p - 1, D) = 1` is 7.
Expand Down
134 changes: 131 additions & 3 deletions crates/backend/goldilocks/src/x86_64_avx512/packing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,136 @@ impl PrimeCharacteristicRing for PackedGoldilocksAVX512 {
// SAFETY: this is a repr(transparent) wrapper around an array.
unsafe { reconstitute_from_base(Goldilocks::zero_vec(len * WIDTH)) }
}

// Deferred multiply-accumulate protocol.
//
// Accumulator layout: [L, H, W, K], one zmm each:
// L = wrapping sum of term lows; K = count of L-wraps (each worth 2^64)
// H = wrapping sum of term highs; W = count of H-wraps (each worth 2^128)
// so the represented value is V = L + (H + K)*2^64 + W*2^128.
//
// Subtraction is NOT-based: adding (~t_hi, ~t_lo) contributes
// 2^128 - 1 - t = -t - (2^32 + 1) (mod P), since 2^128 = -2^32. Each sub
// therefore leaves a constant deficit of (2^32 + 1), repaid once at finish
// via `n_sub` (T1 protocol contract).
//
// The separate K counter is mandatory: folding the lo-carry into t_hi via a
// masked +1 is only safe for t_hi <= 2^64 - 2 (true for raw products) but NOT
// for NOT-ed terms, where ~t_hi = 2^64 - 1 whenever t_hi = 0 — and zero
// products are reachable in real eval tables
// cleverness).
//
// Finish: merge K into H (carry into W), then one folding step
// V = L + H'_lo*eps - (H'_hi + W*2^32) (mod P)
// using 2^64 = eps, 2^96 = -1, 2^128 = -2^32, with the same
// sub_no_double/add_no_double tail as `reduce128` below. Bounds:
// the T1 contract caps accumulation at 5*2^20 terms, so W, K < 2^23 and
// s = H'_hi + W*2^32 < 2^32 + 2^55 < P (tail precondition).

#[inline]
fn unreduced_mul(a: Self, b: Self) -> [Self; 4] {
let (hi, lo) = mul64_64(a.to_vector(), b.to_vector());
[Self::from_vector(lo), Self::from_vector(hi), Self::ZERO, Self::ZERO]
}

#[inline]
fn lazy_acc_zero() -> [Self; 4] {
[Self::ZERO; 4]
}

#[inline]
fn lazy_acc_add(acc: [Self; 4], t: [Self; 4]) -> [Self; 4] {
unsafe {
let (l, h, w, k) = (
acc[0].to_vector(),
acc[1].to_vector(),
acc[2].to_vector(),
acc[3].to_vector(),
);
let (t_lo, t_hi) = (t[0].to_vector(), t[1].to_vector());
let one = _mm512_set1_epi64(1);

let l2 = _mm512_add_epi64(l, t_lo);
let carry_l = _mm512_cmplt_epu64_mask(l2, t_lo);
let k2 = _mm512_mask_add_epi64(k, carry_l, k, one);

let h2 = _mm512_add_epi64(h, t_hi);
let carry_h = _mm512_cmplt_epu64_mask(h2, t_hi);
let w2 = _mm512_mask_add_epi64(w, carry_h, w, one);

[
Self::from_vector(l2),
Self::from_vector(h2),
Self::from_vector(w2),
Self::from_vector(k2),
]
}
}

#[inline]
fn lazy_acc_sub(acc: [Self; 4], t: [Self; 4]) -> [Self; 4] {
unsafe {
// NOT both halves (vpternlogq imm 0x55 = NOT a), then add as usual.
let t_lo = t[0].to_vector();
let t_hi = t[1].to_vector();
let nt_lo = _mm512_ternarylogic_epi64::<0x55>(t_lo, t_lo, t_lo);
let nt_hi = _mm512_ternarylogic_epi64::<0x55>(t_hi, t_hi, t_hi);
Self::lazy_acc_add(
acc,
[
Self::from_vector(nt_lo),
Self::from_vector(nt_hi),
Self::ZERO,
Self::ZERO,
],
)
}
}

#[inline]
fn lazy_acc_finish(acc: [Self; 4], n_sub: u64) -> Self {
unsafe {
let (l, h, w, k) = (
acc[0].to_vector(),
acc[1].to_vector(),
acc[2].to_vector(),
acc[3].to_vector(),
);
let one = _mm512_set1_epi64(1);

// Merge the lo-wrap counter into H, carrying into W.
let h2 = _mm512_add_epi64(h, k);
let carry = _mm512_cmplt_epu64_mask(h2, k);
let w2 = _mm512_mask_add_epi64(w, carry, w, one);

#[cfg(debug_assertions)]
{
let w_arr: [u64; WIDTH] = transmute(w2);
let k_arr: [u64; WIDTH] = transmute(k);
for i in 0..WIDTH {
debug_assert!(
w_arr[i] < (1 << 23) && k_arr[i] < (1 << 23),
"lazy accumulator wrap counters overflow"
);
}
}

// One folding step: V = L + H'_lo*eps - (H'_hi + W*2^32) (mod P).
let e = _mm512_mul_epu32(h2, EPSILON);
let s = _mm512_add_epi64(_mm512_srli_epi64::<32>(h2), _mm512_slli_epi64::<32>(w2));
let t0 = sub_no_double_overflow_64_64(l, s);
let r = add_no_double_overflow_64_64(t0, e);
let r = Self::from_vector(r);

if n_sub == 0 {
r
} else {
// Repay the NOT deficit: n_sub * (2^32 + 1), n_sub < 2^23 so no overflow.
let deficit = Goldilocks::new(n_sub * ((1u64 << 32) + 1));
r + Self::broadcast(deficit)
}
}
}
}

impl_add_base_field!(PackedGoldilocksAVX512, Goldilocks);
Expand Down Expand Up @@ -390,9 +520,7 @@ unsafe fn mds_output<const I: usize>(s: &[__m512i; 8], s_hi: &[__m512i; 8]) -> _
/// `vpmuludq`, while the `vpaddq` it replaces was happily dual-issuing on the
/// add ports. Kept the `vpmuludq + vpaddq` form.
#[inline(always)]
pub(crate) fn mds_mul_simd(
state: [PackedGoldilocksAVX512; POSEIDON1_WIDTH],
) -> [PackedGoldilocksAVX512; POSEIDON1_WIDTH] {
pub fn mds_mul_simd(state: [PackedGoldilocksAVX512; POSEIDON1_WIDTH]) -> [PackedGoldilocksAVX512; POSEIDON1_WIDTH] {
unsafe {
let s: [__m512i; 8] = [
state[0].to_vector(),
Expand Down
Loading