Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
5f369f0
spike
ColoCarletti Jun 29, 2026
0fa7f32
resident-data scaffolding
ColoCarletti Jun 29, 2026
bdea00e
part 2
ColoCarletti Jun 29, 2026
e9dcda8
composition parts fold
ColoCarletti Jun 29, 2026
b08ef81
fix
ColoCarletti Jun 29, 2026
8ea8a0a
fix_doc
ColoCarletti Jun 29, 2026
caa7efd
move merkle tree to gpu
ColoCarletti Jun 29, 2026
b811337
merkle tree
ColoCarletti Jun 29, 2026
819ad01
finish merkle
ColoCarletti Jun 29, 2026
0fb8df6
fix
ColoCarletti Jun 29, 2026
8ab633f
rm unused functions
ColoCarletti Jun 29, 2026
fd31fc9
cleanup
ColoCarletti Jun 29, 2026
0f53e65
Merge branch 'main' into gpu_integration
ColoCarletti Jun 30, 2026
b5fbdef
fix
ColoCarletti Jun 30, 2026
b37b649
fix
ColoCarletti Jun 30, 2026
b81e63f
Merge branch 'main' into gpu_integration
ColoCarletti Jun 30, 2026
873491e
fix clippy
ColoCarletti Jun 30, 2026
e4ada64
fix comments
ColoCarletti Jun 30, 2026
31d5524
Merge branch 'main' into gpu_integration
ColoCarletti Jun 30, 2026
4ca240c
fix
ColoCarletti Jun 30, 2026
45b2cfe
Merge branch 'gpu_integration' of github.com:yetanotherco/lambda_vm i…
ColoCarletti Jun 30, 2026
ea411be
rm data transfer
ColoCarletti Jul 1, 2026
f0e0d38
fmt
ColoCarletti Jul 1, 2026
b2596f1
refactor
ColoCarletti Jul 1, 2026
7ae0382
Merge branch 'main' into gpu_logup_aux
ColoCarletti Jul 1, 2026
0861b71
Merge branch 'main' into gpu_logup_aux
ColoCarletti Jul 2, 2026
66c8d45
fmt
ColoCarletti Jul 2, 2026
ae56284
Merge branch 'gpu_logup_aux' of github.com:yetanotherco/lambda_vm int…
ColoCarletti Jul 2, 2026
91d7816
Merge branch 'main' into gpu_logup_aux
MauroToscano Jul 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions crypto/math-cuda/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,4 +115,5 @@ fn main() {
compile_ptx("deep.cu", "deep.ptx", have_nvcc);
compile_ptx("fri.cu", "fri.ptx", have_nvcc);
compile_ptx("inverse.cu", "inverse.ptx", have_nvcc);
compile_ptx("logup.cu", "logup.ptx", have_nvcc);
}
238 changes: 238 additions & 0 deletions crypto/math-cuda/kernels/logup.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
// LogUp aux build: fingerprint kernel.
//
// One ext3 fingerprint per (interaction, row):
// lc = bus_id + sum_e alpha_powers[alpha_idx(e)] * base_e
// base_e = const_e + sum_t coef_t * main_col[col_t][row] (Goldilocks base)
// fp = z - lc
// Mirrors stark::logup_gpu::eval_fingerprint byte for byte.
//
// Layouts:
// main: column-major, main[col * num_rows + row].
// descriptor (CSR): interactions -> elements -> terms.
// alpha_powers: ext3 interleaved, 3 limbs each.
// out: ext3 interleaved, out[(k*num_rows + row)*3 + {0,1,2}].

#include "ext3.cuh"

using namespace ext3;

extern "C" __global__ void logup_fingerprint_ext3(
const uint64_t *__restrict__ main,
uint32_t num_rows,
uint32_t num_interactions,
const uint64_t *__restrict__ bus_ids,
const uint32_t *__restrict__ elem_offsets,
const uint32_t *__restrict__ elem_alpha_idx,
const uint64_t *__restrict__ elem_const,
const uint32_t *__restrict__ term_offsets,
const uint64_t *__restrict__ term_coef,
const uint32_t *__restrict__ term_col,
const uint64_t *__restrict__ alpha_powers,
uint64_t z0, uint64_t z1, uint64_t z2,
uint64_t *__restrict__ out) {
uint64_t tid = blockIdx.x * (uint64_t)blockDim.x + threadIdx.x;
uint64_t total = (uint64_t)num_interactions * (uint64_t)num_rows;
if (tid >= total)
return;

uint32_t k = (uint32_t)(tid / num_rows);
uint32_t row = (uint32_t)(tid % num_rows);

Fe3 lc = make(bus_ids[k], 0, 0);

uint32_t e_hi = elem_offsets[k + 1];
for (uint32_t e = elem_offsets[k]; e < e_hi; ++e) {
uint64_t base = elem_const[e];
uint32_t t_hi = term_offsets[e + 1];
for (uint32_t t = term_offsets[e]; t < t_hi; ++t) {
uint64_t col_val = main[(uint64_t)term_col[t] * (uint64_t)num_rows + row];
base = goldilocks::add(base, goldilocks::mul(term_coef[t], col_val));
}
uint32_t ai = elem_alpha_idx[e];
Fe3 a = make(alpha_powers[ai * 3 + 0], alpha_powers[ai * 3 + 1],
alpha_powers[ai * 3 + 2]);
lc = add(lc, mul_base(a, base));
}

Fe3 z = make(z0, z1, z2);
Fe3 fp = sub(z, lc);
uint64_t o = tid * 3;
out[o + 0] = fp.a;
out[o + 1] = fp.b;
out[o + 2] = fp.c;
}

// Term combine: one ext3 per (output column, row):
// term = sum_{k in col} signed_mult_k(row) * reciprocal_k[row]
// signed_mult_k = mult_const[k] + sum_t mult_coef_t * main_col[col_t][row]
// (receiver sign already folded into the coefficients by the builder).
// reciprocals: ext3 interleaved, [(k*num_rows + row)*3 + limb].
// out: ext3 interleaved, [(col*num_rows + row)*3 + limb].
extern "C" __global__ void logup_term_ext3(
const uint64_t *__restrict__ main,
uint32_t num_rows,
const uint64_t *__restrict__ reciprocals,
uint32_t num_out_cols,
const uint32_t *__restrict__ out_col_offsets,
const uint32_t *__restrict__ out_col_interactions,
const uint64_t *__restrict__ mult_const,
const uint32_t *__restrict__ mult_term_offsets,
const uint64_t *__restrict__ mult_term_coef,
const uint32_t *__restrict__ mult_term_col,
uint64_t *__restrict__ out) {
uint64_t tid = blockIdx.x * (uint64_t)blockDim.x + threadIdx.x;
uint64_t total = (uint64_t)num_out_cols * (uint64_t)num_rows;
if (tid >= total)
return;

uint32_t col = (uint32_t)(tid / num_rows);
uint32_t row = (uint32_t)(tid % num_rows);

Fe3 term = zero();
uint32_t ki_hi = out_col_offsets[col + 1];
for (uint32_t ki = out_col_offsets[col]; ki < ki_hi; ++ki) {
uint32_t k = out_col_interactions[ki];

uint64_t m = mult_const[k];
uint32_t t_hi = mult_term_offsets[k + 1];
for (uint32_t t = mult_term_offsets[k]; t < t_hi; ++t) {
uint64_t col_val = main[(uint64_t)mult_term_col[t] * (uint64_t)num_rows + row];
m = goldilocks::add(m, goldilocks::mul(mult_term_coef[t], col_val));
}

uint64_t ro = ((uint64_t)k * num_rows + row) * 3;
Fe3 r = make(reciprocals[ro], reciprocals[ro + 1], reciprocals[ro + 2]);
term = add(term, mul_base(r, m));
}

uint64_t o = tid * 3;
out[o + 0] = term.a;
out[o + 1] = term.b;
out[o + 2] = term.c;
}

// ===========================================================================
// Accumulated column (K4): running sum of the term columns, on device.
// row_sum[i] = sum over all term columns of term[col][i]
// S = inclusive prefix scan of row_sum ; L = S[n-1] ; offset = L / N
// acc[i] = S[i] - (i+1) * offset (matches build_accumulated_column)
// Additive 3-phase Hillis-Steele scan (mirrors inverse.cu, add not mul).
// ===========================================================================

#define LOGUP_BLK 256

// row_sum[i] = sum_c term[(c*num_rows + i)] over all num_cols term columns.
extern "C" __global__ void logup_row_sum_ext3(
const uint64_t *__restrict__ terms, uint32_t num_cols, uint32_t num_rows,
uint64_t *__restrict__ row_sum) {
uint64_t i = blockIdx.x * (uint64_t)blockDim.x + threadIdx.x;
if (i >= num_rows)
return;
Fe3 s = zero();
for (uint32_t c = 0; c < num_cols; ++c) {
uint64_t o = ((uint64_t)c * num_rows + i) * 3;
s = add(s, make(terms[o], terms[o + 1], terms[o + 2]));
}
row_sum[i * 3] = s.a;
row_sum[i * 3 + 1] = s.b;
row_sum[i * 3 + 2] = s.c;
}

// Per-block inclusive additive scan; writes block totals (last valid element).
extern "C" __global__ void logup_scan_block_add_ext3(
const uint64_t *__restrict__ input, uint64_t n,
uint64_t *__restrict__ scan_out, uint64_t *__restrict__ block_totals) {
__shared__ Fe3 sh[LOGUP_BLK];
uint32_t tid = threadIdx.x;
uint64_t gid = blockIdx.x * (uint64_t)LOGUP_BLK + tid;
Fe3 v = (gid < n) ? make(input[gid * 3], input[gid * 3 + 1], input[gid * 3 + 2])
: zero();
sh[tid] = v;
__syncthreads();
for (uint32_t off = 1; off < LOGUP_BLK; off <<= 1) {
Fe3 t = (tid >= off) ? sh[tid - off] : zero();
__syncthreads();
if (tid >= off)
sh[tid] = add(sh[tid], t);
__syncthreads();
}
if (gid < n) {
uint64_t o = gid * 3;
scan_out[o] = sh[tid].a;
scan_out[o + 1] = sh[tid].b;
scan_out[o + 2] = sh[tid].c;
}
uint64_t block_end = (blockIdx.x + 1) * (uint64_t)LOGUP_BLK;
uint32_t last = (block_end <= n)
? (LOGUP_BLK - 1)
: (uint32_t)(n - blockIdx.x * (uint64_t)LOGUP_BLK - 1);
if (tid == last) {
uint64_t b = blockIdx.x * 3;
block_totals[b] = sh[tid].a;
block_totals[b + 1] = sh[tid].b;
block_totals[b + 2] = sh[tid].c;
}
}

// Phase 3: block b>0 adds the scanned prefix of preceding block totals.
extern "C" __global__ void logup_apply_offsets_add_ext3(
uint64_t *__restrict__ scan_inout, uint64_t n,
const uint64_t *__restrict__ block_totals_scanned) {
if (blockIdx.x == 0)
return;
uint64_t gid = blockIdx.x * (uint64_t)LOGUP_BLK + threadIdx.x;
if (gid >= n)
return;
uint64_t ob = (blockIdx.x - 1) * 3;
Fe3 off = make(block_totals_scanned[ob], block_totals_scanned[ob + 1],
block_totals_scanned[ob + 2]);
uint64_t o = gid * 3;
Fe3 v = add(make(scan_inout[o], scan_inout[o + 1], scan_inout[o + 2]), off);
scan_inout[o] = v.a;
scan_inout[o + 1] = v.b;
scan_inout[o + 2] = v.c;
}

// acc[i] = scan[i] - (i+1) * (L * inv_N), L = scan[n-1]. inv_N is ext3 (1/N).
extern "C" __global__ void logup_finalize_accum_ext3(
const uint64_t *__restrict__ scan, uint64_t n, uint64_t inv0, uint64_t inv1,
uint64_t inv2, uint64_t *__restrict__ acc) {
uint64_t i = blockIdx.x * (uint64_t)blockDim.x + threadIdx.x;
if (i >= n)
return;
uint64_t lo = (n - 1) * 3;
Fe3 L = make(scan[lo], scan[lo + 1], scan[lo + 2]);
Fe3 offset = mul(L, make(inv0, inv1, inv2));
Fe3 s = make(scan[i * 3], scan[i * 3 + 1], scan[i * 3 + 2]);
Fe3 a = sub(s, mul_base(offset, i + 1));
acc[i * 3] = a.a;
acc[i * 3 + 1] = a.b;
acc[i * 3 + 2] = a.c;
}

// Assemble the row-major aux trace buffer from the resident committed term
// columns + the accumulated column:
// aux[row * num_aux_cols + col] = committed[col][row] (col < num_committed)
// = accumulated[row] (col == num_committed)
// terms layout is [col][row] (column-major); aux is row-major [row][col].
extern "C" __global__ void logup_assemble_aux_ext3(
const uint64_t *__restrict__ committed, uint32_t num_committed,
const uint64_t *__restrict__ accumulated, uint32_t num_rows,
uint64_t *__restrict__ aux) {
uint64_t i = blockIdx.x * (uint64_t)blockDim.x + threadIdx.x;
if (i >= num_rows)
return;
uint32_t num_aux_cols = num_committed + 1;
for (uint32_t col = 0; col < num_committed; ++col) {
uint64_t src = ((uint64_t)col * num_rows + i) * 3;
uint64_t dst = ((uint64_t)i * num_aux_cols + col) * 3;
aux[dst] = committed[src];
aux[dst + 1] = committed[src + 1];
aux[dst + 2] = committed[src + 2];
}
uint64_t asrc = i * 3;
uint64_t adst = ((uint64_t)i * num_aux_cols + num_committed) * 3;
aux[adst] = accumulated[asrc];
aux[adst + 1] = accumulated[asrc + 1];
aux[adst + 2] = accumulated[asrc + 2];
}
16 changes: 16 additions & 0 deletions crypto/math-cuda/src/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ const BARY_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/barycentric.ptx")
const DEEP_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/deep.ptx"));
const FRI_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/fri.ptx"));
const INVERSE_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/inverse.ptx"));
const LOGUP_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/logup.ptx"));

/// Number of CUDA streams in the pool. Larger pools let many rayon-parallel
/// callers overlap on the GPU without serializing on stream ownership. The
Expand Down Expand Up @@ -179,6 +180,13 @@ pub struct Backend {
pub block_inclusive_scan_rev_ext3: CudaFunction,
pub apply_block_offsets_rev_ext3: CudaFunction,
pub batch_inverse_combine_ext3: CudaFunction,
pub logup_fingerprint_ext3: CudaFunction,
pub logup_term_ext3: CudaFunction,
pub logup_row_sum_ext3: CudaFunction,
pub logup_scan_block_add_ext3: CudaFunction,
pub logup_apply_offsets_add_ext3: CudaFunction,
pub logup_finalize_accum_ext3: CudaFunction,
pub logup_assemble_aux_ext3: CudaFunction,

// Twiddle caches keyed by log_n.
fwd_twiddles: Mutex<Vec<Option<Arc<CudaSlice<u64>>>>>,
Expand Down Expand Up @@ -280,6 +288,7 @@ impl Backend {
let deep = ctx.load_module(Ptx::from_src(DEEP_PTX))?;
let fri = ctx.load_module(Ptx::from_src(FRI_PTX))?;
let inverse = ctx.load_module(Ptx::from_src(INVERSE_PTX))?;
let logup = ctx.load_module(Ptx::from_src(LOGUP_PTX))?;

let mut streams = Vec::with_capacity(STREAM_POOL_SIZE);
for _ in 0..STREAM_POOL_SIZE {
Expand Down Expand Up @@ -360,6 +369,13 @@ impl Backend {
.load_function("block_inclusive_scan_rev_ext3")?,
apply_block_offsets_rev_ext3: inverse.load_function("apply_block_offsets_rev_ext3")?,
batch_inverse_combine_ext3: inverse.load_function("batch_inverse_combine_ext3")?,
logup_fingerprint_ext3: logup.load_function("logup_fingerprint_ext3")?,
logup_term_ext3: logup.load_function("logup_term_ext3")?,
logup_row_sum_ext3: logup.load_function("logup_row_sum_ext3")?,
logup_scan_block_add_ext3: logup.load_function("logup_scan_block_add_ext3")?,
logup_apply_offsets_add_ext3: logup.load_function("logup_apply_offsets_add_ext3")?,
logup_finalize_accum_ext3: logup.load_function("logup_finalize_accum_ext3")?,
logup_assemble_aux_ext3: logup.load_function("logup_assemble_aux_ext3")?,
fwd_twiddles: Mutex::new(vec![None; max_log]),
inv_twiddles: Mutex::new(vec![None; max_log]),
ctx,
Expand Down
Loading
Loading