yetanotherco · MauroToscano · Jul 3, 2026 · Jun 30, 2026 · Jun 30, 2026 · Jun 30, 2026
diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml
diff --git a/Makefile b/Makefile
@@ -2,7 +2,8 @@
 compile-programs compile-recursion-elfs clean-asm clean-rust clean-bench clean-shared \
 clean-recursion-elfs clean test test-asm \
 test-rust test-executor test-flamegraph flamegraph-prover test-profile-recursion test-profile-recursion-single test-profile-recursion-multi \
-test-fast test-prover test-prover-all test-disk-spill test-math-cuda test-cuda-integration \
+test-fast test-prover test-prover-all test-prover-debug test-disk-spill test-math-cuda test-cuda-integration test-cuda-fallback \
+test-prover-cuda test-prover-comprehensive-cuda \
 bench-math-cuda bench-prover bench-prover-cuda build check clippy fmt lint regen-ethrex-fixtures \
 update-ethrex-fixture-checksums check-ethrex-fixture-checksums
 
@@ -287,11 +288,32 @@ test-math-cuda:
 	cargo test -p math-cuda --release
 
 # End-to-end cuda dispatch coverage (requires NVIDIA GPU + nvcc).
-# Asserts every R1/R2/R3 GPU counter fired on a real prove.
+# Asserts the R1-R4 GPU dispatch counters fired on a real prove.
 test-cuda-integration:
 	cargo test -p lambda-vm-prover --release --features cuda \
 	    --test cuda_path_integration -- --ignored --nocapture
 
+# GPU error-path coverage (requires NVIDIA GPU + nvcc).
+# Forces cuda dispatch errors and asserts the CPU fallback still produces a verifying proof.
+test-cuda-fallback:
+	cargo test -p lambda-vm-prover --release --features test-cuda-faults \
+	    --test cuda_fallback_tests -- --ignored --nocapture --test-threads=1
+
+# The prover/stark/crypto/ecsm test suite with the GPU (cuda) path enabled (requires NVIDIA
+# GPU + nvcc). The GPU CI counterpart of CPU CI's sharded prover tests. Single-threaded: the
+# GPU serializes proves and the dispatch counters are process-global. cuda on prover cascades
+# to stark; crypto/ecsm build without it (they have no GPU path).
+test-prover-cuda:
+	cargo test --release -p lambda-vm-prover -p stark -p crypto -p ecsm \
+	    --features lambda-vm-prover/cuda -- --test-threads=1
+
+# The comprehensive all-instructions prove (ignored by default) on the GPU path (requires
+# NVIDIA GPU + nvcc). GPU counterpart of the all-instructions half of CPU CI's merge-queue-only
+# comprehensive job (the CPU job also runs test_recursion_execute; recursion has no GPU leg yet).
+test-prover-comprehensive-cuda:
+	cargo test --release -p lambda-vm-prover --features cuda \
+	    test_prove_elfs_all_instructions_64_full -- --ignored --test-threads=1 --nocapture
+
 # math-cuda quick microbench (median of 10 runs)
 bench-math-cuda:
 	cargo test -p math-cuda --release --test bench_quick -- --ignored --nocapture

diff --git a/README.md b/README.md
@@ -185,7 +185,11 @@ See [`spec/README.md`](./spec/README.md) for full setup instructions.
 | `make test-asm` | Compile and run ASM tests |
 | `make test-rust` | Compile and run Rust tests |
 | `make test-executor` | Compile all programs and run executor tests |
-| `make test-math-cuda` | math-cuda parity tests (requires NVIDIA GPU + nvcc) |
+| `make test-math-cuda` | math-cuda GPU kernel parity tests (requires NVIDIA GPU + nvcc; see GPU Tests) |
+| `make test-cuda-integration` | End-to-end GPU dispatch + proof verification (requires NVIDIA GPU + nvcc) |
+| `make test-cuda-fallback` | GPU error-path / CPU-fallback tests (requires NVIDIA GPU + nvcc) |
+| `make test-prover-cuda` | Prover/stark/crypto/ecsm suite on the GPU path (requires NVIDIA GPU + nvcc) |
+| `make test-prover-comprehensive-cuda` | Comprehensive all-instructions prove on the GPU path (requires NVIDIA GPU + nvcc) |
 | `make build` | Build all workspace crates |
 | `make check` | Check all crates (faster than build, no codegen) |
 | `make clippy` | Run clippy on all crates |
@@ -219,6 +223,21 @@ You can run it with
 
 `make test-rust`
 
+### GPU Tests
+
+The CUDA test groups run only on a machine with an NVIDIA GPU and `nvcc`:
+
+- `make test-math-cuda` — GPU-vs-CPU kernel parity (NTT, LDE, barycentric, FRI, …)
+- `make test-cuda-integration` — proves a guest on GPU and checks every dispatch fired + the proof verifies
+- `make test-cuda-fallback` — forces GPU dispatch errors and checks the CPU fallback still verifies
+- `make test-prover-cuda` — the prover/stark/crypto/ecsm suite with the GPU path enabled
+- `make test-prover-comprehensive-cuda` — the comprehensive all-instructions prove on the GPU path
+
+The kernels are compiled by `nvcc` into PTX that the driver JIT-compiles at load, so the GPU's
+driver must be new enough for the toolkit — an older driver rejects the PTX with
+`CUDA_ERROR_UNSUPPORTED_PTX_VERSION`. These groups run automatically on a rented GPU in the merge
+queue via `.github/workflows/gpu-tests.yml` (which filters offers on `cuda_max_good`).
+
 ## Benchmarking & Profiling
 
 You can create a flamegraph for proof generation using the following target:
@@ -298,3 +317,4 @@ at your option.
 Unless you explicitly state otherwise, any contribution intentionally submitted
 for inclusion in the work by you, as defined in the Apache-2.0 license, shall
 be dual licensed as above, without any additional terms or conditions.
+
diff --git a/crypto/math-cuda/build.rs b/crypto/math-cuda/build.rs
@@ -72,6 +72,11 @@ fn compile_ptx(src: &str, out_name: &str, have_nvcc: bool) {
     // compute capability. If unset, try `nvidia-smi` to match the host GPU
     // (avoids JIT failures like nvcc-13.0 PTX rejected on Blackwell drivers);
     // fall back to compute_89 (Ada) when detection fails.
+    //
+    // NOTE: this `-arch` only sets the *virtual arch*, not the PTX ISA version, which is
+    // fixed by this nvcc's CUDA toolkit. The runtime driver must support that toolkit's CUDA
+    // version or it rejects the PTX with CUDA_ERROR_UNSUPPORTED_PTX_VERSION — i.e. the box's
+    // driver CUDA must be >= the build toolkit's CUDA. See README "GPU Tests".
     let arch = env::var("CUDARC_NVCC_ARCH").unwrap_or_else(|_| detect_arch());
 
     let status = Command::new(nvcc_path())

diff --git a/docs/roadmap.md b/docs/roadmap.md
@@ -55,8 +55,8 @@ The first version is going to use the primitives contained in [lambdaworks](http
 | Feature                     | Description                       | Status       |
 |---------------------------- |-----------------------------------|--------------|
 | Fields                      | Improve field performance using assembly | Planned |
-| GPU-Fast-Fourier transform      | Implement GPU version of FFT | Planned |
-| GPU-Merkle tree                 | Implement GPU version for Merkle trees | Planned |
+| GPU-Fast-Fourier transform      | Implement GPU version of FFT | Done |
+| GPU-Merkle tree                 | Implement GPU version for Merkle trees | Done |
 | Parallel trace generation   | Use GPU for fast trace generation | Planned |
-| GPU-FRI | Perform FRI on GPU | Planned |
+| GPU-FRI | Perform FRI on GPU | Done |
 
diff --git a/prover/src/tests/prove_elfs_tests.rs b/prover/src/tests/prove_elfs_tests.rs
@@ -1440,7 +1440,7 @@ fn test_verify_rejects_tampered_public_output() {
 /// - Division: DIV, DIVU, REM, REMU
 /// - Control: LUI, AUIPC, JALR
 #[test]
-#[ignore] // Slow: run with `cargo test --ignored` or `make test-prover-all`
+#[ignore] // Slow: run with `cargo test -- --ignored` or `make test-prover-all`
 fn test_prove_elfs_all_instructions_64_full() {
     let _ = env_logger::builder().is_test(true).try_init();
 

diff --git a/scripts/gpu_test.sh b/scripts/gpu_test.sh
@@ -0,0 +1,87 @@
+#!/usr/bin/env bash
+#
+# gpu_test.sh — run the CUDA-only test groups on a GPU box.
+#
+# Exercises the CUDA path, which CPU CI can't (GitHub runners have no GPU):
+#   1. math-cuda kernel parity         (make test-math-cuda)
+#   2. end-to-end GPU dispatch + proof  (make test-cuda-integration)
+#   3. GPU error-path / CPU fallback    (make test-cuda-fallback)
+#   4. prover/stark/crypto/ecsm suite   (make test-prover-cuda) — CPU CI's prover tests on GPU
+#   5. comprehensive all-instructions   (make test-prover-comprehensive-cuda)
+#
+# Runs on the rented Vast box from the gpu-tests.yml merge-queue workflow. Invoke it from the
+# repo root: the Cargo.toml path and the make calls below are relative. All groups run even if one
+# fails (so the log shows every failure); it exits non-zero if ANY group failed, which fails the workflow job and blocks the merge.
+#
+# Env:
+#   CUDARC_PIN   cudarc CUDA-version feature to pin (default cuda-12080). See the sed below.
+#   SYSROOT_DIR  rv64 sysroot (default /opt/lambda-vm-sysroot, provisioned by the template).
+
+set -euo pipefail
+
+CUDARC_PIN="${CUDARC_PIN:-cuda-12080}"
+export SYSROOT_DIR="${SYSROOT_DIR:-/opt/lambda-vm-sysroot}"
+
+log() { printf '\n=== %s ===\n' "$*"; }  # section banner: blank line, then "=== <args> ==="
+
+# --- GPU toolchain sanity (fail loudly rather than silently falling back to CPU) ---
+log "GPU toolchain"
+if ! command -v nvcc >/dev/null 2>&1; then
+    for d in /usr/local/cuda/bin /usr/local/cuda-*/bin; do
+        [ -x "$d/nvcc" ] && export PATH="$d:$PATH" && break
+    done
+fi
+command -v nvcc >/dev/null 2>&1 || { echo "ERROR: nvcc not found — CUDA toolkit missing" >&2; exit 1; }
+nvcc --version | tail -n 2
+# Full nvidia-smi up front: GPU model, driver version + highest CUDA version it supports, memory — for the log.
+nvidia-smi
+nvidia-smi --query-gpu=name,driver_version,compute_cap --format=csv,noheader
+
+# --- Pin cudarc so it binds a fixed driver-symbol set --------------------------
+# crypto/math-cuda/Cargo.toml uses `cuda-version-from-build-system` + `fallback-latest`;
+# when detection falls back to "latest", cudarc requests symbols some boxes' driver doesn't
+# export (e.g. cuDevSmResourceSplit / cuCtxGetDevice_v2) -> runtime panic. Pinning to a fixed,
+# conservative CUDA version binds a known driver-symbol set instead. (This is cudarc's
+# host-side driver-API floor — independent of the PTX/driver version the offer filter targets.)
+log "pinning cudarc to $CUDARC_PIN"
+# Guard the sed anchors: if math-cuda's cudarc features are ever renamed/reformatted, a silent
+# no-op here would bring the fallback-latest driver-symbol panic back with a confusing signature.
+for anchor in '"cuda-version-from-build-system"' '"fallback-latest"'; do
+    grep -qF "$anchor" crypto/math-cuda/Cargo.toml \
+        || { echo "ERROR: sed anchor $anchor not found in crypto/math-cuda/Cargo.toml — update this script's cudarc pin" >&2; exit 1; }
+done
+# Restore the tracked file on exit so a manual run on a dev box doesn't leave the tree dirty
+# (CI doesn't need this — the workflow re-checks-out before every run — but it's harmless there).
+CUDARC_TOML_BACKUP="$(mktemp)"
+cp crypto/math-cuda/Cargo.toml "$CUDARC_TOML_BACKUP"
+trap 'cp "$CUDARC_TOML_BACKUP" crypto/math-cuda/Cargo.toml; rm -f "$CUDARC_TOML_BACKUP"' EXIT
+sed -i "s/\"cuda-version-from-build-system\"/\"${CUDARC_PIN}\"/; s/[[:space:]]*\"fallback-latest\"[[:space:]]*,\{0,1\}//" \
+    crypto/math-cuda/Cargo.toml  # strip only the fallback-latest token (not its whole line) so a one-line features array keeps the pin
+
+# --- Build the guest ELFs the tests prove ---------------------------------------
+# math-cuda parity needs none; cuda_path_integration / cuda_fallback prove an asm ELF; the
+# prover suite (Groups 4 & 5) proves asm AND rust guests. Build both up front.
+log "compiling guest programs (asm + rust)"
+make compile-programs-asm
+make compile-programs-rust
+
+# --- Run the CUDA test groups via the Makefile targets --------------------------
+fail=0
+run() {  # run one make target ($1); on failure emit a CI error annotation and latch fail=1
+    local target="$1"
+    log "make $target"
+    make "$target" && return 0
+    echo "::error::GPU test group failed: $target"
+    fail=1
+}
+run test-math-cuda                  # Group 1: kernel parity
+run test-cuda-integration           # Group 2: end-to-end GPU dispatch + proof verifies
+run test-cuda-fallback              # Group 3: GPU error -> CPU fallback still verifies
+run test-prover-cuda                # Group 4: prover/stark/crypto/ecsm suite on the GPU path
+run test-prover-comprehensive-cuda  # Group 5: comprehensive all-instructions prove on GPU
+
+if [ "$fail" -ne 0 ]; then
+    log "FAILED — one or more GPU test groups failed"
+    exit 1
+fi
+log "all GPU test groups passed"