yetanotherco · MauroToscano · Jul 3, 2026 · Jul 3, 2026
diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml
@@ -232,8 +232,9 @@ jobs:
               } >> "$GITHUB_STEP_SUMMARY"
               echo "instance $IID ready — proceeding to tests"
               exit 0
+            else
+              rc=$?
             fi
-            rc=$?
             echo "::warning::instance $IID not ready (rc=$rc: 1=timeout, 2=image/scheduling failure); destroying and trying another host"
             destroy "$IID"; rm -f "$RUNNER_TEMP/vast_instance_id"
           done
@@ -267,7 +268,10 @@ jobs:
           # so we test exactly what will land. workflow_dispatch: the chosen branch ref.
           REF: ${{ github.ref }}
         run: |
-          SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
+          # ServerAlive*: ConnectTimeout only covers connection setup; without keepalives a box
+          # that wedges or drops off the network mid-suite would hang this step silently until
+          # the 240-minute job timeout. 60s x 10 fails the run ~10 minutes after the box goes dark.
+          SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=60 -o ServerAliveCountMax=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
           # Defense-in-depth: never interpolate an unvalidated ref into the remote `bash -lc`.
           case "$REF" in
             ''|*[!A-Za-z0-9._/-]*) echo "::error::invalid ref: '$REF'"; exit 1 ;;
@@ -327,10 +331,21 @@ jobs:
                 echo; echo "No individual test failures parsed (build/infra error?). Last lines:"
                 echo '```'; tail -n 40 "$OUT" 2>/dev/null || echo "(no output captured)"; echo '```'
               fi
-              echo; echo "<sub>Full output is in the \"Run GPU tests\" step log.</sub>"
+              echo; echo "<sub>Full output: \"Run GPU tests\" step log, or the \`gpu-test-log\` artifact (survives UI log truncation).</sub>"
             fi
           } >> "$GITHUB_STEP_SUMMARY"
 
+      # The step log gets truncated/rotated in the UI for multi-hour runs (see the machine-info
+      # comment above); the artifact keeps the complete output retrievable.
+      - name: Upload full test log
+        if: always() && (steps.tests.outcome == 'success' || steps.tests.outcome == 'failure')
+        uses: actions/upload-artifact@v4
+        with:
+          name: gpu-test-log
+          path: ${{ runner.temp }}/gpu_test_out.txt
+          if-no-files-found: ignore
+          retention-days: 14
+
       # --- Teardown: ALWAYS destroy the instance (cost guardrail) ---
       - name: Destroy instance
         if: always()

diff --git a/Makefile b/Makefile
@@ -2,7 +2,7 @@
 compile-programs compile-recursion-elfs clean-asm clean-rust clean-bench clean-shared \
 clean-recursion-elfs clean test test-asm \
 test-rust test-executor test-flamegraph flamegraph-prover test-profile-recursion test-profile-recursion-single test-profile-recursion-multi \
-test-fast test-prover test-prover-all test-disk-spill test-math-cuda test-cuda-integration test-cuda-fallback \
+test-fast test-prover test-prover-all test-prover-debug test-disk-spill test-math-cuda test-cuda-integration test-cuda-fallback \
 test-prover-cuda test-prover-comprehensive-cuda \
 bench-math-cuda bench-prover bench-prover-cuda build check clippy fmt lint regen-ethrex-fixtures \
 update-ethrex-fixture-checksums check-ethrex-fixture-checksums
@@ -288,7 +288,7 @@ test-math-cuda:
 	cargo test -p math-cuda --release
 
 # End-to-end cuda dispatch coverage (requires NVIDIA GPU + nvcc).
-# Asserts every R1/R2/R3 GPU counter fired on a real prove.
+# Asserts the R1-R4 GPU dispatch counters fired on a real prove.
 test-cuda-integration:
 	cargo test -p lambda-vm-prover --release --features cuda \
 	    --test cuda_path_integration -- --ignored --nocapture
@@ -308,7 +308,8 @@ test-prover-cuda:
 	    --features lambda-vm-prover/cuda -- --test-threads=1
 
 # The comprehensive all-instructions prove (ignored by default) on the GPU path (requires
-# NVIDIA GPU + nvcc). GPU counterpart of CPU CI's merge-queue-only comprehensive job.
+# NVIDIA GPU + nvcc). GPU counterpart of the all-instructions half of CPU CI's merge-queue-only
+# comprehensive job (the CPU job also runs test_recursion_execute; recursion has no GPU leg yet).
 test-prover-comprehensive-cuda:
 	cargo test --release -p lambda-vm-prover --features cuda \
 	    test_prove_elfs_all_instructions_64_full -- --ignored --test-threads=1 --nocapture

diff --git a/docs/roadmap.md b/docs/roadmap.md
@@ -55,8 +55,8 @@ The first version is going to use the primitives contained in [lambdaworks](http
 | Feature                     | Description                       | Status       |
 |---------------------------- |-----------------------------------|--------------|
 | Fields                      | Improve field performance using assembly | Planned |
-| GPU-Fast-Fourier transform      | Implement GPU version of FFT | Planned |
-| GPU-Merkle tree                 | Implement GPU version for Merkle trees | Planned |
+| GPU-Fast-Fourier transform      | Implement GPU version of FFT | Done |
+| GPU-Merkle tree                 | Implement GPU version for Merkle trees | Done |
 | Parallel trace generation   | Use GPU for fast trace generation | Planned |
-| GPU-FRI | Perform FRI on GPU | Planned |
+| GPU-FRI | Perform FRI on GPU | Done |
 
diff --git a/prover/src/tests/prove_elfs_tests.rs b/prover/src/tests/prove_elfs_tests.rs
@@ -1440,7 +1440,7 @@ fn test_verify_rejects_tampered_public_output() {
 /// - Division: DIV, DIVU, REM, REMU
 /// - Control: LUI, AUIPC, JALR
 #[test]
-#[ignore] // Slow: run with `cargo test --ignored` or `make test-prover-all`
+#[ignore] // Slow: run with `cargo test -- --ignored` or `make test-prover-all`
 fn test_prove_elfs_all_instructions_64_full() {
     let _ = env_logger::builder().is_test(true).try_init();
 

diff --git a/scripts/gpu_test.sh b/scripts/gpu_test.sh
@@ -44,6 +44,17 @@ nvidia-smi --query-gpu=name,driver_version,compute_cap --format=csv,noheader
 # conservative CUDA version binds a known driver-symbol set instead. (This is cudarc's
 # host-side driver-API floor — independent of the PTX/driver version the offer filter targets.)
 log "pinning cudarc to $CUDARC_PIN"
+# Guard the sed anchors: if math-cuda's cudarc features are ever renamed/reformatted, a silent
+# no-op here would bring the fallback-latest driver-symbol panic back with a confusing signature.
+for anchor in '"cuda-version-from-build-system"' '"fallback-latest"'; do
+    grep -qF "$anchor" crypto/math-cuda/Cargo.toml \
+        || { echo "ERROR: sed anchor $anchor not found in crypto/math-cuda/Cargo.toml — update this script's cudarc pin" >&2; exit 1; }
+done
+# Restore the tracked file on exit so a manual run on a dev box doesn't leave the tree dirty
+# (CI doesn't need this — the workflow checks the repo out again before every run — but it's harmless there).
+CUDARC_TOML_BACKUP="$(mktemp)"
+cp crypto/math-cuda/Cargo.toml "$CUDARC_TOML_BACKUP"
+trap 'cp "$CUDARC_TOML_BACKUP" crypto/math-cuda/Cargo.toml; rm -f "$CUDARC_TOML_BACKUP"' EXIT
 sed -i "s/\"cuda-version-from-build-system\"/\"${CUDARC_PIN}\"/; /\"fallback-latest\"/d" \
     crypto/math-cuda/Cargo.toml