Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions .github/workflows/gpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -232,8 +232,9 @@ jobs:
} >> "$GITHUB_STEP_SUMMARY"
echo "instance $IID ready — proceeding to tests"
exit 0
else
rc=$?
fi
rc=$?
echo "::warning::instance $IID not ready (rc=$rc: 1=timeout, 2=image/scheduling failure); destroying and trying another host"
destroy "$IID"; rm -f "$RUNNER_TEMP/vast_instance_id"
done
Expand Down Expand Up @@ -267,7 +268,10 @@ jobs:
# so we test exactly what will land. workflow_dispatch: the chosen branch ref.
REF: ${{ github.ref }}
run: |
SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
# ServerAlive*: ConnectTimeout only covers connection setup; without keepalives a box
# that wedges or drops off the network mid-suite would hang this step silently until
# the 240-minute job timeout. 60s x 10 fails the run ~10 minutes after the box goes dark.
SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=60 -o ServerAliveCountMax=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
# Defense-in-depth: never interpolate an unvalidated ref into the remote `bash -lc`.
case "$REF" in
''|*[!A-Za-z0-9._/-]*) echo "::error::invalid ref: '$REF'"; exit 1 ;;
Expand Down Expand Up @@ -327,10 +331,21 @@ jobs:
echo; echo "No individual test failures parsed (build/infra error?). Last lines:"
echo '```'; tail -n 40 "$OUT" 2>/dev/null || echo "(no output captured)"; echo '```'
fi
echo; echo "<sub>Full output is in the \"Run GPU tests\" step log.</sub>"
echo; echo "<sub>Full output: \"Run GPU tests\" step log, or the \`gpu-test-log\` artifact (survives UI log truncation).</sub>"
fi
} >> "$GITHUB_STEP_SUMMARY"

# The step log gets truncated/rotated in the UI for multi-hour runs (see the machine-info
# comment above); the artifact keeps the complete output retrievable.
- name: Upload full test log
if: always() && (steps.tests.outcome == 'success' || steps.tests.outcome == 'failure')
uses: actions/upload-artifact@v4
with:
name: gpu-test-log
path: ${{ runner.temp }}/gpu_test_out.txt
if-no-files-found: ignore
retention-days: 14

# --- Teardown: ALWAYS destroy the instance (cost guardrail) ---
- name: Destroy instance
if: always()
Expand Down
7 changes: 4 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
compile-programs compile-recursion-elfs clean-asm clean-rust clean-bench clean-shared \
clean-recursion-elfs clean test test-asm \
test-rust test-executor test-flamegraph flamegraph-prover test-profile-recursion test-profile-recursion-single test-profile-recursion-multi \
test-fast test-prover test-prover-all test-disk-spill test-math-cuda test-cuda-integration test-cuda-fallback \
test-fast test-prover test-prover-all test-prover-debug test-disk-spill test-math-cuda test-cuda-integration test-cuda-fallback \
test-prover-cuda test-prover-comprehensive-cuda \
bench-math-cuda bench-prover bench-prover-cuda build check clippy fmt lint regen-ethrex-fixtures \
update-ethrex-fixture-checksums check-ethrex-fixture-checksums
Expand Down Expand Up @@ -288,7 +288,7 @@ test-math-cuda:
cargo test -p math-cuda --release

# End-to-end cuda dispatch coverage (requires NVIDIA GPU + nvcc).
# Asserts every R1/R2/R3 GPU counter fired on a real prove.
# Asserts the R1-R4 GPU dispatch counters fired on a real prove.
test-cuda-integration:
cargo test -p lambda-vm-prover --release --features cuda \
--test cuda_path_integration -- --ignored --nocapture
Expand All @@ -308,7 +308,8 @@ test-prover-cuda:
--features lambda-vm-prover/cuda -- --test-threads=1

# The comprehensive all-instructions prove (ignored by default) on the GPU path (requires
# NVIDIA GPU + nvcc). GPU counterpart of CPU CI's merge-queue-only comprehensive job.
# NVIDIA GPU + nvcc). GPU counterpart of the all-instructions half of CPU CI's merge-queue-only
# comprehensive job (the CPU job also runs test_recursion_execute; recursion has no GPU leg yet).
test-prover-comprehensive-cuda:
cargo test --release -p lambda-vm-prover --features cuda \
test_prove_elfs_all_instructions_64_full -- --ignored --test-threads=1 --nocapture
Expand Down
6 changes: 3 additions & 3 deletions docs/roadmap.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ The first version is going to use the primitives contained in [lambdaworks](http
| Feature | Description | Status |
|---------------------------- |-----------------------------------|--------------|
| Fields | Improve field performance using assembly | Planned |
| GPU-Fast-Fourier transform | Implement GPU version of FFT | Planned |
| GPU-Merkle tree | Implement GPU version for Merkle trees | Planned |
| GPU-Fast-Fourier transform | Implement GPU version of FFT | Done |
| GPU-Merkle tree | Implement GPU version for Merkle trees | Done |
| Parallel trace generation | Use GPU for fast trace generation | Planned |
| GPU-FRI | Perform FRI on GPU | Planned |
| GPU-FRI | Perform FRI on GPU | Done |

2 changes: 1 addition & 1 deletion prover/src/tests/prove_elfs_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1440,7 +1440,7 @@ fn test_verify_rejects_tampered_public_output() {
/// - Division: DIV, DIVU, REM, REMU
/// - Control: LUI, AUIPC, JALR
#[test]
#[ignore] // Slow: run with `cargo test --ignored` or `make test-prover-all`
#[ignore] // Slow: run with `cargo test -- --ignored` or `make test-prover-all`
fn test_prove_elfs_all_instructions_64_full() {
let _ = env_logger::builder().is_test(true).try_init();

Expand Down
11 changes: 11 additions & 0 deletions scripts/gpu_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,17 @@ nvidia-smi --query-gpu=name,driver_version,compute_cap --format=csv,noheader
# conservative CUDA version binds a known driver-symbol set instead. (This is cudarc's
# host-side driver-API floor — independent of the PTX/driver version the offer filter targets.)
log "pinning cudarc to $CUDARC_PIN"
# Guard the sed anchors: if math-cuda's cudarc features are ever renamed/reformatted, a silent
# no-op here would bring the fallback-latest driver-symbol panic back with a confusing signature.
for anchor in '"cuda-version-from-build-system"' '"fallback-latest"'; do
grep -qF "$anchor" crypto/math-cuda/Cargo.toml \
|| { echo "ERROR: sed anchor $anchor not found in crypto/math-cuda/Cargo.toml — update this script's cudarc pin" >&2; exit 1; }
done
# Restore the tracked file on exit so a manual run on a dev box doesn't leave the tree dirty
# (CI doesn't need this — the workflow re-checks-out before every run — but it's harmless there).
CUDARC_TOML_BACKUP="$(mktemp)"
cp crypto/math-cuda/Cargo.toml "$CUDARC_TOML_BACKUP"
trap 'cp "$CUDARC_TOML_BACKUP" crypto/math-cuda/Cargo.toml; rm -f "$CUDARC_TOML_BACKUP"' EXIT
sed -i "s/\"cuda-version-from-build-system\"/\"${CUDARC_PIN}\"/; /\"fallback-latest\"/d" \
crypto/math-cuda/Cargo.toml

Expand Down
Loading