From 6eeadc4f4908d6c083c56cbc21edba002ffa6d8a Mon Sep 17 00:00:00 2001 From: Jordan Nanos Date: Fri, 12 Jun 2026 22:05:06 +0000 Subject: [PATCH 1/2] Add b300-cw (CoreWeave B300) runner launch script and pool New CoreWeave B300 cluster: 5 nodes of 8x B300, Slurm partition b300, shared storage on /mnt/vast. Single-node launcher adapted from launch_h200-cw.sh (same CoreWeave salloc + enroot/pyxis pattern) with the framework-tagged benchmark-script selection from launch_b300-nv.sh. Multi-node is not wired up yet and exits with a clear error. Registers pool key b300-cw with one runner (b300-cw_0), following the gb300-cw naming convention. Co-Authored-By: Claude Fable 5 --- .github/configs/runners.yaml | 2 + runners/launch_b300-cw.sh | 71 ++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100755 runners/launch_b300-cw.sh diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index eee8405d0..d6f9217f8 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -145,6 +145,8 @@ b300: - 'b300-nv_8' b300-p1: - 'b300-p1' +b300-cw: +- 'b300-cw_0' gb300: - 'gb300-nv_0' - 'gb300-nv_1' diff --git a/runners/launch_b300-cw.sh b/runners/launch_b300-cw.sh new file mode 100755 index 000000000..2130e16f3 --- /dev/null +++ b/runners/launch_b300-cw.sh @@ -0,0 +1,71 @@ +#!/usr/bin/env bash + +# Launches single-node benchmarks on the b300-cw (CoreWeave B300) cluster. +# Adapted from launch_h200-cw.sh (same CoreWeave salloc/pyxis pattern) with +# the benchmark-script selection logic from launch_b300-nv.sh. The runner +# lives on the Slurm login node; jobs are scheduled onto the 8xB300 compute +# nodes via salloc/srun. +# +# Multi-node (srt-slurm/dynamo) is not wired up yet on this cluster — adapt +# the IS_MULTINODE branch of launch_b300-nv.sh when needed. + +export HF_HUB_CACHE_MOUNT="/mnt/vast/hf_hub_cache" +export AIPERF_MMAP_CACHE_HOST_PATH="/mnt/vast/aiperf_mmap_cache" +export PORT=8888 + +if [[ "$IS_MULTINODE" == "true" ]]; then + echo "Multi-node benchmarks are not yet supported on b300-cw." >&2 + exit 1 +fi + +PARTITION="b300" +SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +LOCK_FILE="${SQUASH_FILE}.lock" + +SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') +# Prefer a framework-tagged script (e.g. minimaxm2.5_fp4_b300_trt.sh) so models +# with multiple inference engines can coexist; fall back to the historical +# name without an engine suffix (`_trt` for trt, bare for everyone else) +# for scripts that haven't been retagged yet. +BENCH_BASE="benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_b300" +BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh" +if [[ ! -f "$BENCH_SCRIPT" ]]; then + LEGACY_FW_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') + BENCH_SCRIPT="${BENCH_BASE}${LEGACY_FW_SUFFIX}${SPEC_SUFFIX}.sh" +fi + +if [[ -n "${BENCH_SCRIPT_OVERRIDE:-}" ]]; then + BENCH_SCRIPT="$BENCH_SCRIPT_OVERRIDE" +fi + +set -x + +JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:b300:$TP --time="${SALLOC_TIME_LIMIT:-180}" --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') + +if [ -z "$JOB_ID" ]; then + echo "ERROR: salloc failed to allocate a job" + exit 1 +fi + +# Use flock to serialize concurrent imports to the same squash file +srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c " + exec 9>\"$LOCK_FILE\" + flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } + if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then + echo 'Squash file already exists and is valid, skipping import' + else + rm -f \"$SQUASH_FILE\" + enroot import -o \"$SQUASH_FILE\" docker://$IMAGE + fi +" +CONTAINER_IMAGE=$(realpath $SQUASH_FILE) + +srun --jobid=$JOB_ID \ +--container-image=$CONTAINER_IMAGE \ +--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,$AIPERF_MMAP_CACHE_HOST_PATH:/aiperf_mmap_cache \ +--container-mount-home \ +--container-workdir=/workspace/ \ +--no-container-entrypoint --export=ALL,AIPERF_DATASET_MMAP_CACHE_DIR=/aiperf_mmap_cache \ +bash "$BENCH_SCRIPT" + +scancel $JOB_ID From e3c622170ddbb759640ec7afc7fa21ec199617a9 Mon Sep 17 00:00:00 2001 From: Jordan Nanos Date: Fri, 12 Jun 2026 22:43:40 +0000 Subject: [PATCH 2/2] launch_b300-cw.sh: use node-local /tmp import (proven CoreWeave pattern) Replace the initial /mnt/vast (shared NFS) import with the launch_b200-cw.sh node-local /tmp pattern: import the container on the allocated worker under flock and pass the squash path as-is. Avoids the enroot aufs-whiteout failures root-squash NFS triggers (documented in launch_b300-nv.sh), and matches the launcher exercised by the b300-cw smoke test. Co-Authored-By: Claude Fable 5 --- runners/launch_b300-cw.sh | 89 ++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 44 deletions(-) diff --git a/runners/launch_b300-cw.sh b/runners/launch_b300-cw.sh index 2130e16f3..df680f671 100755 --- a/runners/launch_b300-cw.sh +++ b/runners/launch_b300-cw.sh @@ -1,71 +1,72 @@ #!/usr/bin/env bash -# Launches single-node benchmarks on the b300-cw (CoreWeave B300) cluster. -# Adapted from launch_h200-cw.sh (same CoreWeave salloc/pyxis pattern) with -# the benchmark-script selection logic from launch_b300-nv.sh. The runner -# lives on the Slurm login node; jobs are scheduled onto the 8xB300 compute -# nodes via salloc/srun. -# -# Multi-node (srt-slurm/dynamo) is not wired up yet on this cluster — adapt -# the IS_MULTINODE branch of launch_b300-nv.sh when needed. +# Single-node launcher for the CoreWeave B300 cluster (runner b300-cw). +# Follows the launch_b200-cw.sh CoreWeave template: allocate one node, import +# the container to that node's local /tmp under flock, then srun the benchmark +# in the same allocation. Importing to node-local /tmp (rather than shared NFS) +# avoids the enroot aufs-whiteout failures that root-squash NFS triggers +# (see launch_b300-nv.sh). -export HF_HUB_CACHE_MOUNT="/mnt/vast/hf_hub_cache" -export AIPERF_MMAP_CACHE_HOST_PATH="/mnt/vast/aiperf_mmap_cache" +export HF_HUB_CACHE_MOUNT="/tmp/gharunner/hf-hub-cache" export PORT=8888 -if [[ "$IS_MULTINODE" == "true" ]]; then - echo "Multi-node benchmarks are not yet supported on b300-cw." >&2 - exit 1 -fi - -PARTITION="b300" -SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -LOCK_FILE="${SQUASH_FILE}.lock" - +MODEL_CODE="${EXP_NAME%%_*}" +FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') -# Prefer a framework-tagged script (e.g. minimaxm2.5_fp4_b300_trt.sh) so models +# Prefer a framework-tagged script (e.g. dsv4_fp4_b300_vllm.sh) so models # with multiple inference engines can coexist; fall back to the historical -# name without an engine suffix (`_trt` for trt, bare for everyone else) -# for scripts that haven't been retagged yet. -BENCH_BASE="benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_b300" +# name without an engine suffix (`_trt` for trt, bare for everyone else). +BENCH_BASE="benchmarks/single_node/${SCENARIO_SUBDIR}${MODEL_CODE}_${PRECISION}_b300" BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh" if [[ ! -f "$BENCH_SCRIPT" ]]; then - LEGACY_FW_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') - BENCH_SCRIPT="${BENCH_BASE}${LEGACY_FW_SUFFIX}${SPEC_SUFFIX}.sh" + BENCH_SCRIPT="${BENCH_BASE}${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh" fi -if [[ -n "${BENCH_SCRIPT_OVERRIDE:-}" ]]; then - BENCH_SCRIPT="$BENCH_SCRIPT_OVERRIDE" -fi +PARTITION="b300" +SQUASH_FILE="/tmp/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +LOCK_FILE="${SQUASH_FILE}.lock" + +CONTAINER_MOUNT_DIR=/workspace set -x -JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:b300:$TP --time="${SALLOC_TIME_LIMIT:-180}" --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') +JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:b300:$TP --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') if [ -z "$JOB_ID" ]; then echo "ERROR: salloc failed to allocate a job" exit 1 fi -# Use flock to serialize concurrent imports to the same squash file -srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c " - exec 9>\"$LOCK_FILE\" - flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } - if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then - echo 'Squash file already exists and is valid, skipping import' - else - rm -f \"$SQUASH_FILE\" - enroot import -o \"$SQUASH_FILE\" docker://$IMAGE - fi -" -CONTAINER_IMAGE=$(realpath $SQUASH_FILE) +# Use Docker image directly for openai/gpt-oss-120b with trt, otherwise use squash file +if [[ "$MODEL" == "openai/gpt-oss-120b" && "$FRAMEWORK" == "trt" ]]; then + CONTAINER_IMAGE=$IMAGE +else + # Use flock to serialize concurrent imports to the same squash file. + # mkdir on the worker first: /tmp/gharunner is node-local and may not + # exist on a freshly allocated node. + srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c " + mkdir -p /tmp/gharunner/squash \"$HF_HUB_CACHE_MOUNT\" + exec 9>\"$LOCK_FILE\" + flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } + if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then + echo 'Squash file already exists and is valid, skipping import' + else + rm -f \"$SQUASH_FILE\" + enroot import -o \"$SQUASH_FILE\" docker://$IMAGE + fi + " + # Squash file lives on the allocated worker node's /tmp, which is not + # visible from the host, so realpath on the host would return empty. + # Pass the path as-is; srun resolves it inside the job. + CONTAINER_IMAGE=$SQUASH_FILE +fi srun --jobid=$JOB_ID \ --container-image=$CONTAINER_IMAGE \ ---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,$AIPERF_MMAP_CACHE_HOST_PATH:/aiperf_mmap_cache \ +--container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --container-mount-home \ ---container-workdir=/workspace/ \ ---no-container-entrypoint --export=ALL,AIPERF_DATASET_MMAP_CACHE_DIR=/aiperf_mmap_cache \ +--container-workdir=$CONTAINER_MOUNT_DIR \ +--no-container-entrypoint --export=ALL \ bash "$BENCH_SCRIPT" scancel $JOB_ID