#!/usr/bin/env bash

# Single-node launcher for the CoreWeave B300 cluster (runner b300-cw).
# Follows the launch_b200-cw.sh CoreWeave template: allocate one node, import
# the container to that node's local /tmp under flock, then srun the benchmark
# in the same allocation. Importing to node-local /tmp (rather than shared NFS)
# avoids the enroot aufs-whiteout failures that root-squash NFS triggers
# (see launch_b300-nv.sh).
#
# Runner registration: .github/configs/runners.yaml maps the label `b300-cw`
# to the runner `b300-cw_0`.
#
# Environment read by this script:
#   EXP_NAME         experiment name; the text before the first '_' is the
#                    model code used in the benchmark script name
#   FRAMEWORK        inference engine (e.g. trt, vllm)
#   SPEC_DECODING    "mtp" selects the *_mtp benchmark script variant
#   SCENARIO_SUBDIR  optional path prefix under benchmarks/single_node/
#   PRECISION        precision tag used in the benchmark script name
#   MODEL            model id; openai/gpt-oss-120b with trt skips the import
#   IMAGE            container image reference
#   TP               number of B300 GPUs requested from Slurm
#   RUNNER_NAME      used as the Slurm job name
#   GITHUB_WORKSPACE checkout mounted at /workspace inside the container
#   HF_HUB_CACHE     in-container mount point for the HF hub cache
#
# Exit status: 1 if the allocation or the image import fails, otherwise the
# exit status of the benchmark srun.

export HF_HUB_CACHE_MOUNT="/tmp/gharunner/hf-hub-cache"
export PORT=8888

MODEL_CODE="${EXP_NAME%%_*}"

FRAMEWORK_SUFFIX=""
if [[ "$FRAMEWORK" == "trt" ]]; then
  FRAMEWORK_SUFFIX="_trt"
fi

SPEC_SUFFIX=""
if [[ "$SPEC_DECODING" == "mtp" ]]; then
  SPEC_SUFFIX="_mtp"
fi

# Prefer a framework-tagged script (e.g. dsv4_fp4_b300_vllm.sh) so models
# with multiple inference engines can coexist; fall back to the historical
# name without an engine suffix (`_trt` for trt, bare for everyone else).
BENCH_BASE="benchmarks/single_node/${SCENARIO_SUBDIR}${MODEL_CODE}_${PRECISION}_b300"
BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh"
if [[ ! -f "$BENCH_SCRIPT" ]]; then
  BENCH_SCRIPT="${BENCH_BASE}${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh"
fi

PARTITION="b300"

# Derive a filesystem-safe squash file name from the image reference by
# replacing every '/', ':', '@' and '#' with '_'.
IMAGE_UNSAFE_CHARS='[/:@#]'
SQUASH_FILE="/tmp/gharunner/squash/${IMAGE//$IMAGE_UNSAFE_CHARS/_}.sqsh"
LOCK_FILE="${SQUASH_FILE}.lock"

CONTAINER_MOUNT_DIR=/workspace

# Release the Slurm allocation whether the script finishes normally or bails
# out early; a no-op until salloc has actually granted a job.
JOB_ID=""
release_allocation() {
  if [[ -n "$JOB_ID" ]]; then
    scancel "$JOB_ID"
  fi
}
trap release_allocation EXIT

set -x

# salloc prints "Granted job allocation <id>" on stderr; mirror its output to
# our stderr for the CI log and extract the numeric job id.
JOB_ID=$(salloc --partition="$PARTITION" --gres="gpu:b300:$TP" --time=180 \
  --no-shell --job-name="$RUNNER_NAME" 2>&1 \
  | tee /dev/stderr \
  | grep -oP 'Granted job allocation \K[0-9]+')

if [[ -z "$JOB_ID" ]]; then
  echo "ERROR: salloc failed to allocate a job" >&2
  exit 1
fi

# Use Docker image directly for openai/gpt-oss-120b with trt, otherwise use squash file
if [[ "$MODEL" == "openai/gpt-oss-120b" && "$FRAMEWORK" == "trt" ]]; then
  CONTAINER_IMAGE=$IMAGE
else
  # Use flock to serialize concurrent imports to the same squash file.
  # mkdir on the worker first: /tmp/gharunner is node-local and may not
  # exist on a freshly allocated node.
  # Host-side values are handed to the remote shell as positional arguments
  # instead of being spliced into the script text, so unusual characters in
  # them cannot change what the remote shell executes.
  if ! srun --jobid="$JOB_ID" --job-name="$RUNNER_NAME" bash -c '
    squash_file=$1
    lock_file=$2
    hf_cache_dir=$3
    image=$4
    mkdir -p /tmp/gharunner/squash "$hf_cache_dir"
    exec 9>"$lock_file"
    flock -w 600 9 || { echo "Failed to acquire lock for $squash_file"; exit 1; }
    if unsquashfs -l "$squash_file" > /dev/null 2>&1; then
      echo "Squash file already exists and is valid, skipping import"
    else
      rm -f "$squash_file"
      enroot import -o "$squash_file" "docker://$image"
    fi
  ' bash "$SQUASH_FILE" "$LOCK_FILE" "$HF_HUB_CACHE_MOUNT" "$IMAGE"; then
    # Without a usable squash file the benchmark step cannot start; stop here
    # rather than launching srun against a missing or partial image.
    echo "ERROR: failed to prepare container image for $IMAGE" >&2
    exit 1
  fi
  # Squash file lives on the allocated worker node's /tmp, which is not
  # visible from the host, so realpath on the host would return empty.
  # Pass the path as-is; srun resolves it inside the job.
  CONTAINER_IMAGE=$SQUASH_FILE
fi

srun --jobid="$JOB_ID" \
  --container-image="$CONTAINER_IMAGE" \
  --container-mounts="$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \
  --container-mount-home \
  --container-workdir="$CONTAINER_MOUNT_DIR" \
  --no-container-entrypoint --export=ALL \
  bash "$BENCH_SCRIPT"
BENCH_STATUS=$?

# Report the benchmark's own result; the EXIT trap cancels the allocation.
exit "$BENCH_STATUS"