Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/configs/runners.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ b300:
- 'b300-nv_8'
b300-p1:
- 'b300-p1'
b300-cw:
- 'b300-cw_0'
gb300:
- 'gb300-nv_0'
- 'gb300-nv_1'
Expand Down
72 changes: 72 additions & 0 deletions runners/launch_b300-cw.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/usr/bin/env bash

# Single-node launcher for the CoreWeave B300 cluster (runner b300-cw).
# Follows the launch_b200-cw.sh CoreWeave template: allocate one node, import
# the container to that node's local /tmp under flock, then srun the benchmark
# in the same allocation. Importing to node-local /tmp (rather than shared NFS)
# avoids the enroot aufs-whiteout failures that root-squash NFS triggers
# (see launch_b300-nv.sh).

# Exported so that child processes of this launcher inherit them.
export PORT=8888
export HF_HUB_CACHE_MOUNT="/tmp/gharunner/hf-hub-cache"

# Model code: EXP_NAME with everything from its first underscore removed.
MODEL_CODE="${EXP_NAME%%_*}"

# Engine suffix used by the historical script names: only trt carries one.
if [[ "$FRAMEWORK" == "trt" ]]; then
  FRAMEWORK_SUFFIX="_trt"
else
  FRAMEWORK_SUFFIX=""
fi

# Speculative-decoding suffix: only mtp carries one.
if [[ "$SPEC_DECODING" == "mtp" ]]; then
  SPEC_SUFFIX="_mtp"
else
  SPEC_SUFFIX=""
fi

# Prefer a framework-tagged script (e.g. dsv4_fp4_b300_vllm.sh) so models
# with multiple inference engines can coexist; when no such file exists, use
# the historical name without an engine tag (`_trt` for trt, bare otherwise).
BENCH_BASE="benchmarks/single_node/${SCENARIO_SUBDIR}${MODEL_CODE}_${PRECISION}_b300"
BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh"
[[ -f "$BENCH_SCRIPT" ]] || BENCH_SCRIPT="${BENCH_BASE}${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh"

# Directory inside the container where the workspace is mounted and used as
# the working directory.
CONTAINER_MOUNT_DIR=/workspace

# Slurm partition the allocation is requested from.
PARTITION="b300"

# Squash file name is derived from IMAGE by replacing every '/', ':', '@'
# and '#' with '_'. The lock file sits next to it and serializes imports.
SQUASH_FILE="/tmp/gharunner/squash/${IMAGE//[\/:@#]/_}.sqsh"
LOCK_FILE="${SQUASH_FILE}.lock"

# Trace every command from here on so the log shows the exact Slurm calls.
set -x

# Request an allocation of $TP b300 GPUs without spawning a shell
# (--no-shell), so the job outlives this command and later srun calls can
# attach to it via --jobid. salloc's output is echoed to stderr for the log
# and the numeric job id is extracted from its "Granted job allocation" line.
JOB_ID=$(
  salloc --partition="$PARTITION" --gres="gpu:b300:$TP" --time=180 --no-shell \
    --job-name="$RUNNER_NAME" 2>&1 \
    | tee /dev/stderr \
    | grep -oP 'Granted job allocation \K[0-9]+'
)

# No job id means salloc never reported a granted allocation; nothing to
# cancel, so just report the failure on stderr and stop.
if [[ -z "$JOB_ID" ]]; then
  echo "ERROR: salloc failed to allocate a job" >&2
  exit 1
fi

# Use Docker image directly for openai/gpt-oss-120b with trt, otherwise use squash file
if [[ "$MODEL" == "openai/gpt-oss-120b" && "$FRAMEWORK" == "trt" ]]; then
  CONTAINER_IMAGE=$IMAGE
else
  # Use flock to serialize concurrent imports to the same squash file.
  # mkdir on the worker first: /tmp/gharunner is node-local and may not
  # exist on a freshly allocated node.
  # The outer double quotes mean every \$VAR below is expanded on this host
  # before the script text is handed to bash on the worker.
  if ! srun --jobid="$JOB_ID" --job-name="$RUNNER_NAME" bash -c "
    mkdir -p /tmp/gharunner/squash \"$HF_HUB_CACHE_MOUNT\"
    exec 9>\"$LOCK_FILE\"
    flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
    if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
      echo 'Squash file already exists and is valid, skipping import'
    else
      rm -f \"$SQUASH_FILE\"
      enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
    fi
  "; then
    # The import step failed (lock timeout, enroot error, srun error).
    # Launching the benchmark would only fail later on a missing image, so
    # release the allocation and stop here, as the salloc failure path does.
    echo "ERROR: failed to prepare container image $SQUASH_FILE" >&2
    scancel "$JOB_ID"
    exit 1
  fi
  # Squash file lives on the allocated worker node's /tmp, which is not
  # visible from the host, so realpath on the host would return empty.
  # Pass the path as-is; srun resolves it inside the job.
  CONTAINER_IMAGE=$SQUASH_FILE
fi

# Run the benchmark script inside the container, within the existing
# allocation. The workspace is mounted at $CONTAINER_MOUNT_DIR and the host
# HF cache directory at $HF_HUB_CACHE (taken from the caller's environment;
# it is not set in this script). Arguments are quoted so a path containing
# whitespace cannot split into several srun arguments.
srun --jobid="$JOB_ID" \
  --container-image="$CONTAINER_IMAGE" \
  --container-mounts="$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \
  --container-mount-home \
  --container-workdir="$CONTAINER_MOUNT_DIR" \
  --no-container-entrypoint --export=ALL \
  bash "$BENCH_SCRIPT"
# Capture the benchmark's status before scancel overwrites $?.
BENCH_STATUS=$?

# Always release the allocation, whether or not the benchmark succeeded.
scancel "$JOB_ID"

# Report the benchmark's outcome, not scancel's, as this script's exit status.
exit "$BENCH_STATUS"