From 6eeadc4f4908d6c083c56cbc21edba002ffa6d8a Mon Sep 17 00:00:00 2001
From: Jordan Nanos
 <jordan@slurm-login-0.slurm-login.tenant-slurm.svc.cluster.local>
Date: Fri, 12 Jun 2026 22:05:06 +0000
Subject: [PATCH 1/2] Add b300-cw (CoreWeave B300) runner launch script and
 pool

New CoreWeave B300 cluster: 5 nodes of 8x B300, Slurm partition b300,
shared storage on /mnt/vast. Single-node launcher adapted from
launch_h200-cw.sh (same CoreWeave salloc + enroot/pyxis pattern) with
the framework-tagged benchmark-script selection from launch_b300-nv.sh.
Multi-node is not wired up yet and exits with a clear error.

Registers pool key b300-cw with one runner (b300-cw_0), following the
gb300-cw naming convention.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .github/configs/runners.yaml |  2 +
 runners/launch_b300-cw.sh    | 71 ++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)
 create mode 100755 runners/launch_b300-cw.sh

diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml
index eee8405d0..d6f9217f8 100644
--- a/.github/configs/runners.yaml
+++ b/.github/configs/runners.yaml
@@ -145,6 +145,8 @@ b300:
 - 'b300-nv_8'
 b300-p1:
 - 'b300-p1'
+b300-cw:
+- 'b300-cw_0'
 gb300:
 - 'gb300-nv_0'
 - 'gb300-nv_1'
diff --git a/runners/launch_b300-cw.sh b/runners/launch_b300-cw.sh
new file mode 100755
index 000000000..2130e16f3
--- /dev/null
+++ b/runners/launch_b300-cw.sh
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+
+# Launches single-node benchmarks on the b300-cw (CoreWeave B300) cluster.
+# Adapted from launch_h200-cw.sh (same CoreWeave salloc/pyxis pattern) with
+# the benchmark-script selection logic from launch_b300-nv.sh. The runner
+# lives on the Slurm login node; jobs are scheduled onto the 8xB300 compute
+# nodes via salloc/srun.
+#
+# Multi-node (srt-slurm/dynamo) is not wired up yet on this cluster — adapt
+# the IS_MULTINODE branch of launch_b300-nv.sh when needed.
+
+export HF_HUB_CACHE_MOUNT="/mnt/vast/hf_hub_cache"
+export AIPERF_MMAP_CACHE_HOST_PATH="/mnt/vast/aiperf_mmap_cache"
+export PORT=8888
+
+if [[ "$IS_MULTINODE" == "true" ]]; then
+    echo "Multi-node benchmarks are not yet supported on b300-cw." >&2
+    exit 1
+fi
+
+PARTITION="b300"
+SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+LOCK_FILE="${SQUASH_FILE}.lock"
+
+SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
+# Prefer a framework-tagged script (e.g. minimaxm2.5_fp4_b300_trt.sh) so models
+# with multiple inference engines can coexist; fall back to the historical
+# name without an engine suffix (`_trt` for trt, bare for everyone else)
+# for scripts that haven't been retagged yet.
+BENCH_BASE="benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_b300"
+BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh"
+if [[ ! -f "$BENCH_SCRIPT" ]]; then
+    LEGACY_FW_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
+    BENCH_SCRIPT="${BENCH_BASE}${LEGACY_FW_SUFFIX}${SPEC_SUFFIX}.sh"
+fi
+
+if [[ -n "${BENCH_SCRIPT_OVERRIDE:-}" ]]; then
+    BENCH_SCRIPT="$BENCH_SCRIPT_OVERRIDE"
+fi
+
+set -x
+
+JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:b300:$TP --time="${SALLOC_TIME_LIMIT:-180}" --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
+
+if [ -z "$JOB_ID" ]; then
+    echo "ERROR: salloc failed to allocate a job"
+    exit 1
+fi
+
+# Use flock to serialize concurrent imports to the same squash file
+srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c "
+    exec 9>\"$LOCK_FILE\"
+    flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
+    if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
+        echo 'Squash file already exists and is valid, skipping import'
+    else
+        rm -f \"$SQUASH_FILE\"
+        enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
+    fi
+"
+CONTAINER_IMAGE=$(realpath $SQUASH_FILE)
+
+srun --jobid=$JOB_ID \
+--container-image=$CONTAINER_IMAGE \
+--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,$AIPERF_MMAP_CACHE_HOST_PATH:/aiperf_mmap_cache \
+--container-mount-home \
+--container-workdir=/workspace/ \
+--no-container-entrypoint --export=ALL,AIPERF_DATASET_MMAP_CACHE_DIR=/aiperf_mmap_cache \
+bash "$BENCH_SCRIPT"
+
+scancel $JOB_ID

From e3c622170ddbb759640ec7afc7fa21ec199617a9 Mon Sep 17 00:00:00 2001
From: Jordan Nanos
 <jordan@slurm-login-0.slurm-login.tenant-slurm.svc.cluster.local>
Date: Fri, 12 Jun 2026 22:43:40 +0000
Subject: [PATCH 2/2] launch_b300-cw.sh: use node-local /tmp import (proven
 CoreWeave pattern)

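Replace the initial /mnt/vast (shared NFS) import with the launch_b200-cw.sh
node-local /tmp pattern: import the container on the allocated worker under
flock and pass the squash path as-is. This avoids the enroot aufs-whiteout
failures that root-squash NFS triggers (documented in launch_b300-nv.sh) and
matches the launcher exercised by the b300-cw smoke test.

Behavior changes beyond the import path: HF_HUB_CACHE_MOUNT moves to
node-local /tmp/gharunner/hf-hub-cache; the aiperf mmap cache mount and the
AIPERF_DATASET_MMAP_CACHE_DIR export are dropped; the IS_MULTINODE guard,
BENCH_SCRIPT_OVERRIDE, and the SALLOC_TIME_LIMIT override (now a fixed
--time=180) are removed; and openai/gpt-oss-120b with trt now passes $IMAGE
to srun directly instead of importing a squash file.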

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 runners/launch_b300-cw.sh | 89 ++++++++++++++++++++-------------------
 1 file changed, 45 insertions(+), 44 deletions(-)

diff --git a/runners/launch_b300-cw.sh b/runners/launch_b300-cw.sh
index 2130e16f3..df680f671 100755
--- a/runners/launch_b300-cw.sh
+++ b/runners/launch_b300-cw.sh
@@ -1,71 +1,72 @@
 #!/usr/bin/env bash
 
-# Launches single-node benchmarks on the b300-cw (CoreWeave B300) cluster.
-# Adapted from launch_h200-cw.sh (same CoreWeave salloc/pyxis pattern) with
-# the benchmark-script selection logic from launch_b300-nv.sh. The runner
-# lives on the Slurm login node; jobs are scheduled onto the 8xB300 compute
-# nodes via salloc/srun.
-#
-# Multi-node (srt-slurm/dynamo) is not wired up yet on this cluster — adapt
-# the IS_MULTINODE branch of launch_b300-nv.sh when needed.
+# Single-node launcher for the CoreWeave B300 cluster (runner b300-cw).
+# Follows the launch_b200-cw.sh CoreWeave template: allocate one node, import
+# the container to that node's local /tmp under flock, then srun the benchmark
+# in the same allocation. Importing to node-local /tmp (rather than shared NFS)
+# avoids the enroot aufs-whiteout failures that root-squash NFS triggers
+# (see launch_b300-nv.sh).
 
-export HF_HUB_CACHE_MOUNT="/mnt/vast/hf_hub_cache"
-export AIPERF_MMAP_CACHE_HOST_PATH="/mnt/vast/aiperf_mmap_cache"
+export HF_HUB_CACHE_MOUNT="/tmp/gharunner/hf-hub-cache"
 export PORT=8888
 
-if [[ "$IS_MULTINODE" == "true" ]]; then
-    echo "Multi-node benchmarks are not yet supported on b300-cw." >&2
-    exit 1
-fi
-
-PARTITION="b300"
-SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
-LOCK_FILE="${SQUASH_FILE}.lock"
-
+MODEL_CODE="${EXP_NAME%%_*}"
+FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
 SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
-# Prefer a framework-tagged script (e.g. minimaxm2.5_fp4_b300_trt.sh) so models
+# Prefer a framework-tagged script (e.g. dsv4_fp4_b300_vllm.sh) so models
 # with multiple inference engines can coexist; fall back to the historical
-# name without an engine suffix (`_trt` for trt, bare for everyone else)
-# for scripts that haven't been retagged yet.
-BENCH_BASE="benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_b300"
+# name without an engine suffix (`_trt` for trt, bare for everyone else).
+BENCH_BASE="benchmarks/single_node/${SCENARIO_SUBDIR}${MODEL_CODE}_${PRECISION}_b300"
 BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh"
 if [[ ! -f "$BENCH_SCRIPT" ]]; then
-    LEGACY_FW_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
-    BENCH_SCRIPT="${BENCH_BASE}${LEGACY_FW_SUFFIX}${SPEC_SUFFIX}.sh"
+    BENCH_SCRIPT="${BENCH_BASE}${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh"
 fi
 
-if [[ -n "${BENCH_SCRIPT_OVERRIDE:-}" ]]; then
-    BENCH_SCRIPT="$BENCH_SCRIPT_OVERRIDE"
-fi
+PARTITION="b300"
+SQUASH_FILE="/tmp/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+LOCK_FILE="${SQUASH_FILE}.lock"
+
+CONTAINER_MOUNT_DIR=/workspace
 
 set -x
 
-JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:b300:$TP --time="${SALLOC_TIME_LIMIT:-180}" --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
+JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:b300:$TP --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
 
 if [ -z "$JOB_ID" ]; then
     echo "ERROR: salloc failed to allocate a job"
     exit 1
 fi
 
-# Use flock to serialize concurrent imports to the same squash file
-srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c "
-    exec 9>\"$LOCK_FILE\"
-    flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
-    if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
-        echo 'Squash file already exists and is valid, skipping import'
-    else
-        rm -f \"$SQUASH_FILE\"
-        enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
-    fi
-"
-CONTAINER_IMAGE=$(realpath $SQUASH_FILE)
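+# Use Docker image directly for openai/gpt-oss-120b with trt, otherwise use squash file. NOTE(review): this branch skips the mkdir of $HF_HUB_CACHE_MOUNT done in the else branch — confirm the mount source exists on the worker.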
+if [[ "$MODEL" == "openai/gpt-oss-120b" && "$FRAMEWORK" == "trt" ]]; then
+    CONTAINER_IMAGE=$IMAGE
+else
+    # Use flock to serialize concurrent imports to the same squash file.
+    # mkdir on the worker first: /tmp/gharunner is node-local and may not
+    # exist on a freshly allocated node.
+    srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c "
+        mkdir -p /tmp/gharunner/squash \"$HF_HUB_CACHE_MOUNT\"
+        exec 9>\"$LOCK_FILE\"
+        flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
+        if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
+            echo 'Squash file already exists and is valid, skipping import'
+        else
+            rm -f \"$SQUASH_FILE\"
+            enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
+        fi
+    "
+    # Squash file lives on the allocated worker node's /tmp, which is not
+    # visible from the node running this script, so realpath here would
+    # return empty. Pass the path as-is; srun resolves it inside the job.
+    CONTAINER_IMAGE=$SQUASH_FILE
+fi
 
 srun --jobid=$JOB_ID \
 --container-image=$CONTAINER_IMAGE \
---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,$AIPERF_MMAP_CACHE_HOST_PATH:/aiperf_mmap_cache \
+--container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
 --container-mount-home \
---container-workdir=/workspace/ \
---no-container-entrypoint --export=ALL,AIPERF_DATASET_MMAP_CACHE_DIR=/aiperf_mmap_cache \
+--container-workdir=$CONTAINER_MOUNT_DIR \
+--no-container-entrypoint --export=ALL \
 bash "$BENCH_SCRIPT"
 
 scancel $JOB_ID