From 336998ce8e0aa1b51cc135ce3890bb083edf09b4 Mon Sep 17 00:00:00 2001 From: Alex Trotta Date: Thu, 28 May 2026 12:09:36 -0400 Subject: [PATCH] Generate local resources in a .bazelrc file A recent Bazel change now errors if a local resource is requested but it will never be available (https://github.com/bazelbuild/bazel/commit/d9861632b5). This does another round of querying nvidia/rocm/amd-smi for GPU count and GPU memory so that we can fill these in. We use these internally to schedule tests on GPUs with memory counts; this makes a local GPU look a bit more like a remote one. This should allow smarter scheduling of local tests so the GPU isn't overloaded. Heavily Claude-generated. --- README.md | 60 ++++++++++++++++ mojo/mojo_host_platform.bzl | 137 ++++++++++++++++++++++++++++++++++++ 2 files changed, 197 insertions(+) diff --git a/README.md b/README.md index 5e4cb95..945a9a4 100644 --- a/README.md +++ b/README.md @@ -24,3 +24,63 @@ mojo_binary( See the [tests](https://github.com/modular/rules_mojo/tree/main/tests) directory for more examples. + +## Tracking `gpu-memory` as a local resource + +If you tag tests with `resources:gpu-memory:<MB>` and run with a recent +Bazel (8.3+), the resource manager will reject builds that reference +resources it doesn't know about: + +```text +Resource gpu-memory is not being tracked by the resource manager. +Available resources are: cpu, memory. +``` + +To fix this, Bazel needs `--local_extra_resources=gpu-memory=<MB>` set +before the resource manager starts. `mojo_host_platform` autodetects +the GPU count and total GPU memory on the host (via `nvidia-smi`, +`amd-smi`, or `rocm-smi`) and writes a `gpu_resources.bazelrc` +fragment exposing both `gpu-memory` and a `gpu-N` resource family +(`gpu-1`, `gpu-2`, `gpu-4`, sized to `floor(total_gpus / N)`) into +its external repo on each fetch. On each fetch it prints the +absolute path: + +```text +rules_mojo: detected 4 GPU(s) (327036 MB total). 
Exposing local +resources: gpu-1=4, gpu-2=2, gpu-4=1, gpu-memory=327036. +Add this line to your .bazelrc once: + + try-import /home/.../external/+mojo+mojo_host_platform/gpu_resources.bazelrc +``` + +The generated fragment looks like: + +```text +build --local_extra_resources=gpu-1=4 +test --local_extra_resources=gpu-1=4 +build --local_extra_resources=gpu-2=2 +test --local_extra_resources=gpu-2=2 +build --local_extra_resources=gpu-4=1 +test --local_extra_resources=gpu-4=1 +build --local_extra_resources=gpu-memory=327036 +test --local_extra_resources=gpu-memory=327036 +``` + +The `gpu-N` family follows the same convention used in our internal +test infrastructure: a test that needs N GPUs claims +`resources:gpu-N:0.01`, and the pool size on each host is set to the +number of N-sized groups available, so a 2-GPU machine naturally +won't schedule a `gpu-4` test. + +Paste that `try-import` line into your `.bazelrc` once. The fragment +is regenerated each time the repository rule is re-fetched; the rule is +`configure = True`, so `bazel fetch --configure` refreshes the resource +pool after the host's GPU configuration changes. + +Caveats: + +- The path is under your Bazel output base, so it will change if you + point Bazel at a different cache or use a different user account. + Re-run any `mojo` extension target to see the current path. +- Set `MOJO_QUIET_GPU_RESOURCES=1` to silence the print on each + fetch. 
diff --git a/mojo/mojo_host_platform.bzl b/mojo/mojo_host_platform.bzl index 781004c..6406e60 100644 --- a/mojo/mojo_host_platform.bzl +++ b/mojo/mojo_host_platform.bzl @@ -87,6 +87,124 @@ def _get_amd_constraints_with_rocm_smi(rctx, rocm_smi, gpu_mapping): return constraints +def _detect_nvidia_memory(rctx, nvidia_smi): + """Returns (gpu_count, total_memory_mb).""" + result = rctx.execute([nvidia_smi, "--query-gpu=memory.total", "--format=csv,noheader,nounits"]) + _log_result(rctx, "{} memory.total".format(nvidia_smi), result) + if result.return_code != 0: + return 0, 0 + count = 0 + total = 0 + for line in result.stdout.splitlines(): + line = line.strip() + if not line: + continue + total += int(line) + count += 1 + return count, total + +def _detect_amd_smi_memory(rctx, amd_smi): + """Returns (gpu_count, total_memory_mb).""" + result = rctx.execute([amd_smi, "static", "--vram", "--json"]) + _log_result(rctx, "{} static --vram".format(amd_smi), result) + if result.return_code != 0: + return 0, 0 + json_lines = [] + for line in result.stdout.splitlines(): + if line.startswith("WARNING:"): + continue + json_lines.append(line) + blob = json.decode("\n".join(json_lines), default = None) + if blob == None: + return 0, 0 + if type(blob) == "dict" and "gpu_data" in blob: + blob = blob["gpu_data"] + count = 0 + total = 0 + for entry in blob: + count += 1 + vram = entry.get("vram", {}) + size = vram.get("size", {}) + value = size.get("value") + unit = size.get("unit", "MB") + if value == None: + continue + # Normalize to MB + if unit == "GB": + total += int(value) * 1024 + elif unit == "KB": + total += int(value) // 1024 + elif unit == "B": + total += int(value) // (1024 * 1024) + else: # MB or unknown + total += int(value) + return count, total + +def _detect_rocm_smi_memory(rctx, rocm_smi): + """Returns (gpu_count, total_memory_mb).""" + result = rctx.execute([rocm_smi, "--showmeminfo", "vram", "--json"]) + _log_result(rctx, "{} --showmeminfo vram".format(rocm_smi), 
result) + if result.return_code != 0 or not result.stdout: + return 0, 0 + blob = json.decode(result.stdout, default = None) + if blob == None: + return 0, 0 + count = 0 + total = 0 + for value in blob.values(): + count += 1 + # Field name varies across rocm-smi versions: + # "VRAM Total Memory (B)" (bytes), or "VRAM Total Memory (MiB)". + for key, raw in value.items(): + if "VRAM Total Memory" not in key: + continue + raw_str = str(raw).strip() + if not raw_str.isdigit(): + continue + num = int(raw_str) + if "(B)" in key: + total += num // (1024 * 1024) + elif "MiB" in key or "MB" in key: + total += num + elif "GiB" in key or "GB" in key: + total += num * 1024 + return count, total + +_GPU_COUNT_BUCKETS = [1, 2, 4] + +def _write_gpu_resources_bazelrc(rctx, gpu_count, total_mb): + # Mirrors the modular monorepo convention in bazel/internal/remote_run.py: + # a test that needs N GPUs claims `resources:gpu-N:0.01`. Each `gpu-N` + # resource pool is sized to floor(total_gpus / N) so the convention works + # naturally on smaller hosts (a 2-GPU box gets gpu-1=2, gpu-2=1, no gpu-4). 
+ lines = [ + "# Generated by @rules_mojo//mojo:mojo_host_platform.bzl.", + "# Detected {} GPU(s), {} MB total memory.".format(gpu_count, total_mb), + ] + emitted_gpu_buckets = [] + for n in _GPU_COUNT_BUCKETS: + pool = gpu_count // n + if pool <= 0: + continue + emitted_gpu_buckets.append("gpu-{}={}".format(n, pool)) + lines.append("build --local_extra_resources=gpu-{}={}".format(n, pool)) + lines.append("test --local_extra_resources=gpu-{}={}".format(n, pool)) + if total_mb > 0: + lines.append("build --local_extra_resources=gpu-memory={}".format(total_mb)) + lines.append("test --local_extra_resources=gpu-memory={}".format(total_mb)) + rctx.file("gpu_resources.bazelrc", content = "\n".join(lines) + "\n") + abs_path = rctx.path("gpu_resources.bazelrc") + if rctx.getenv("MOJO_QUIET_GPU_RESOURCES") != "1": + summary = ", ".join(emitted_gpu_buckets + (["gpu-memory={}".format(total_mb)] if total_mb > 0 else [])) + # buildifier: disable=print + print( + "rules_mojo: detected {} GPU(s) ({} MB total). ".format(gpu_count, total_mb) + + "Exposing local resources: {}.\n".format(summary) + + "Add this line to your .bazelrc once:\n\n" + + " try-import {}\n\n".format(abs_path) + + "Silence this message with MOJO_QUIET_GPU_RESOURCES=1.", + ) + def _get_apple_constraint(rctx, gpu_mapping): result = rctx.execute(["/usr/bin/sw_vers", "--productVersion"]) _log_result(rctx, "/usr/sbin/sw_vers --productVersion", result) @@ -123,6 +241,8 @@ def _get_apple_constraint(rctx, gpu_mapping): def _impl(rctx): constraints = [] + total_gpu_count = 0 + total_gpu_memory_mb = 0 if rctx.os.name == "linux" and (rctx.os.arch == "amd64" or rctx.os.arch == "aarch64"): # A system may have both rocm-smi and nvidia-smi installed, check both. 
@@ -132,6 +252,19 @@ def _impl(rctx): amd_smi = rctx.which("amd-smi") rocm_smi = rctx.which("rocm-smi") + if nvidia_smi: + count, mb = _detect_nvidia_memory(rctx, nvidia_smi) + total_gpu_count += count + total_gpu_memory_mb += mb + if amd_smi: + count, mb = _detect_amd_smi_memory(rctx, amd_smi) + total_gpu_count += count + total_gpu_memory_mb += mb + elif rocm_smi: + count, mb = _detect_rocm_smi_memory(rctx, rocm_smi) + total_gpu_count += count + total_gpu_memory_mb += mb + _verbose_log(rctx, "nvidia-smi path: {}, rocm-smi path: {}, amd-smi path: {}".format(nvidia_smi, rocm_smi, amd_smi)) # NVIDIA @@ -219,11 +352,15 @@ platform( ) """.format(constraints = ", ".join(['"{}"'.format(x) for x in constraints]))) + if total_gpu_count > 0 or total_gpu_memory_mb > 0: + _write_gpu_resources_bazelrc(rctx, total_gpu_count, total_gpu_memory_mb) + mojo_host_platform = repository_rule( implementation = _impl, configure = True, environ = [ "MOJO_IGNORE_UNKNOWN_GPUS", + "MOJO_QUIET_GPU_RESOURCES", "MOJO_VERBOSE_GPU_DETECT", ], attrs = {