From 336998ce8e0aa1b51cc135ce3890bb083edf09b4 Mon Sep 17 00:00:00 2001
From: Alex Trotta <ahajha@gmail.com>
Date: Thu, 28 May 2026 12:09:36 -0400
Subject: [PATCH] Generate local resources in a .bazelrc file

A recent Bazel change now errors if a local resource is requested but it will never be available (https://github.com/bazelbuild/bazel/commit/d9861632b5)

This does another round of querying nvidia/rocm/amd-smi for GPU count and GPU memory so that we can fill these in. We use these internally to schedule tests on GPUs with memory counts; exposing them locally makes a local GPU look a bit more like a remote one. This should allow smarter scheduling of local tests so the GPU isn't overloaded.

Heavily Claude-generated.
---
 README.md                   |  60 ++++++++++++++++
 mojo/mojo_host_platform.bzl | 137 ++++++++++++++++++++++++++++++++++++
 2 files changed, 197 insertions(+)
diff --git a/README.md b/README.md
index 5e4cb95..945a9a4 100644
--- a/README.md
+++ b/README.md
@@ -24,3 +24,63 @@ mojo_binary(
 
 See the [tests](https://github.com/modular/rules_mojo/tree/main/tests)
 directory for more examples.
+
+## Tracking `gpu-memory` as a local resource
+
+If you tag tests with `resources:gpu-memory:<N>` and run with a recent
+Bazel (8.3+), the resource manager will reject builds that reference
+resources it doesn't know about:
+
+```text
+Resource gpu-memory is not being tracked by the resource manager.
+Available resources are: cpu, memory.
+```
+
+To fix this, Bazel needs `--local_extra_resources=gpu-memory=<N>` set
+before the resource manager starts. `mojo_host_platform` autodetects
+the GPU count and total GPU memory on the host (via `nvidia-smi`,
+`amd-smi`, or `rocm-smi`) and writes a `gpu_resources.bazelrc`
+fragment exposing both `gpu-memory` and a `gpu-N` resource family
+(`gpu-1`, `gpu-2`, `gpu-4`, sized to `floor(total_gpus / N)`) into
+its external repo on each fetch. Unless silenced, each fetch also
+prints the fragment's absolute path:
+
+```text
+rules_mojo: detected 4 GPU(s) (327036 MB total). Exposing local
+resources: gpu-1=4, gpu-2=2, gpu-4=1, gpu-memory=327036.
+Add this line to your .bazelrc once:
+
+  try-import /home/.../external/+mojo+mojo_host_platform/gpu_resources.bazelrc
+```
+
+The generated fragment looks like:
+
+```text
+build --local_extra_resources=gpu-1=4
+test --local_extra_resources=gpu-1=4
+build --local_extra_resources=gpu-2=2
+test --local_extra_resources=gpu-2=2
+build --local_extra_resources=gpu-4=1
+test --local_extra_resources=gpu-4=1
+build --local_extra_resources=gpu-memory=327036
+test --local_extra_resources=gpu-memory=327036
+```
+
+The `gpu-N` family follows the same convention used in our internal
+test infrastructure: a test that needs N GPUs claims
+`resources:gpu-N:0.01`, and the pool size on each host is set to the
+number of N-sized groups available, so a 2-GPU machine naturally
+won't schedule a `gpu-4` test.
+
+Paste that `try-import` line into your `.bazelrc` once. The fragment is
+regenerated each time the repository rule is re-fetched; the rule is
+`configure = True`, so after the host's GPU configuration changes you
+can refresh it with e.g. `bazel fetch --configure --force`.
+
+Caveats:
+
+- The path is under your Bazel output base, so it will change if you
+  point Bazel at a different cache or use a different user account.
+  Re-run any `mojo` extension target to see the current path.
+- Set `MOJO_QUIET_GPU_RESOURCES=1` to silence the print on each
+  fetch.
diff --git a/mojo/mojo_host_platform.bzl b/mojo/mojo_host_platform.bzl
index 781004c..6406e60 100644
--- a/mojo/mojo_host_platform.bzl
+++ b/mojo/mojo_host_platform.bzl
@@ -87,6 +87,124 @@ def _get_amd_constraints_with_rocm_smi(rctx, rocm_smi, gpu_mapping):
 
     return constraints
 
+def _detect_nvidia_memory(rctx, nvidia_smi):
+    """Returns (gpu_count, total_memory_mb); every non-empty nvidia-smi output row counts as one GPU."""
+    result = rctx.execute([nvidia_smi, "--query-gpu=memory.total", "--format=csv,noheader,nounits"])
+    _log_result(rctx, "{} memory.total".format(nvidia_smi), result)
+    if result.return_code != 0:
+        return 0, 0
+    count, total = 0, 0
+    for line in result.stdout.splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        count += 1
+        if line.isdigit():  # int() would fail the whole fetch on a non-numeric row, e.g. "[N/A]".
+            total += int(line)
+    return count, total
+
+def _detect_amd_smi_memory(rctx, amd_smi):
+    """Returns (gpu_count, total_memory_mb) from `amd-smi static --vram --json`; (0, 0) if unusable."""
+    result = rctx.execute([amd_smi, "static", "--vram", "--json"])
+    _log_result(rctx, "{} static --vram".format(amd_smi), result)
+    if result.return_code != 0:
+        return 0, 0
+
+    # Drop `WARNING:` lines so the remainder decodes as JSON.
+    json_lines = [line for line in result.stdout.splitlines() if not line.startswith("WARNING:")]
+    blob = json.decode("\n".join(json_lines), default = None)
+    if type(blob) == "dict":
+        blob = blob.get("gpu_data")
+    if type(blob) != "list":
+        return 0, 0
+
+    # Divisors converting sub-MB units to MB; any unit not listed is treated as MB.
+    per_mb = {"B": 1024 * 1024, "KB": 1024}
+    count, total = 0, 0
+    for entry in blob:
+        count += 1
+
+        # Guard each level: a non-dict value (e.g. an "N/A" placeholder) would fail the fetch on `.get`.
+        vram = entry.get("vram") if type(entry) == "dict" else None
+        size = vram.get("size") if type(vram) == "dict" else None
+        value = size.get("value") if type(size) == "dict" else None
+        if type(value) == "string" and value.strip().isdigit():
+            value = int(value.strip())
+        if type(value) not in ("int", "float"):
+            continue
+
+        unit = size.get("unit", "MB")
+        if unit == "GB":
+            total += int(value) * 1024
+        else:
+            total += int(value) // per_mb.get(unit, 1)
+    return count, total
+
+def _detect_rocm_smi_memory(rctx, rocm_smi):
+    """Returns (gpu_count, total_memory_mb) from `rocm-smi --showmeminfo vram --json`; (0, 0) if unusable."""
+    result = rctx.execute([rocm_smi, "--showmeminfo", "vram", "--json"])
+    _log_result(rctx, "{} --showmeminfo vram".format(rocm_smi), result)
+    if result.return_code != 0 or not result.stdout:
+        return 0, 0
+    blob = json.decode(result.stdout, default = None)
+    if type(blob) != "dict":  # Undecodable output (None) or an unexpected top-level shape.
+        return 0, 0
+    count, total = 0, 0
+    for fields in blob.values():
+        if type(fields) != "dict":  # Not an object of named fields; `.items()` would fail the fetch.
+            continue
+        count += 1
+
+        # Field name varies across rocm-smi versions:
+        #   "VRAM Total Memory (B)" (bytes), or "VRAM Total Memory (MiB)".
+        for key, raw in fields.items():
+            raw_str = str(raw).strip()
+            if "VRAM Total Memory" not in key or not raw_str.isdigit():
+                continue
+            num = int(raw_str)
+            if "(B)" in key:
+                total += num // (1024 * 1024)
+            elif "MiB" in key or "MB" in key:
+                total += num
+            elif "GiB" in key or "GB" in key:
+                total += num * 1024
+    return count, total
+
+_GPU_COUNT_BUCKETS = [1, 2, 4]  # N values for which a `gpu-N` resource pool may be emitted.
+
+def _write_gpu_resources_bazelrc(rctx, gpu_count, total_mb):
+    """Writes gpu_resources.bazelrc into the repo and, unless silenced, prints how to import it."""
+
+    # Same convention as bazel/internal/remote_run.py in the modular monorepo: a
+    # test needing N GPUs claims `resources:gpu-N:0.01`, and each `gpu-N` pool holds
+    # floor(total_gpus / N) units, so pools the host cannot fill are simply omitted
+    # (2 GPUs -> gpu-1=2, gpu-2=1, and no gpu-4).
+    resources = ["gpu-{}={}".format(n, gpu_count // n) for n in _GPU_COUNT_BUCKETS if gpu_count // n > 0]
+    if total_mb > 0:
+        resources.append("gpu-memory={}".format(total_mb))
+
+    # Each resource gets one `build` line followed by one `test` line.
+    content = "# Generated by @rules_mojo//mojo:mojo_host_platform.bzl.\n"
+    content += "# Detected {} GPU(s), {} MB total memory.\n".format(gpu_count, total_mb)
+    for resource in resources:
+        for command in ("build", "test"):
+            content += "{} --local_extra_resources={}\n".format(command, resource)
+    rctx.file("gpu_resources.bazelrc", content = content)
+    abs_path = rctx.path("gpu_resources.bazelrc")
+
+    if rctx.getenv("MOJO_QUIET_GPU_RESOURCES") == "1":
+        return
+    message = "".join([
+        "rules_mojo: detected {} GPU(s) ({} MB total). ".format(gpu_count, total_mb),
+        "Exposing local resources: {}.\n".format(", ".join(resources)),
+        "Add this line to your .bazelrc once:\n\n",
+        "  try-import {}\n\n".format(abs_path),
+        "Silence this message with MOJO_QUIET_GPU_RESOURCES=1.",
+    ])
+
+    # buildifier: disable=print
+    print(message)
+
 def _get_apple_constraint(rctx, gpu_mapping):
     result = rctx.execute(["/usr/bin/sw_vers", "--productVersion"])
     _log_result(rctx, "/usr/sbin/sw_vers --productVersion", result)
@@ -123,6 +241,8 @@ def _get_apple_constraint(rctx, gpu_mapping):
 
 def _impl(rctx):
     constraints = []
+    total_gpu_count = 0
+    total_gpu_memory_mb = 0
 
     if rctx.os.name == "linux" and (rctx.os.arch == "amd64" or rctx.os.arch == "aarch64"):
         # A system may have both rocm-smi and nvidia-smi installed, check both.
@@ -132,6 +252,19 @@ def _impl(rctx):
         amd_smi = rctx.which("amd-smi")
         rocm_smi = rctx.which("rocm-smi")
 
+        if nvidia_smi:
+            count, mb = _detect_nvidia_memory(rctx, nvidia_smi)
+            total_gpu_count += count
+            total_gpu_memory_mb += mb
+        if amd_smi:
+            count, mb = _detect_amd_smi_memory(rctx, amd_smi)
+            total_gpu_count += count
+            total_gpu_memory_mb += mb
+        elif rocm_smi:
+            count, mb = _detect_rocm_smi_memory(rctx, rocm_smi)
+            total_gpu_count += count
+            total_gpu_memory_mb += mb
+
         _verbose_log(rctx, "nvidia-smi path: {}, rocm-smi path: {}, amd-smi path: {}".format(nvidia_smi, rocm_smi, amd_smi))
 
         # NVIDIA
@@ -219,11 +352,15 @@ platform(
 )
 """.format(constraints = ", ".join(['"{}"'.format(x) for x in constraints])))
 
+    if total_gpu_count > 0 or total_gpu_memory_mb > 0:
+        _write_gpu_resources_bazelrc(rctx, total_gpu_count, total_gpu_memory_mb)
+
 mojo_host_platform = repository_rule(
     implementation = _impl,
     configure = True,
     environ = [
         "MOJO_IGNORE_UNKNOWN_GPUS",
+        "MOJO_QUIET_GPU_RESOURCES",
         "MOJO_VERBOSE_GPU_DETECT",
     ],
     attrs = {