Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,63 @@ mojo_binary(

See the [tests](https://github.com/modular/rules_mojo/tree/main/tests)
directory for more examples.

## Tracking `gpu-memory` as a local resource

If you tag tests with `resources:gpu-memory:<N>` and run with a recent
Bazel (8.3+), the resource manager will reject builds that reference
resources it doesn't know about:

```text
Resource gpu-memory is not being tracked by the resource manager.
Available resources are: cpu, memory.
```

To fix this, Bazel needs `--local_extra_resources=gpu-memory=<N>` set
before the resource manager starts. `mojo_host_platform` autodetects
the GPU count and total GPU memory on the host (via `nvidia-smi`,
`amd-smi`, or `rocm-smi`) and writes a `gpu_resources.bazelrc`
fragment exposing both `gpu-memory` and a `gpu-N` resource family
(`gpu-1`, `gpu-2`, `gpu-4`, sized to `floor(total_gpus / N)`) into
its external repo on each fetch. On each fetch it also prints the
absolute path (unless silenced, see Caveats below):

```text
rules_mojo: detected 4 GPU(s) (327036 MB total). Exposing local
resources: gpu-1=4, gpu-2=2, gpu-4=1, gpu-memory=327036.
Add this line to your .bazelrc once:

try-import /home/.../external/+mojo+mojo_host_platform/gpu_resources.bazelrc
```

The generated fragment looks like:

```text
build --local_extra_resources=gpu-1=4
test --local_extra_resources=gpu-1=4
build --local_extra_resources=gpu-2=2
test --local_extra_resources=gpu-2=2
build --local_extra_resources=gpu-4=1
test --local_extra_resources=gpu-4=1
build --local_extra_resources=gpu-memory=327036
test --local_extra_resources=gpu-memory=327036
```

The `gpu-N` family follows the same convention used in our internal
test infrastructure: a test that needs N GPUs claims
`resources:gpu-N:0.01`, and the pool size on each host is set to the
number of N-sized groups available, so a 2-GPU machine naturally
won't schedule a `gpu-4` test.

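Paste that `try-import` line into your `.bazelrc` once. The fragment is
regenerated each time the `mojo_host_platform` repository is fetched.
Bazel does not watch the hardware itself, so after changing the host's
GPU configuration force a re-fetch with
`bazel fetch --force --configure` (the rule is `configure = True`); the
resource pool then stays in sync without further edits to your
`.bazelrc`.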

Caveats:

- The path is under your Bazel output base, so it will change if you
point Bazel at a different cache or use a different user account.
Re-run any `mojo` extension target to see the current path.
- Set `MOJO_QUIET_GPU_RESOURCES=1` to silence the print on each
fetch.
137 changes: 137 additions & 0 deletions mojo/mojo_host_platform.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,124 @@ def _get_amd_constraints_with_rocm_smi(rctx, rocm_smi, gpu_mapping):

return constraints

def _detect_nvidia_memory(rctx, nvidia_smi):
    """Counts NVIDIA GPUs and sums their total memory via nvidia-smi.

    Args:
        rctx: The repository context used to run nvidia-smi.
        nvidia_smi: Path to the nvidia-smi binary.

    Returns:
        A (gpu_count, total_memory_mb) tuple. Returns (0, 0) when the
        nvidia-smi query exits with a non-zero status.
    """
    result = rctx.execute([nvidia_smi, "--query-gpu=memory.total", "--format=csv,noheader,nounits"])
    _log_result(rctx, "{} memory.total".format(nvidia_smi), result)
    if result.return_code != 0:
        return 0, 0
    count = 0
    total = 0
    for line in result.stdout.splitlines():
        line = line.strip()
        if not line:
            continue

        # Each non-empty row describes one GPU, even if its memory is unknown.
        count += 1

        # nvidia-smi prints placeholders such as "[N/A]" for fields a device
        # does not report. int() on those would fail the whole repository
        # rule, so only numeric rows contribute to the memory total.
        if line.isdigit():
            total += int(line)
    return count, total

def _detect_amd_smi_memory(rctx, amd_smi):
    """Counts AMD GPUs and sums their VRAM via `amd-smi static --vram --json`.

    Args:
        rctx: The repository context used to run amd-smi.
        amd_smi: Path to the amd-smi binary.

    Returns:
        A (gpu_count, total_memory_mb) tuple. Returns (0, 0) when amd-smi
        fails or its output is not the expected JSON list of GPU entries.
        GPUs whose VRAM size is missing or malformed are still counted but
        contribute nothing to the memory total.
    """
    result = rctx.execute([amd_smi, "static", "--vram", "--json"])
    _log_result(rctx, "{} static --vram".format(amd_smi), result)
    if result.return_code != 0:
        return 0, 0

    # Drop "WARNING:" lines so they do not break JSON decoding.
    json_lines = [
        line
        for line in result.stdout.splitlines()
        if not line.startswith("WARNING:")
    ]
    blob = json.decode("\n".join(json_lines), default = None)

    # The GPU list is either the top-level value or wrapped in "gpu_data".
    if type(blob) == "dict":
        blob = blob.get("gpu_data")

    # Anything else (decode failure, scalar, dict without "gpu_data") cannot
    # be iterated as GPU entries; treat it as "nothing detected" rather than
    # failing the repository rule.
    if type(blob) != "list":
        return 0, 0

    count = 0
    total = 0
    for entry in blob:
        if type(entry) != "dict":
            continue
        count += 1

        vram = entry.get("vram")
        size = vram.get("size") if type(vram) == "dict" else None
        if type(size) != "dict":
            continue

        value = size.get("value")
        if type(value) == "string":
            # Accept numeric strings; placeholders such as "N/A" are skipped.
            value = int(value) if value.isdigit() else None
        if type(value) not in ["int", "float"]:
            continue

        # Normalize to MB. A missing or unrecognized unit is treated as MB.
        unit = size.get("unit", "MB")
        if unit in ["GB", "GiB"]:
            # Scale before truncating so fractional sizes are not lost.
            total += int(value * 1024)
        elif unit in ["KB", "KiB"]:
            total += int(value) // 1024
        elif unit == "B":
            total += int(value) // (1024 * 1024)
        else:
            total += int(value)
    return count, total

def _detect_rocm_smi_memory(rctx, rocm_smi):
    """Counts AMD GPUs and sums their VRAM via `rocm-smi --showmeminfo vram --json`.

    Args:
        rctx: The repository context used to run rocm-smi.
        rocm_smi: Path to the rocm-smi binary.

    Returns:
        A (gpu_count, total_memory_mb) tuple. Returns (0, 0) when rocm-smi
        fails, prints nothing, or does not emit a JSON object.
    """
    result = rctx.execute([rocm_smi, "--showmeminfo", "vram", "--json"])
    _log_result(rctx, "{} --showmeminfo vram".format(rocm_smi), result)
    if result.return_code != 0 or not result.stdout:
        return 0, 0
    blob = json.decode(result.stdout, default = None)

    # Only a JSON object can be walked per device; a list, scalar, or decode
    # failure is treated as "nothing detected" instead of failing the rule.
    if type(blob) != "dict":
        return 0, 0

    count = 0
    total = 0
    for name, value in blob.items():
        # Skip entries that are not per-device objects so they are neither
        # counted as GPUs nor iterated. NOTE(review): rocm-smi appears to use
        # a top-level "system" object for host-level info - confirm.
        if name == "system" or type(value) != "dict":
            continue
        count += 1

        # Field name varies across rocm-smi versions:
        # "VRAM Total Memory (B)" (bytes), or "VRAM Total Memory (MiB)".
        for key, raw in value.items():
            if "VRAM Total Memory" not in key:
                continue
            raw_str = str(raw).strip()
            if not raw_str.isdigit():
                continue
            num = int(raw_str)
            if "(B)" in key:
                total += num // (1024 * 1024)
            elif "MiB" in key or "MB" in key:
                total += num
            elif "GiB" in key or "GB" in key:
                total += num * 1024
    return count, total

# Group sizes N for which a `gpu-N` local resource may be generated.
_GPU_COUNT_BUCKETS = [1, 2, 4]

def _write_gpu_resources_bazelrc(rctx, gpu_count, total_mb):
    """Writes `gpu_resources.bazelrc` and tells the user how to import it.

    Follows the modular monorepo convention in bazel/internal/remote_run.py,
    where a test needing N GPUs claims `resources:gpu-N:0.01`. Every `gpu-N`
    pool holds floor(total_gpus / N) units and empty pools are omitted, so a
    2-GPU box ends up with gpu-1=2 and gpu-2=1 but no gpu-4.

    Args:
        rctx: The repository context used to write the file and read the env.
        gpu_count: Number of GPUs detected on the host.
        total_mb: Total GPU memory in MB; `gpu-memory` is only emitted when
            this is positive.
    """

    # Collect "name=value" resource settings in the order they are emitted.
    resources = []
    for bucket in _GPU_COUNT_BUCKETS:
        groups = gpu_count // bucket
        if groups > 0:
            resources.append("gpu-{}={}".format(bucket, groups))
    if total_mb > 0:
        resources.append("gpu-memory={}".format(total_mb))

    content = [
        "# Generated by @rules_mojo//mojo:mojo_host_platform.bzl.",
        "# Detected {} GPU(s), {} MB total memory.".format(gpu_count, total_mb),
    ]
    for resource in resources:
        content.append("build --local_extra_resources={}".format(resource))
        content.append("test --local_extra_resources={}".format(resource))
    rctx.file("gpu_resources.bazelrc", content = "\n".join(content) + "\n")
    bazelrc_path = rctx.path("gpu_resources.bazelrc")

    # Users who have already wired up the try-import can opt out of the hint.
    if rctx.getenv("MOJO_QUIET_GPU_RESOURCES") == "1":
        return

    message = (
        "rules_mojo: detected {} GPU(s) ({} MB total). ".format(gpu_count, total_mb) +
        "Exposing local resources: {}.\n".format(", ".join(resources)) +
        "Add this line to your .bazelrc once:\n\n" +
        "  try-import {}\n\n".format(bazelrc_path) +
        "Silence this message with MOJO_QUIET_GPU_RESOURCES=1."
    )

    # buildifier: disable=print
    print(message)

def _get_apple_constraint(rctx, gpu_mapping):
result = rctx.execute(["/usr/bin/sw_vers", "--productVersion"])
_log_result(rctx, "/usr/sbin/sw_vers --productVersion", result)
Expand Down Expand Up @@ -123,6 +241,8 @@ def _get_apple_constraint(rctx, gpu_mapping):

def _impl(rctx):
constraints = []
total_gpu_count = 0
total_gpu_memory_mb = 0

if rctx.os.name == "linux" and (rctx.os.arch == "amd64" or rctx.os.arch == "aarch64"):
# A system may have both rocm-smi and nvidia-smi installed, check both.
Expand All @@ -132,6 +252,19 @@ def _impl(rctx):
amd_smi = rctx.which("amd-smi")
rocm_smi = rctx.which("rocm-smi")

if nvidia_smi:
count, mb = _detect_nvidia_memory(rctx, nvidia_smi)
total_gpu_count += count
total_gpu_memory_mb += mb
if amd_smi:
count, mb = _detect_amd_smi_memory(rctx, amd_smi)
total_gpu_count += count
total_gpu_memory_mb += mb
elif rocm_smi:
count, mb = _detect_rocm_smi_memory(rctx, rocm_smi)
total_gpu_count += count
total_gpu_memory_mb += mb
Comment on lines +255 to +266
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should also include Apple GPU


_verbose_log(rctx, "nvidia-smi path: {}, rocm-smi path: {}, amd-smi path: {}".format(nvidia_smi, rocm_smi, amd_smi))

# NVIDIA
Expand Down Expand Up @@ -219,11 +352,15 @@ platform(
)
""".format(constraints = ", ".join(['"{}"'.format(x) for x in constraints])))

if total_gpu_count > 0 or total_gpu_memory_mb > 0:
_write_gpu_resources_bazelrc(rctx, total_gpu_count, total_gpu_memory_mb)

mojo_host_platform = repository_rule(
implementation = _impl,
configure = True,
environ = [
"MOJO_IGNORE_UNKNOWN_GPUS",
"MOJO_QUIET_GPU_RESOURCES",
"MOJO_VERBOSE_GPU_DETECT",
],
attrs = {
Expand Down
Loading