kagent-dev · peterj · Jun 13, 2026
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -15,7 +15,6 @@ env:
   # Cache key components for better organization
   CACHE_KEY_PREFIX: kagent-v2
   BRANCH_CACHE_KEY: ${{ github.head_ref || github.ref_name }}
-  AGENT_SANDBOX_VERSION: v0.3.10
   # Consistent builder configuration
   BUILDX_BUILDER_NAME: kagent-builder-v0.23.0
   BUILDX_VERSION: v0.23.0
@@ -78,13 +77,6 @@ jobs:
         run: |
           make create-kind-cluster
 
-      - name: Install agent-sandbox
-        run: |
-          kubectl apply -f "https://github.com/kubernetes-sigs/agent-sandbox/releases/download/${AGENT_SANDBOX_VERSION}/manifest.yaml"
-          timeout 90s bash -c 'until [ "$(kubectl get crd sandboxes.agents.x-k8s.io -o jsonpath="{.status.conditions[?(@.type==\"Established\")].status}" 2>/dev/null)" = "True" ]; do sleep 1; done'
-          kubectl rollout status deployment/agent-sandbox-controller -n agent-sandbox-system --timeout=120s
-          kubectl wait --for=condition=Ready pod -l app=agent-sandbox-controller -n agent-sandbox-system --timeout=120s
-
       - name: Install Kagent
         id: install-kagent
         env:
@@ -150,10 +142,6 @@ jobs:
           echo "::error::Failed to run e2e tests"
           echo "::error::Kubectl get pods -n kagent"
           kubectl describe pods -n kagent
-          echo "::error::Kubectl get pods -n agent-sandbox-system"
-          kubectl get pods -n agent-sandbox-system -o wide || true
-          echo "::error::Kubectl logs -n agent-sandbox-system deployment/agent-sandbox-controller"
-          kubectl logs -n agent-sandbox-system deployment/agent-sandbox-controller || true
           echo "::error::Kubectl get events -n kagent"
           kubectl get events -n kagent
           echo "::error::Kubectl get agents -n kagent"

@@ -423,14 +423,14 @@ helm-install-provider: helm-version check-api-key
 		--timeout 5m       \
 		--kube-context kind-$(KIND_CLUSTER_NAME) \
 		--wait \
-		--set ui.service.type=LoadBalancer \
+		--set ui.service.type=ClusterIP \
 		--set registry=$(DOCKER_REGISTRY) \
 		--set imagePullPolicy=Always \
 		--set tag=$(VERSION) \
 		--set controller.loglevel=debug \
 		--set controller.image.pullPolicy=Always \
 		--set ui.image.pullPolicy=Always \
-		--set controller.service.type=LoadBalancer \
+		--set controller.service.type=ClusterIP \
 		--set providers.openAI.apiKey=$(OPENAI_API_KEY) \
 		--set providers.azureOpenAI.apiKey=$(AZUREOPENAI_API_KEY) \
 		--set providers.anthropic.apiKey=$(ANTHROPIC_API_KEY) \

diff --git a/design/EP-XXXX-acp-integration.md b/design/EP-XXXX-acp-integration.md
diff --git a/docker/acp-sandbox/Dockerfile b/docker/acp-sandbox/Dockerfile
@@ -0,0 +1,168 @@
+# kagent ACP sandbox image family.
+#
+# Layout:
+#   Stage "builder"  — compiles the acp-shim static binary from this repo.
+#   Stage "base"     — minimal runtime + acp-shim entrypoint. Published as
+#                      ghcr.io/kagent-dev/kagent/acp-sandbox-base. Contains
+#                      NO agent: it is the uniform transport layer.
+#   Agent targets    — thin layers that install one stdio ACP agent and set
+#                      its launch command. Build with --target <agent>:
+#
+#       docker build -f docker/acp-sandbox/Dockerfile --target hermes   go/
+#       docker build -f docker/acp-sandbox/Dockerfile --target openclaw go/
+#
+# The contract between base and agent layers is intentionally tiny:
+#   - ENTRYPOINT is acp-shim; it serves ws://0.0.0.0:9000/acp.
+#   - The agent layer provides the child command either as CMD args after
+#     "--" or via the ACP_SHIM_CHILD env var.
+#   - The bearer token is mounted at /var/run/acp/token by the Substrate
+#     workload spec (same pattern as the AgentHarness gatewayToken).
+#
+# OpenClaw note: NemoClaw's sandbox-base is built from the NemoClaw repo
+# for a different runtime, so we don't extend it. The "openclaw" target below installs
+# the OpenClaw CLI directly on the kagent base (same npm install NemoClaw's
+# Dockerfile.base uses) and runs its own gateway alongside the shim.
+
+### Stage 0: build the acp-shim Go binary
+# Registry hosting the Go toolchain image; overridable with --build-arg.
+ARG BASE_IMAGE_REGISTRY=cgr.dev
+ARG BUILDPLATFORM
+# The toolchain runs on the build host's platform; the target OS/arch is
+# handed to `go build` below via GOOS/GOARCH, i.e. the binary is
+# cross-compiled.
+FROM --platform=$BUILDPLATFORM $BASE_IMAGE_REGISTRY/chainguard/go:latest AS builder
+ARG TARGETARCH
+ARG TARGETOS
+
+WORKDIR /workspace
+
+# Module manifests are copied on their own so the `go mod download` layer
+# stays cached until go.mod / go.sum change.
+COPY go.mod go.sum ./
+RUN --mount=type=cache,target=/root/go/pkg/mod,rw \
+    --mount=type=cache,target=/root/.cache/go-build,rw \
+    go mod download
+
+# Source trees, relative to the build context (go/ per the header above).
+COPY api/ api/
+COPY core/ core/
+COPY adk/ adk/
+
+# Optional linker flags; declared without a default, so empty unless
+# supplied with --build-arg.
+ARG LDFLAGS
+# CGO_ENABLED=0 produces the static binary that the runtime stage copies.
+# GOOS falls back to linux when TARGETOS is not provided.
+# NOTE(review): `-a` forces a rebuild of every package, so the go-build
+# cache mount contributes little to this step — confirm `-a` is intended.
+RUN --mount=type=cache,target=/root/go/pkg/mod,rw \
+    --mount=type=cache,target=/root/.cache/go-build,rw \
+    CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \
+    go build -a -trimpath -ldflags "$LDFLAGS" -o /acp-shim ./core/cmd/acp-shim
+
+### Stage 1: kagent-owned sandbox base — shim only, no agent.
+FROM debian:trixie-slim AS base
+ENV DEBIAN_FRONTEND=noninteractive
+# Runtime packages only. curl is also what the openclaw target's gateway
+# probe uses. The apt lists are removed in the same layer so they never
+# reach the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        ca-certificates \
+        curl \
+        git \
+    && rm -rf /var/lib/apt/lists/*
+
+# Single unprivileged user. Substrate provides the isolation boundary
+# (microVM); we don't need a separate gateway/sandbox user split here.
+# /var/run/acp is pre-created and owned by that user; it is the directory
+# the bearer token is expected to be mounted into (see ACP_SHIM_TOKEN_FILE).
+RUN groupadd -r agent && useradd -r -m -g agent -d /home/agent -s /bin/bash agent \
+    && mkdir -p /var/run/acp \
+    && chown agent:agent /var/run/acp
+
+# The statically built shim from the builder stage.
+COPY --from=builder /acp-shim /usr/local/bin/acp-shim
+
+# NOTE(review): USER is set by name, not numeric UID. Kubernetes cannot
+# verify `runAsNonRoot` against a non-numeric image user, so pod specs would
+# need an explicit runAsUser — confirm how consumers set securityContext.
+USER agent
+WORKDIR /home/agent
+# Port the shim serves ws://0.0.0.0:9000/acp on (see header).
+EXPOSE 9000
+
+# File the shim reads the bearer token from.
+ENV ACP_SHIM_TOKEN_FILE=/var/run/acp/token
+
+ENTRYPOINT ["/usr/local/bin/acp-shim"]
+# Agent layers append the child command, e.g.:
+#   CMD ["--", "hermes", "acp"]
+
+### Shared: base + Node.js 22 for npm-distributed agents. Debian trixie's
+### apt nodejs is v20, below OpenClaw's >=22.19 requirement (NemoClaw's
+### sandbox-base uses node:22-trixie-slim for the same reason), so take
+### the runtime from the official image.
+# "node" is a file source only: no instruction runs in it, it exists so the
+# COPY --from lines below can pull the runtime out of the official image.
+FROM node:22-trixie-slim AS node
+FROM base AS node-base
+# root is needed to write under /usr/local; the stage drops back to `agent`
+# at the end.
+USER root
+# The node binary plus the global node_modules tree (which contains npm).
+COPY --from=node /usr/local/bin/node /usr/local/bin/node
+COPY --from=node /usr/local/lib/node_modules /usr/local/lib/node_modules
+# Only the node binary was copied out of /usr/local/bin, so the npm and npx
+# launchers are recreated as relative symlinks into node_modules. Printing
+# both versions makes the build fail here if the copied runtime is broken.
+RUN ln -s ../lib/node_modules/npm/bin/npm-cli.js /usr/local/bin/npm \
+    && ln -s ../lib/node_modules/npm/bin/npx-cli.js /usr/local/bin/npx \
+    && node --version && npm --version
+USER agent
+
+### Agent target: Hermes (pip)
+### Hermes is installed into a virtualenv owned by the unprivileged `agent`
+### user; only the Python interpreter and venv tooling come from apt.
+FROM base AS hermes
+USER root
+RUN apt-get update && apt-get install -y --no-install-recommends python3 python3-pip python3-venv \
+    && rm -rf /var/lib/apt/lists/*
+USER agent
+# Optional version pin, mirroring OPENCLAW_VERSION on the openclaw target.
+# Empty (the default) installs the latest release, exactly as before; pass
+# e.g. --build-arg HERMES_VERSION=1.2.3 for a reproducible image. Declared
+# here, after the apt layer, so changing the pin does not invalidate it.
+ARG HERMES_VERSION=""
+# ${HERMES_VERSION:+==...} is expanded by the RUN shell: it appends
+# "==<version>" only when the arg is non-empty.
+RUN python3 -m venv /home/agent/.venv \
+    && /home/agent/.venv/bin/pip install --no-cache-dir "hermes-agent[acp]${HERMES_VERSION:+==${HERMES_VERSION}}"
+# Put the venv first on PATH so the bare `hermes` in CMD resolves to it.
+ENV PATH="/home/agent/.venv/bin:${PATH}"
+# Hermes ACP sessions are in-memory: the child MUST be long-lived (default)
+# so session/load works across bridge reconnects.
+CMD ["--", "hermes", "acp"]
+
+### Agent target: OpenClaw — built on the kagent base, NOT on NemoClaw's
+### sandbox-base (that image is built from the NemoClaw repo and carries
+### NemoClaw-specific contracts — gateway/sandbox users, gosu, .openclaw
+### tree — that we don't want to depend on). We install the OpenClaw CLI
+### the same way NemoClaw's Dockerfile.base does: npm install -g.
+###
+### `openclaw acp` is a stdio→WS bridge that needs a sandbox-local gateway,
+### so this target uses a launcher that starts `openclaw gateway` in the
+### background before the shim takes over.
+FROM node-base AS openclaw
+# Pinned CLI release; override with --build-arg OPENCLAW_VERSION=<version>.
+ARG OPENCLAW_VERSION=2026.5.27
+# root for the global npm install and for writing the helper scripts into
+# /usr/local/bin; the stage switches back to `agent` before ENTRYPOINT.
+USER root
+RUN npm install -g "openclaw@${OPENCLAW_VERSION}"
+# Gateway helper: start the sandbox-local gateway if it isn't accepting
+# connections, and wait for it. Used both at boot and from the ACP child
+# wrapper — under Substrate the actor is checkpointed/restored and any
+# process started before the snapshot (the gateway, or a supervising
+# subshell) cannot be relied on afterwards, so the gateway is re-ensured
+# at connection time by a freshly spawned process.
+COPY --chmod=755 <<'EOF' /usr/local/bin/openclaw-gateway-ensure
+#!/bin/sh
+# Exit 0 as soon as the gateway answers on 127.0.0.1:$OPENCLAW_GATEWAY_PORT,
+# launching it first when the initial probe fails. Exit 1 if it is still not
+# answering after the 60 retry probes below.
+set -u
+: "${OPENCLAW_GATEWAY_PORT:=18789}"
+# curl runs without -f, so any HTTP response counts as "up"; only a refused
+# connection or the 2s timeout is a miss.
+probe() { curl -s -o /dev/null --max-time 2 "http://127.0.0.1:${OPENCLAW_GATEWAY_PORT}/"; }
+probe && exit 0
+# The log redirect is opened by this shell before openclaw is exec'd: with
+# the directory missing, the redirect fails and the gateway never starts.
+# Create it here so the helper also works when the entrypoint (which makes
+# the same directory) was bypassed.
+mkdir -p "${HOME}/.openclaw"
+nohup openclaw gateway run --port "${OPENCLAW_GATEWAY_PORT}" --allow-unconfigured \
+    >> "${HOME}/.openclaw/gateway.log" 2>&1 &
+i=0
+while [ "$i" -lt 60 ]; do
+    probe && exit 0
+    i=$((i+1))
+    sleep 1
+done
+echo "openclaw-gateway-ensure: gateway did not come up on :${OPENCLAW_GATEWAY_PORT}" >&2
+exit 1
+EOF
+# ACP child wrapper. The shim launches one of these per bridge: it makes
+# sure the sandbox-local gateway is reachable, then replaces itself with
+# the stdio<->WS bridge process.
+COPY --chmod=755 <<'EOF' /usr/local/bin/openclaw-acp-child
+#!/bin/sh
+set -eu
+# Fall back to the default port when the variable is unset or empty.
+OPENCLAW_GATEWAY_PORT="${OPENCLAW_GATEWAY_PORT:-18789}"
+gateway_url="ws://127.0.0.1:${OPENCLAW_GATEWAY_PORT}"
+# With `set -e`, a failed ensure stops here and the bridge is never started.
+/usr/local/bin/openclaw-gateway-ensure
+exec openclaw acp --url "${gateway_url}"
+EOF
+COPY --chmod=755 <<'EOF' /usr/local/bin/openclaw-acp-entrypoint
+#!/bin/sh
+# Write the gateway config (loopback bind, auth mode "none" — gateway and
+# ACP client share this container; the externally reachable surface is the
+# shim, which enforces its own bearer token), pre-warm the gateway so the
+# golden snapshot includes a hot npm/node page cache, then hand off to the
+# shim with the gateway-ensuring child wrapper.
+set -eu
+: "${OPENCLAW_GATEWAY_PORT:=18789}"
+mkdir -p "${HOME}/.openclaw"
+# Only written when absent: an existing openclaw.json is left untouched.
+if [ ! -f "${HOME}/.openclaw/openclaw.json" ]; then
+    printf '{"gateway":{"port":%s,"bind":"loopback","auth":{"mode":"none"}}}\n' \
+        "${OPENCLAW_GATEWAY_PORT}" > "${HOME}/.openclaw/openclaw.json"
+fi
+# Best-effort pre-warm: `|| true` keeps a failure from aborting startup under
+# `set -e`; openclaw-acp-child runs the same helper again for every bridge.
+/usr/local/bin/openclaw-gateway-ensure || true
+# Container arguments (CMD / `docker run` args) are forwarded to acp-shim
+# ahead of `--`; what follows `--` is the child command the shim spawns.
+exec /usr/local/bin/acp-shim "$@" -- /usr/local/bin/openclaw-acp-child
+EOF
+USER agent
+# Replaces the base stage's acp-shim ENTRYPOINT with the launcher above,
+# which execs acp-shim itself once the gateway config is in place.
+ENTRYPOINT ["/usr/local/bin/openclaw-acp-entrypoint"]
+# No default arguments; the launcher appends the child command on its own.
+CMD []
diff --git a/docker/acp-sandbox/README.md b/docker/acp-sandbox/README.md
@@ -0,0 +1,97 @@
+# ACP sandbox images
+
+Prototype image family for kagent's ACP integration (see
+[design/EP-XXXX-acp-integration.md](../../design/EP-XXXX-acp-integration.md)).
+Every image in this family runs the same entrypoint — `acp-shim`
+([go/core/cmd/acp-shim](../../go/core/cmd/acp-shim)) — which exposes a stdio
+ACP agent over `ws://0.0.0.0:9000/acp`, reachable through Substrate's atenet
+ingress (WebSocket upgrades are enabled there).
+
+## Stages and targets
+
+The [Dockerfile](Dockerfile) defines:
+
+| Stage / target | Buildable | What it is |
+|---|---|---|
+| `builder` | internal | Compiles the `acp-shim` static binary from this repo. |
+| `base` | `--target base` | kagent-owned runtime (`debian:trixie-slim`) with a single unprivileged `agent` user and the shim as `ENTRYPOINT`. Contains **no agent** — it is the uniform transport layer. |
+| `node` / `node-base` | internal | `base` + Node.js 22 (trixie apt ships v20, below OpenClaw's >=22.19 requirement). |
+| `hermes` | `--target hermes` | `base` + Hermes installed via pip (`hermes-agent[acp]`). Child command: `hermes acp`. |
+| `openclaw` | `--target openclaw` | `node-base` + the OpenClaw CLI (`npm install -g openclaw`). Runs a sandbox-local `openclaw gateway` alongside the shim via a small launcher. |
+
+The base↔agent contract is intentionally tiny:
+
+- `ENTRYPOINT` is `acp-shim`; it serves `ws://0.0.0.0:9000/acp`.
+- The agent layer provides the child command as `CMD` args after `--`, or via
+  the `ACP_SHIM_CHILD` env var.
+- The bearer token is read from the file in `ACP_SHIM_TOKEN_FILE`
+  (default `/var/run/acp/token`).
+
+### Why a kagent-owned base instead of extending NemoClaw's sandbox-base
+
+`ghcr.io/kagent-dev/nemoclaw/sandbox-base` carries NemoClaw-specific
+contracts: its gateway/sandbox user split, gosu privilege separation, and
+`.openclaw` directory tree are a stable contract with NemoClaw, not with
+Substrate. Substrate sandboxes get isolation from the microVM boundary, so that
+machinery is dead weight here — and adopting the image would couple every agent
+to NemoClaw's release cadence. So the `openclaw` target installs the OpenClaw
+CLI directly on the kagent base (the same `npm install -g` NemoClaw's
+`Dockerfile.base` uses) rather than extending NemoClaw's image.
+
+## Building
+
+From the repo root (build context is `go/`):
+
+```sh
+docker build -f docker/acp-sandbox/Dockerfile --target base     -t kagent/acp-sandbox-base     go/
+docker build -f docker/acp-sandbox/Dockerfile --target hermes   -t kagent/acp-sandbox-hermes   go/
+docker build -f docker/acp-sandbox/Dockerfile --target openclaw -t kagent/acp-sandbox-openclaw go/
+```
+
+## Smoke test (no cluster needed)
+
+`base` has no agent, so smoke-test an agent target (`hermes` is simplest — no
+gateway):
+
+```sh
+echo -n s3cret > /tmp/token
+docker run --rm -p 9000:9000 -v /tmp/token:/var/run/acp/token kagent/acp-sandbox-hermes
+# then from another shell, speak newline-delimited JSON-RPC over WS:
+websocat -H "Authorization: Bearer s3cret" ws://localhost:9000/acp
+{"jsonrpc":"2.0","id":1,"method":"initialize","params":{"protocolVersion":1,"clientCapabilities":{}}}
+```
+
+The shim accepts the token either as an `Authorization: Bearer` header or as an
+`access_token` query parameter (`ws://localhost:9000/acp?access_token=s3cret`).
+
+## Standalone cluster tests
+
+Two manifests in this folder exercise the image **without** the AgentHarness
+controller. Each file's header comment has the full step-by-step; in short:
+
+- [test-deployment.yaml](test-deployment.yaml) — runs the `openclaw` image as a
+  plain Kubernetes Deployment + Service. Easiest path; works with a kind-loaded
+  local image.
+
+  ```sh
+  docker build -f docker/acp-sandbox/Dockerfile --target openclaw -t acp-sandbox-openclaw:dev go/
+  kind load docker-image acp-sandbox-openclaw:dev --name kagent
+  kubectl apply -f docker/acp-sandbox/test-deployment.yaml
+  kubectl -n kagent port-forward svc/acp-shim-test 9000:9000
+  websocat "ws://localhost:9000/acp?access_token=dev-token"
+  ```
+
+- [test-substrate.yaml](test-substrate.yaml) — runs the same image as a real
+  Substrate actor (gVisor microVM) via a manual `ActorTemplate` + `ate-api`
+  `CreateActor`. Requires pushing a digest-pinned image to a registry (Substrate
+  workers pull it themselves). See the file header for the `grpcurl` / atenet
+  routing details.
+
+## Open items (tracked in the EP)
+
+- OpenClaw target: the `openclaw gateway` invocation in the launcher is a
+  best-guess prototype — needs the real port/auth wiring verified.
+- Agent credential injection (`~/.hermes/.env`, OpenClaw provider keys, ...)
+  belongs to the harness bootstrap, not these images.
+- Whether the shim is baked (this approach) or injected via an init container
+  and a shared volume.