From f5d8cbef98e72736bec9f3ebf666132622825c9e Mon Sep 17 00:00:00 2001
From: mike-diff
Date: Tue, 23 Jun 2026 20:54:43 -0700
Subject: [PATCH] ci(dispatch): cap agent overload retry at 2 attempts

The agent retry recovered transient 429s from the model service but
would spend up to 4 attempts (0+30+60+120s backoff) doing so. Each
attempt burns real GLM budget, and a 429 that does not clear in two
tries likely indicates a sustained outage rather than a blip, so
further retries just waste tokens.

Cap at 2 attempts with a single 60s backoff. The safety properties are
unchanged: only an explicit 429/overload marker is retried; exit 0/3/4
and any non-429 error remain final on the first attempt.

While here, stream the agent log through tee rather than buffering it
in a command substitution, so the run stays visible live, and require
429 to stand alone as a number so that text such as "1429" in the
transcript is not mistaken for an overload.

Verified by simulating five cases: first-attempt success (1 attempt);
429-then-success recovers (2 attempts); persistent 429 caps at 2;
non-429 error does not retry (1 attempt); 429-then-non-429 stops at 2.
build/vet green; no em dashes per AGENTS.md.
---
 .github/workflows/dispatch.yml | 48 +++++++++++++++++++++++++++++-----
 1 file changed, 41 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/dispatch.yml b/.github/workflows/dispatch.yml
index ee67236..4110b42 100644
--- a/.github/workflows/dispatch.yml
+++ b/.github/workflows/dispatch.yml
@@ -143,15 +143,49 @@ jobs:
         id: agent
         env:
           ZAI_API_KEY: ${{ secrets.ZAI_API_KEY }}
+        # Retry ONLY on transient model-service overload. Unlike the install
+        # retry, each attempt here spends real GLM budget, so: (1) retry only
+        # when the output explicitly indicates a 429/overload, never on a
+        # generic error (a deterministic bug would waste tokens retrying);
+        # (2) back off 60s, since overload clears slowly; (3) cap at 2
+        # attempts. A successful run (exit 0/3/4) or a non-429 error (exit 1
+        # without the overload marker) is final.
         run: |
           set +e
-          ~/.local/bin/sesh -p "$(cat TASK.md)" \
-            -yes \
-            -provider zai \
-            -model glm-5.2 \
-            -max-iters 40 \
-            -max-tools 200
-          echo "exit=$?" >> "$GITHUB_OUTPUT"
+          backoffs=(0 60)
+          attempts=${#backoffs[@]}
+          log=$(mktemp)
+          for ((i = 0; i < attempts; i++)); do
+            # backoff before every attempt except the first
+            if [ "$i" -gt 0 ]; then
+              echo "service was overloaded on attempt $i; backing off ${backoffs[$i]}s before attempt $((i+1))"
+              sleep "${backoffs[$i]}"
+            fi
+            # tee streams the agent log live and keeps a copy to scan afterwards
+            ~/.local/bin/sesh -p "$(cat TASK.md)" \
+              -yes \
+              -provider zai \
+              -model glm-5.2 \
+              -max-iters 40 \
+              -max-tools 200 2>&1 | tee "$log"
+            rc=${PIPESTATUS[0]}
+            # exit 0 done | 3 stuck | 4 iter cap: all final-success-ish, do not retry
+            if [ "$rc" -eq 0 ] || [ "$rc" -eq 3 ] || [ "$rc" -eq 4 ]; then
+              break
+            fi
+            # only a 429/overload marker is retryable; anything else is final.
+            # 429 must stand alone so that e.g. "1429" is not read as overload
+            if ! grep -qiE '(^|[^0-9])429([^0-9]|$)|overloaded|rate.?limit|too many requests' "$log"; then
+              break
+            fi
+            echo "attempt $((i+1)) hit transient overload (rc=$rc)"
+            if [ "$((i+1))" -eq "$attempts" ]; then
+              echo "exhausted $attempts attempts on overload; giving up"
+            fi
+          done
+          rm -f "$log"
+          echo "exit=$rc" >> "$GITHUB_OUTPUT"
+          exit 0
 
       - name: Commit, push, open PR
         env: