From f5d8cbef98e72736bec9f3ebf666132622825c9e Mon Sep 17 00:00:00 2001
From: mike-diff <cdnmikes@gmail.com>
Date: Tue, 23 Jun 2026 20:54:43 -0700
Subject: [PATCH] ci(dispatch): cap agent overload retry at 2 attempts

The agent retry recovered transient 429s from the model service but would
spend up to 4 attempts (0+30+60+120s backoff) doing so. Each attempt burns
real GLM budget, and a 429 that does not clear in two tries likely indicates
a sustained outage rather than a blip, so further retries just waste tokens.

Cap at 2 attempts with a single 60s backoff. The safety properties are
unchanged: only an explicit 429/overload marker is retried; exit 0/3/4 and
any non-429 error remain final on the first attempt.

Verified by simulating five cases: first-attempt success (1 attempt);
429-then-success recovers (2 attempts); persistent 429 caps at 2; non-429
error does not retry (1 attempt); 429-then-non-429 stops at 2. build/vet
green; no em dashes per AGENTS.md.
---
 .github/workflows/dispatch.yml | 48 +++++++++++++++++++++++++++++-----
 1 file changed, 41 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/dispatch.yml b/.github/workflows/dispatch.yml
index ee67236..4110b42 100644
--- a/.github/workflows/dispatch.yml
+++ b/.github/workflows/dispatch.yml
@@ -143,15 +143,49 @@ jobs:
         id: agent
         env:
           ZAI_API_KEY: ${{ secrets.ZAI_API_KEY }}
+        # Retry ONLY on transient model-service overload. Unlike the install
+        # retry, each attempt here spends real GLM budget, so: (1) retry only
+        # when the output explicitly indicates a 429/overload, never on a
+        # generic error (a deterministic bug would waste tokens retrying);
+        # (2) back off 60s, since overload clears slowly; (3) cap at 2
+        # attempts. A successful run (exit 0/3/4) or a non-429 error (exit 1
+        # without the overload marker) is final.
         run: |
           set +e
-          ~/.local/bin/sesh -p "$(cat TASK.md)" \
-            -yes \
-            -provider zai \
-            -model glm-5.2 \
-            -max-iters 40 \
-            -max-tools 200
-          echo "exit=$?" >> "$GITHUB_OUTPUT"
+          backoffs=(0 60)
+          max=$(( ${#backoffs[@]} - 1 ))
+          log=$(mktemp); trap 'rm -f "$log"' EXIT
+          for i in $(seq 0 "$max"); do
+            # backoff before every attempt except the first
+            if [ "$i" -gt 0 ]; then
+              echo "service was overloaded on attempt $i; backing off ${backoffs[$i]}s before attempt $((i+1))"
+              sleep "${backoffs[$i]}"
+            fi
+            # tee, not $(...): the log streams live and survives a step timeout
+            ~/.local/bin/sesh -p "$(cat TASK.md)" \
+              -yes \
+              -provider zai \
+              -model glm-5.2 \
+              -max-iters 40 \
+              -max-tools 200 2>&1 | tee "$log"
+            rc=${PIPESTATUS[0]}
+            # exit 0 done | 3 stuck | 4 iter cap: all final-success-ish, do not retry
+            if [ "$rc" -eq 0 ] || [ "$rc" -eq 3 ] || [ "$rc" -eq 4 ]; then
+              echo "exit=$rc" >> "$GITHUB_OUTPUT"
+              exit 0
+            fi
+            # only a standalone 429 or an overload marker is retryable; all else is final
+            if ! grep -qiE '(^|[^0-9])429([^0-9]|$)|overloaded|rate.?limit|too many requests' "$log"; then
+              echo "exit=$rc" >> "$GITHUB_OUTPUT"
+              exit 0
+            fi
+            echo "attempt $((i+1)) hit transient overload (rc=$rc)"
+            if [ "$i" -eq "$max" ]; then
+              echo "exhausted $((max+1)) attempts on overload; giving up"
+              echo "exit=$rc" >> "$GITHUB_OUTPUT"
+              exit 0
+            fi
+          done
 
       - name: Commit, push, open PR
         env: