diff --git a/.github/workflows/dispatch.yml b/.github/workflows/dispatch.yml
index ee67236..4110b42 100644
--- a/.github/workflows/dispatch.yml
+++ b/.github/workflows/dispatch.yml
@@ -143,15 +143,58 @@ jobs:
         id: agent
         env:
           ZAI_API_KEY: ${{ secrets.ZAI_API_KEY }}
+        # Retry ONLY on transient model-service overload. Unlike the install
+        # retry, each attempt here spends real GLM budget, so: (1) retry only
+        # when the output explicitly indicates a 429/overload, never on a
+        # generic error (a deterministic bug would waste tokens retrying);
+        # (2) back off 60s, since overload clears slowly; (3) cap at 2
+        # attempts. A successful run (exit 0/3/4) or a non-429 error (exit 1
+        # without the overload marker) is final.
         run: |
           set +e
-          ~/.local/bin/sesh -p "$(cat TASK.md)" \
-            -yes \
-            -provider zai \
-            -model glm-5.2 \
-            -max-iters 40 \
-            -max-tools 200
-          echo "exit=$?" >> "$GITHUB_OUTPUT"
+          # backoffs[i] is the sleep (seconds) taken before attempt i+1; the
+          # array length is therefore also the attempt cap.
+          backoffs=(0 60)
+          attempts=${#backoffs[@]}
+          # Transient-overload markers. 429 must stand alone as a number so that
+          # counters or ids such as "14290" do not trigger a paid retry.
+          retryable='(^|[^0-9])429([^0-9]|$)|overloaded|rate.?limit|too many requests'
+          for (( i = 0; i < attempts; i++ )); do
+            # backoff before every attempt except the first
+            if (( i > 0 )); then
+              echo "service was overloaded on attempt $i; backing off ${backoffs[i]}s before attempt $((i+1))"
+              sleep "${backoffs[i]}"
+            fi
+            # Capture stdout+stderr so the overload marker can be searched for;
+            # the transcript is replayed to the job log right after.
+            out=$(~/.local/bin/sesh -p "$(cat TASK.md)" \
+              -yes \
+              -provider zai \
+              -model glm-5.2 \
+              -max-iters 40 \
+              -max-tools 200 2>&1)
+            rc=$?
+            printf '%s\n' "$out"
+            # exit 0 done | 3 stuck | 4 iter cap: all final-success-ish, do not retry
+            case "$rc" in
+              0|3|4) break ;;
+            esac
+            # only a 429/overload marker is retryable; anything else is final.
+            # A here-string is used rather than `printf | grep -q`: if pipefail is
+            # in effect, grep -q exiting early can SIGPIPE the writer and make a
+            # real match look like "no marker".
+            if ! grep -qiE "$retryable" <<<"$out"; then
+              break
+            fi
+            echo "attempt $((i+1)) hit transient overload (rc=$rc)"
+            if (( i + 1 == attempts )); then
+              echo "exhausted $attempts attempts on overload; giving up"
+            fi
+          done
+          # Every path records the last attempt's exit code as the step output
+          # `exit` and ends the step successfully.
+          echo "exit=$rc" >> "$GITHUB_OUTPUT"
+          exit 0
 
       - name: Commit, push, open PR
         env: