From d0b9e008a7207bd64e4504ab8c96cb03519e4684 Mon Sep 17 00:00:00 2001
From: doquanghuy <doquanghuy91@gmail.com>
Date: Thu, 28 May 2026 01:11:01 +0700
Subject: [PATCH 1/2] feat(workflows): add continue_on_error step field
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds an optional `continue_on_error: bool` field on every step.
When set to `true` and the step fails, the engine records the
result (`exit_code`, `stderr` on `steps.<id>.output` plus `status`
as a sibling key on `steps.<id>`) and continues to the next sibling
step instead of halting the run. Downstream `if`, `switch`, or
`gate` steps can then branch on
`{{ steps.<id>.output.exit_code }}` to route the recovery path.

Engine details
--------------
`WorkflowEngine._execute_steps` now consults the step config when a
step returns `StepStatus.FAILED`:

- Gate aborts (`output.aborted`) always halt the run — operator
  decisions take precedence over the flag.
- Otherwise, if `continue_on_error` is the literal `True`, log a
  `step_continue_on_error` event and proceed to the next sibling.
  The runtime check uses identity comparison (`is True`) rather
  than truthiness, so truthy non-bool values like the string
  `"true"` cannot silently change run semantics even if a caller
  bypasses `validate_workflow()`.
- Otherwise, behave as before: log `step_failed`, set
  `RunStatus.FAILED`, and return.

Validation
----------
`_validate_steps` rejects non-bool values for `continue_on_error`.
Coerced strings like `"true"` are not accepted so authoring
mistakes surface at validation time rather than silently changing
run semantics.

Tests
-----
`TestContinueOnError` in `tests/test_workflows.py` (8 tests):
- `test_undeclared_failure_halts_run` — default halt behaviour.
- `test_declared_and_fired_continues_run` — flag + fail → continue.
- `test_declared_but_step_succeeded_is_noop` — flag + success → no-op.
- `test_if_branch_routes_around_failure` — end-to-end recovery.
- `test_gate_abort_still_halts_with_continue_on_error` — abort
  always halts.
- `test_validation_rejects_non_bool_continue_on_error` — `"true"`
  rejected at validation.
- `test_validation_accepts_bool_continue_on_error` — `true`/`false`
  pass cleanly.
- `test_engine_ignores_truthy_non_bool_continue_on_error` —
  defense-in-depth: engine ignores string `"true"` even when
  validation is bypassed.

Rebased onto current upstream/main (post #2664 merge); the new
`TestContinueOnError` class sits immediately after upstream's
`TestContextRunId` so the two feature suites coexist cleanly.

Closes #2591.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/specify_cli/workflows/engine.py |  58 +++++-
 tests/test_workflows.py             | 297 ++++++++++++++++++++++++++++
 workflows/README.md                 |  72 ++++---
 3 files changed, 396 insertions(+), 31 deletions(-)
diff --git a/src/specify_cli/workflows/engine.py b/src/specify_cli/workflows/engine.py
index 65d9b3a272..fe1c520515 100644
--- a/src/specify_cli/workflows/engine.py
+++ b/src/specify_cli/workflows/engine.py
@@ -231,6 +231,20 @@ def _validate_steps(
             step_errors = step_impl.validate(step_config)
             errors.extend(step_errors)
 
+        # Validate optional `continue_on_error` field. The engine honours
+        # this on any step that returns FAILED so the pipeline can route
+        # around the failure via downstream `if`/`switch`/`gate`. The
+        # field must be a literal boolean — coercion from truthy strings
+        # is deliberately not supported so authoring mistakes surface
+        # at validation time rather than silently changing run semantics.
+        if "continue_on_error" in step_config:
+            coe = step_config["continue_on_error"]
+            if not isinstance(coe, bool):
+                errors.append(
+                    f"Step {step_id!r}: 'continue_on_error' must be a "
+                    f"boolean, got {type(coe).__name__}."
+                )
+
         # Recursively validate nested steps
         for nested_key in ("then", "else", "steps"):
             nested = step_config.get(nested_key)
@@ -622,7 +636,10 @@ def _execute_steps(
 
             # Handle failures
             if result.status == StepStatus.FAILED:
-                # Gate abort (output.aborted) maps to ABORTED status
+                # Gate abort (output.aborted) maps to ABORTED status.
+                # Aborts are deliberate operator decisions, so
+                # `continue_on_error` does NOT override them — that flag
+                # is for transient/expected step failures only.
                 if result.output.get("aborted"):
                     state.status = RunStatus.ABORTED
                     state.append_log(
@@ -631,15 +648,48 @@ def _execute_steps(
                             "step_id": step_id,
                         }
                     )
-                else:
-                    state.status = RunStatus.FAILED
+                    state.save()
+                    return
+
+                # `continue_on_error: true` lets the pipeline route
+                # around the failure instead of halting. The step
+                # result (including exit_code, stderr, status) is
+                # still recorded so downstream `if`/`switch`/`gate`
+                # steps can branch on it. Log a single, unambiguous
+                # event per failure resolution — either the run
+                # continued past it, or it halted.
+                #
+                # Use identity comparison (`is True`) rather than
+                # truthiness so that only a literal boolean enables
+                # the behaviour, even if validation was skipped.
+                # Validation rejects non-bool values at parse time,
+                # but `WorkflowEngine.execute()` does not auto-validate
+                # (see `WorkflowEngine.load_workflow`, whose docstring
+                # explicitly notes "not yet validated; call
+                # `validate_workflow()` or `engine.validate()`
+                # separately"), so a caller passing an unvalidated
+                # definition could otherwise see truthy non-bool
+                # values like the string `"true"` silently change
+                # run semantics.
+                if step_config.get("continue_on_error") is True:
                     state.append_log(
                         {
-                            "event": "step_failed",
+                            "event": "step_continue_on_error",
                             "step_id": step_id,
                             "error": result.error,
                         }
                     )
+                    state.save()
+                    continue
+
+                state.status = RunStatus.FAILED
+                state.append_log(
+                    {
+                        "event": "step_failed",
+                        "step_id": step_id,
+                        "error": result.error,
+                    }
+                )
                 state.save()
                 return
 
diff --git a/tests/test_workflows.py b/tests/test_workflows.py
index 114e6b02cd..0a5e61c2af 100644
--- a/tests/test_workflows.py
+++ b/tests/test_workflows.py
@@ -2333,6 +2333,303 @@ def test_workflow_without_context_reference_unchanged(self, project_dir):
         assert state.step_results["only-step"]["output"]["stdout"].strip() == "hello"
 
 
+# ===== continue_on_error Tests =====
+#
+# Locks the contract documented in workflows/README.md "Error Handling"
+# section: when an executable step fails and `continue_on_error: true`
+# is declared, the engine records the step's `output` (with `exit_code`
+# and `stderr` from the failure) and its `status` (sibling key on
+# `steps.<id>`, not nested under `output`) and continues to the next
+# sibling step instead of halting the run. Gate aborts
+# (`output.aborted`) still halt regardless of the flag.
+
+
+class TestContinueOnError:
+    """Test the `continue_on_error` step-level field."""
+
+    def test_undeclared_failure_halts_run(self, project_dir):
+        """Default behaviour (no `continue_on_error`): a failing step
+        halts the workflow run with `status == FAILED`.
+
+        Locks the byte-equivalent default — workflows that do not
+        declare the flag must behave exactly as before this feature.
+        """
+        from specify_cli.workflows.engine import WorkflowDefinition, WorkflowEngine
+        from specify_cli.workflows.base import RunStatus
+
+        definition = WorkflowDefinition.from_string("""
+schema_version: "1.0"
+workflow:
+  id: "halt-on-fail"
+  name: "Halt On Fail"
+  version: "1.0.0"
+steps:
+  - id: fail-step
+    type: shell
+    run: "exit 7"
+  - id: after
+    type: shell
+    run: "echo should-not-run"
+""")
+        engine = WorkflowEngine(project_dir)
+        state = engine.execute(definition)
+
+        assert state.status == RunStatus.FAILED
+        assert "fail-step" in state.step_results
+        assert state.step_results["fail-step"]["output"]["exit_code"] == 7
+        # Subsequent step never executes when the flag is absent.
+        assert "after" not in state.step_results
+
+    def test_declared_and_fired_continues_run(self, project_dir):
+        """`continue_on_error: true` + failing step: the run keeps
+        going, the failed step's result is recorded, and the
+        downstream step runs.
+        """
+        from specify_cli.workflows.engine import WorkflowDefinition, WorkflowEngine
+        from specify_cli.workflows.base import RunStatus
+
+        definition = WorkflowDefinition.from_string("""
+schema_version: "1.0"
+workflow:
+  id: "continue-past-fail"
+  name: "Continue Past Fail"
+  version: "1.0.0"
+steps:
+  - id: flaky-step
+    type: shell
+    run: "exit 42"
+    continue_on_error: true
+  - id: after
+    type: shell
+    run: "echo did-run"
+""")
+        engine = WorkflowEngine(project_dir)
+        state = engine.execute(definition)
+
+        assert state.status == RunStatus.COMPLETED
+        # Failed step's exit_code is preserved so downstream branching
+        # can inspect it.
+        assert state.step_results["flaky-step"]["output"]["exit_code"] == 42
+        assert state.step_results["flaky-step"]["status"] == "failed"
+        # Downstream step ran successfully.
+        assert state.step_results["after"]["output"]["exit_code"] == 0
+
+    def test_declared_but_step_succeeded_is_noop(self, project_dir):
+        """`continue_on_error: true` on a step that succeeds is a
+        no-op — the flag only changes behaviour on FAILED status.
+        """
+        from specify_cli.workflows.engine import WorkflowDefinition, WorkflowEngine
+        from specify_cli.workflows.base import RunStatus
+
+        definition = WorkflowDefinition.from_string("""
+schema_version: "1.0"
+workflow:
+  id: "flag-but-success"
+  name: "Flag But Success"
+  version: "1.0.0"
+steps:
+  - id: ok-step
+    type: shell
+    run: "echo ok"
+    continue_on_error: true
+  - id: after
+    type: shell
+    run: "echo done"
+""")
+        engine = WorkflowEngine(project_dir)
+        state = engine.execute(definition)
+
+        assert state.status == RunStatus.COMPLETED
+        assert state.step_results["ok-step"]["status"] == "completed"
+        assert state.step_results["ok-step"]["output"]["exit_code"] == 0
+        assert state.step_results["after"]["output"]["exit_code"] == 0
+
+    def test_if_branch_routes_around_failure(self, project_dir):
+        """End-to-end: `continue_on_error` + `if` cleanly routes around
+        a failure. The recovery branch runs; the success branch does
+        not.
+
+        Mirrors the canonical usage pattern from the original feature
+        discussion in issue #2591.
+        """
+        from specify_cli.workflows.engine import WorkflowDefinition, WorkflowEngine
+        from specify_cli.workflows.base import RunStatus
+
+        definition = WorkflowDefinition.from_string("""
+schema_version: "1.0"
+workflow:
+  id: "route-around"
+  name: "Route Around Failure"
+  version: "1.0.0"
+steps:
+  - id: heavy-thing
+    type: shell
+    run: "exit 1"
+    continue_on_error: true
+  - id: check-result
+    type: if
+    condition: "{{ steps.heavy-thing.output.exit_code != 0 }}"
+    then:
+      - id: recovery
+        type: shell
+        run: "echo recovery-ran"
+    else:
+      - id: happy-path
+        type: shell
+        run: "echo happy-path-ran"
+""")
+        engine = WorkflowEngine(project_dir)
+        state = engine.execute(definition)
+
+        assert state.status == RunStatus.COMPLETED
+        assert "recovery" in state.step_results
+        assert "happy-path" not in state.step_results
+
+    def test_gate_abort_still_halts_with_continue_on_error(
+        self, project_dir, monkeypatch
+    ):
+        """`continue_on_error` does NOT override a deliberate gate
+        abort. `output.aborted` always halts the run with
+        `status == ABORTED`.
+
+        Aborts are explicit operator decisions; continue_on_error
+        is for transient/expected step failures only.
+        """
+        from specify_cli.workflows.engine import WorkflowDefinition, WorkflowEngine
+        from specify_cli.workflows.base import RunStatus
+        from specify_cli.workflows.steps.gate import GateStep
+        from specify_cli.workflows.steps import gate as gate_module
+
+        # Force the gate step into interactive mode and feed a "reject"
+        # choice so the abort path actually runs in the test env
+        # (default behaviour returns PAUSED when stdin is not a TTY).
+        # Swap sys.stdin itself for a stub: setattr on the real
+        # TextIOWrapper's `isatty` method is not assignable under some
+        # runners (e.g. pytest with capture disabled).
+        class _TTYStdin:
+            def isatty(self) -> bool:
+                return True
+
+        monkeypatch.setattr(gate_module.sys, "stdin", _TTYStdin())
+        monkeypatch.setattr(
+            GateStep, "_prompt", staticmethod(lambda _msg, _opts: "reject")
+        )
+
+        definition = WorkflowDefinition.from_string("""
+schema_version: "1.0"
+workflow:
+  id: "gate-abort-halts"
+  name: "Gate Abort Halts"
+  version: "1.0.0"
+steps:
+  - id: gate-step
+    type: gate
+    message: "Approve?"
+    options: [approve, reject]
+    on_reject: abort
+    continue_on_error: true
+  - id: should-not-run
+    type: shell
+    run: "echo nope"
+""")
+        engine = WorkflowEngine(project_dir)
+        state = engine.execute(definition)
+
+        assert state.status == RunStatus.ABORTED
+        assert "should-not-run" not in state.step_results
+
+    def test_validation_rejects_non_bool_continue_on_error(self):
+        """`continue_on_error` must be a literal boolean; coerced
+        strings like `"true"` are rejected at validation time so
+        authoring mistakes surface before execution.
+        """
+        from specify_cli.workflows.engine import (
+            WorkflowDefinition,
+            validate_workflow,
+        )
+
+        definition = WorkflowDefinition.from_string("""
+schema_version: "1.0"
+workflow:
+  id: "bad-coe"
+  name: "Bad COE"
+  version: "1.0.0"
+steps:
+  - id: step-one
+    type: shell
+    run: "true"
+    continue_on_error: "true"
+""")
+        errors = validate_workflow(definition)
+        assert any(
+            "continue_on_error" in e and "boolean" in e for e in errors
+        ), errors
+
+    def test_validation_accepts_bool_continue_on_error(self):
+        """Boolean values pass validation cleanly."""
+        from specify_cli.workflows.engine import (
+            WorkflowDefinition,
+            validate_workflow,
+        )
+
+        for value in (True, False):
+            yaml_value = "true" if value else "false"
+            definition = WorkflowDefinition.from_string(f"""
+schema_version: "1.0"
+workflow:
+  id: "good-coe"
+  name: "Good COE"
+  version: "1.0.0"
+steps:
+  - id: step-one
+    type: shell
+    run: "true"
+    continue_on_error: {yaml_value}
+""")
+            errors = validate_workflow(definition)
+            assert errors == [], errors
+
+    def test_engine_ignores_truthy_non_bool_continue_on_error(self, project_dir):
+        """Defense-in-depth: even if a caller bypasses
+        `validate_workflow()` and feeds the engine a definition with
+        `continue_on_error: "true"` (a string), the engine must NOT
+        honour the flag — only a literal boolean enables the
+        behaviour. `WorkflowEngine.execute()` does not auto-validate
+        (the `WorkflowEngine.load_workflow` docstring explicitly
+        notes the definition is "not yet validated; call
+        `validate_workflow()` or `engine.validate()` separately"),
+        so the engine guards against truthy non-bool values itself
+        via an identity check rather than truthiness.
+        """
+        from specify_cli.workflows.engine import WorkflowDefinition, WorkflowEngine
+        from specify_cli.workflows.base import RunStatus
+
+        # Bypass `validate_workflow()` — execute() is what would
+        # be called by a caller that skipped validation.
+        definition = WorkflowDefinition.from_string("""
+schema_version: "1.0"
+workflow:
+  id: "string-coe"
+  name: "String COE"
+  version: "1.0.0"
+steps:
+  - id: fail-step
+    type: shell
+    run: "exit 1"
+    continue_on_error: "true"
+  - id: should-not-run
+    type: shell
+    run: "echo should-not-run"
+""")
+        engine = WorkflowEngine(project_dir)
+        state = engine.execute(definition)
+
+        # String "true" is truthy but not a literal boolean, so the
+        # engine must treat the step as a halting failure.
+        assert state.status == RunStatus.FAILED
+        assert "should-not-run" not in state.step_results
+
+
 # ===== State Persistence Tests =====
 
 class TestRunState:
diff --git a/workflows/README.md b/workflows/README.md
index eb3d090ad0..9a61540b3e 100644
--- a/workflows/README.md
+++ b/workflows/README.md
@@ -219,6 +219,51 @@ Aggregate results from fan-out steps:
   output: {}
 ```
 
+## Error Handling
+
+By default, any step that ends in `StepStatus.FAILED` at runtime halts
+the entire run — most commonly a `shell` or `command` step exiting
+non-zero, but also any other runtime error raised during step
+execution. (Invalid workflow definitions are rejected up-front by
+`specify workflow run` before the run even starts, so structural
+validation failures never reach this code path.) Set
+`continue_on_error: true` on a step to record its result and continue
+to the next sibling step instead. When the failure was a non-zero
+exit, the exit code remains available on
+`steps.<id>.output.exit_code` so downstream `if`, `switch`, or `gate`
+steps can branch on it:
+
+```yaml
+- id: heavy-thing
+  type: command
+  integration: claude
+  command: speckit.heavy-thing
+  continue_on_error: true
+
+- id: check-result
+  type: if
+  condition: "{{ steps.heavy-thing.output.exit_code != 0 }}"
+  then:
+    - id: review
+      type: gate
+      message: "Step failed (exit {{ steps.heavy-thing.output.exit_code }}). Retry or skip?"
+      on_reject: skip
+  else:
+    - id: next-thing
+      command: speckit.next-thing
+```
+
+**Notes:**
+
+- The field must be a literal boolean (`true` / `false`); coerced
+  strings like `"true"` are rejected at validation time.
+- Gate aborts (`on_reject: abort` chosen by the operator) always halt
+  the run — `continue_on_error` does not override them. The flag is
+  for transient/expected step failures, not for overriding deliberate
+  operator decisions.
+- When the flag is omitted, behaviour is byte-equivalent to before
+  this feature.
+
 ## Expressions
 
 Workflow definitions use `{{ expression }}` syntax for dynamic values:
@@ -239,33 +284,6 @@ message: "{{ status | default('pending') }}"
 
 Supported filters: `default`, `join`, `contains`, `map`.
 
-### Runtime Context
-
-`{{ context.* }}` exposes engine-managed runtime metadata for the
-current run:
-
-| Variable | Description |
-|----------|-------------|
-| `context.run_id` | The current workflow run id (the same value Spec Kit prints as `Run ID:` at the end of `workflow run`). Auto-generated runs are 8-character hex from `uuid4`; operator-supplied ids may be any alphanumeric string with hyphens or underscores. Empty string outside a run context. |
-
-```yaml
-# Stamp telemetry events with the run id for cross-system join.
-- id: emit-event
-  type: shell
-  run: 'echo "{\"run_id\":\"{{ context.run_id }}\",\"event\":\"started\"}" >> events.jsonl'
-
-# Per-run scratch directory.
-- id: prep-scratch
-  type: shell
-  run: 'mkdir -p /tmp/run-{{ context.run_id }}'
-
-# Pass run id into a command for artifact metadata.
-- id: tag-artifact
-  command: speckit.specify
-  input:
-    args: "{{ context.run_id }}"
-```
-
 ## Input Types
 
 Workflow inputs are type-checked and coerced from CLI string values:

From 22c5184fa35cd305a5e7383064f583f3959ddff6 Mon Sep 17 00:00:00 2001
From: doquanghuy <doquanghuy91@gmail.com>
Date: Thu, 28 May 2026 21:33:40 +0700
Subject: [PATCH 2/2] docs(workflows): restore runtime context section, clarify
 gate prompt

Two Copilot findings on d0b9e00:

1. The `### Runtime Context` documentation for `{{ context.* }}` was
   lost during the rebase onto current main (the squash dropped the
   anchor where #2664 had added it). Restored under `## Expressions`
   so users can find `context.run_id` semantics and examples.

2. The continue_on_error example gate had message "Retry or skip?"
   but used the default `options: [approve, reject]` with `on_reject:
   skip`, which implied an automatic retry path that gates do not
   provide. Reworded the message to match the actual approve/reject
   semantics and added an explicit note that retry requires either
   custom gate options + downstream branching or a wrapper loop.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 workflows/README.md | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/workflows/README.md b/workflows/README.md
index 9a61540b3e..40c593fb61 100644
--- a/workflows/README.md
+++ b/workflows/README.md
@@ -246,13 +246,19 @@ steps can branch on it:
   then:
     - id: review
       type: gate
-      message: "Step failed (exit {{ steps.heavy-thing.output.exit_code }}). Retry or skip?"
+      message: "Step failed (exit {{ steps.heavy-thing.output.exit_code }}). Approve to continue, or reject to skip the rest of this branch."
       on_reject: skip
   else:
     - id: next-thing
       command: speckit.next-thing
 ```
 
+The gate's default `options: [approve, reject]` map directly to "continue
+the run" vs. "skip the rest of this branch" — gates do not automatically
+re-run the failed step. To express a retry path, either define custom
+gate options and branch on the choice downstream, or wrap the failing
+step in your own loop.
+
 **Notes:**
 
 - The field must be a literal boolean (`true` / `false`); coerced
@@ -284,6 +290,33 @@ message: "{{ status | default('pending') }}"
 
 Supported filters: `default`, `join`, `contains`, `map`.
 
+### Runtime Context
+
+`{{ context.* }}` exposes engine-managed runtime metadata for the
+current run:
+
+| Variable | Description |
+|----------|-------------|
+| `context.run_id` | The current workflow run id (the same value Spec Kit prints as `Run ID:` at the end of `workflow run`). Auto-generated runs are 8-character hex from `uuid4`; operator-supplied ids may be any alphanumeric string with hyphens or underscores. Empty string outside a run context. |
+
+```yaml
+# Stamp telemetry events with the run id for cross-system join.
+- id: emit-event
+  type: shell
+  run: 'echo "{\"run_id\":\"{{ context.run_id }}\",\"event\":\"started\"}" >> events.jsonl'
+
+# Per-run scratch directory.
+- id: prep-scratch
+  type: shell
+  run: 'mkdir -p /tmp/run-{{ context.run_id }}'
+
+# Pass run id into a command for artifact metadata.
+- id: tag-artifact
+  command: speckit.specify
+  input:
+    args: "{{ context.run_id }}"
+```
+
 ## Input Types
 
 Workflow inputs are type-checked and coerced from CLI string values: