diff --git a/CHANGELOG.md b/CHANGELOG.md index d127194..d7934f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ The format is intentionally lightweight and human-readable. Group entries by rel - Added an opt-in `auto_update` policy block plus `foundrygate-auto-update` so controlled deployments can gate helper-driven updates without enabling silent self-updates - Added `GET /api/operator-events` plus operator-event metrics for update checks and helper-driven auto-update attempts - Added dashboard cards and tables for operator-side update checks and apply attempts +- Added provider-health rollout guardrails so helper-driven auto-updates can block when gateway health is already degraded ## v0.6.0 - 2026-03-12 diff --git a/README.md b/README.md index e76dd6d..0bee303 100644 --- a/README.md +++ b/README.md @@ -543,6 +543,8 @@ Supported fields in `auto_update`: - `enabled` - `allow_major` +- `require_healthy_providers` +- `max_unhealthy_providers` - `apply_command` Example: @@ -551,6 +553,8 @@ Example: auto_update: enabled: true allow_major: false + require_healthy_providers: true + max_unhealthy_providers: 0 apply_command: "foundrygate-update" ``` @@ -559,6 +563,7 @@ What the current runtime does with it: - exposes eligibility in `GET /api/update` under `auto_update` - shows the same state in the dashboard - lets `foundrygate-auto-update --apply` run only when the current release state is eligible +- can block helper-driven rollout when provider health is already degraded What it still does not do: diff --git a/config.yaml b/config.yaml index b9bbe69..64364d8 100644 --- a/config.yaml +++ b/config.yaml @@ -888,6 +888,8 @@ update_check: auto_update: enabled: false allow_major: false + require_healthy_providers: true + max_unhealthy_providers: 0 apply_command: "foundrygate-update" diff --git a/docs/PUBLISHING.md b/docs/PUBLISHING.md index 857d121..74b7867 100644 --- a/docs/PUBLISHING.md +++ b/docs/PUBLISHING.md @@ -62,6 +62,7 @@ If you want scheduled update 
application: - keep `auto_update.enabled: true` explicit in `config.yaml` - keep `allow_major: false` unless you are ready to absorb breaking changes automatically +- keep `require_healthy_providers: true` unless you are intentionally allowing rollouts while the gateway is degraded - prefer the reviewed examples in [examples/foundrygate-auto-update.service](./examples/foundrygate-auto-update.service) and [examples/foundrygate-auto-update.timer](./examples/foundrygate-auto-update.timer) - use the cron example in [examples/foundrygate-auto-update.cron](./examples/foundrygate-auto-update.cron) only when `systemd` timers are not practical diff --git a/docs/TROUBLESHOOTING.md b/docs/TROUBLESHOOTING.md index e960205..cd66ab5 100644 --- a/docs/TROUBLESHOOTING.md +++ b/docs/TROUBLESHOOTING.md @@ -181,4 +181,6 @@ If `foundrygate-auto-update --apply` refuses to run, inspect the `auto_update` b - `auto_update.enabled: false` - the latest release is a major upgrade while `allow_major: false` +- no provider is healthy while `require_healthy_providers: true` +- the number of unhealthy providers exceeds `max_unhealthy_providers` while `require_healthy_providers: true` - the release lookup itself is unavailable diff --git a/foundrygate/config.py b/foundrygate/config.py index 222d483..6baa372 100644 --- a/foundrygate/config.py +++ b/foundrygate/config.py @@ -892,6 +892,16 @@ def _normalize_auto_update(data: dict[str, Any]) -> dict[str, Any]: if not isinstance(allow_major, bool): raise ConfigError("'auto_update.allow_major' must be a boolean") + require_healthy_providers = raw.get("require_healthy_providers", True) + if not isinstance(require_healthy_providers, bool): + raise ConfigError("'auto_update.require_healthy_providers' must be a boolean") + + max_unhealthy_providers = raw.get("max_unhealthy_providers", 0) + if isinstance(max_unhealthy_providers, bool) or not isinstance(max_unhealthy_providers, int): + raise ConfigError("'auto_update.max_unhealthy_providers' must be a non-negative integer") + if 
max_unhealthy_providers < 0: + raise ConfigError("'auto_update.max_unhealthy_providers' must be non-negative") + apply_command = raw.get("apply_command", "foundrygate-update") if not isinstance(apply_command, str) or not apply_command.strip(): raise ConfigError("'auto_update.apply_command' must be a non-empty string") @@ -900,6 +910,8 @@ def _normalize_auto_update(data: dict[str, Any]) -> dict[str, Any]: normalized["auto_update"] = { "enabled": enabled, "allow_major": allow_major, + "require_healthy_providers": require_healthy_providers, + "max_unhealthy_providers": max_unhealthy_providers, "apply_command": apply_command.strip(), } return normalized @@ -989,6 +1001,8 @@ def auto_update(self) -> dict: { "enabled": False, "allow_major": False, + "require_healthy_providers": True, + "max_unhealthy_providers": 0, "apply_command": "foundrygate-update", }, ) diff --git a/foundrygate/main.py b/foundrygate/main.py index 80093b2..3cf2473 100644 --- a/foundrygate/main.py +++ b/foundrygate/main.py @@ -23,7 +23,7 @@ from .metrics import MetricsStore, calc_cost from .providers import ProviderBackend, ProviderError from .router import Router, RoutingDecision -from .updates import UpdateChecker +from .updates import UpdateChecker, apply_auto_update_guardrails logger = logging.getLogger("foundrygate") @@ -248,6 +248,17 @@ def _build_capability_coverage() -> dict[str, dict[str, Any]]: return dict(sorted(coverage.items())) +def _health_summary() -> dict[str, int]: + """Return a compact provider-health summary for operator guardrails.""" + providers_healthy = sum(1 for provider in _providers.values() if provider.health.healthy) + providers_unhealthy = sum(1 for provider in _providers.values() if not provider.health.healthy) + return { + "providers_total": len(_providers), + "providers_healthy": providers_healthy, + "providers_unhealthy": providers_unhealthy, + } + + def _estimate_request_dimensions(body: dict[str, Any]) -> dict[str, int | str]: """Return lightweight request-dimension 
estimates for debugging and routing preview.""" messages = body.get("messages", []) @@ -675,13 +686,7 @@ async def health(): } return { "status": "ok", - "summary": { - "providers_total": len(providers), - "providers_healthy": sum(1 for provider in providers.values() if provider["healthy"]), - "providers_unhealthy": sum( - 1 for provider in providers.values() if not provider["healthy"] - ), - }, + "summary": _health_summary(), "coverage": _build_capability_coverage(), "providers": providers, } @@ -821,6 +826,11 @@ async def update_status(request: Request, force: bool = False): """Return cached or fresh release update metadata.""" headers = _collect_routing_headers(request) status = await _update_checker.get_status(force=force) + status.auto_update = apply_auto_update_guardrails( + status.auto_update or {}, + providers_healthy=_health_summary()["providers_healthy"], + providers_unhealthy=_health_summary()["providers_unhealthy"], + ) operator_action, client_tag = _collect_operator_context(headers) auto_update = status.auto_update or {} _metrics.log_operator_event( diff --git a/foundrygate/updates.py b/foundrygate/updates.py index fd5ed8a..8e39c84 100644 --- a/foundrygate/updates.py +++ b/foundrygate/updates.py @@ -74,6 +74,38 @@ def alert_level_for_update(update_type: str, *, available: bool, status: str) -> return "warning" +def apply_auto_update_guardrails( + auto_update: dict[str, Any], + *, + providers_healthy: int, + providers_unhealthy: int, +) -> dict[str, Any]: + """Apply provider-health guardrails to one auto-update eligibility result.""" + result = dict(auto_update or {}) + if not result.get("enabled") or not result.get("eligible"): + return result + + require_healthy_providers = bool(result.get("require_healthy_providers", True)) + max_unhealthy_providers = int(result.get("max_unhealthy_providers", 0)) + + if not require_healthy_providers: + return result + + if providers_healthy <= 0: + result["eligible"] = False + result["blocked_reason"] = "No healthy 
providers available" + return result + + if providers_unhealthy > max_unhealthy_providers: + result["eligible"] = False + result["blocked_reason"] = ( + f"Too many unhealthy providers ({providers_unhealthy} > {max_unhealthy_providers})" + ) + return result + + return result + + @dataclass class UpdateStatus: """Structured update-check result.""" @@ -133,6 +165,10 @@ def __init__( self.auto_update = { "enabled": bool((auto_update or {}).get("enabled", False)), "allow_major": bool((auto_update or {}).get("allow_major", False)), + "require_healthy_providers": bool( + (auto_update or {}).get("require_healthy_providers", True) + ), + "max_unhealthy_providers": int((auto_update or {}).get("max_unhealthy_providers", 0)), "apply_command": str((auto_update or {}).get("apply_command", "foundrygate-update")), } self._cached = UpdateStatus( @@ -187,6 +223,10 @@ def _auto_update_status( "strategy": "script", "allowed_update_types": allowed_types, "allow_major": allow_major, + "require_healthy_providers": bool( + self.auto_update.get("require_healthy_providers", True) + ), + "max_unhealthy_providers": int(self.auto_update.get("max_unhealthy_providers", 0)), "eligible": eligible, "blocked_reason": blocked_reason, "apply_command": apply_command, diff --git a/tests/test_config.py b/tests/test_config.py index 09b00fc..f9409be 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -87,4 +87,6 @@ def test_auto_update_defaults_are_exposed(): cfg = load_config(Path(__file__).parent.parent / "config.yaml") assert cfg.auto_update["enabled"] is False assert cfg.auto_update["allow_major"] is False + assert cfg.auto_update["require_healthy_providers"] is True + assert cfg.auto_update["max_unhealthy_providers"] == 0 assert cfg.auto_update["apply_command"] == "foundrygate-update" diff --git a/tests/test_updates.py b/tests/test_updates.py index 0dd4da6..559f545 100644 --- a/tests/test_updates.py +++ b/tests/test_updates.py @@ -7,6 +7,7 @@ from foundrygate.updates import ( 
UpdateChecker, alert_level_for_update, + apply_auto_update_guardrails, classify_update, is_update_available, ) @@ -57,6 +58,56 @@ def test_alert_level_maps_update_type_and_status(): assert alert_level_for_update("unknown", available=False, status="unavailable") == "warning" +def test_auto_update_guardrails_block_when_too_many_providers_are_unhealthy(): + guarded = apply_auto_update_guardrails( + { + "enabled": True, + "eligible": True, + "require_healthy_providers": True, + "max_unhealthy_providers": 0, + "blocked_reason": "", + }, + providers_healthy=1, + providers_unhealthy=1, + ) + + assert guarded["eligible"] is False + assert guarded["blocked_reason"] == "Too many unhealthy providers (1 > 0)" + + +def test_auto_update_guardrails_allow_updates_when_health_budget_is_met(): + guarded = apply_auto_update_guardrails( + { + "enabled": True, + "eligible": True, + "require_healthy_providers": True, + "max_unhealthy_providers": 1, + "blocked_reason": "", + }, + providers_healthy=2, + providers_unhealthy=1, + ) + + assert guarded["eligible"] is True + + +def test_auto_update_guardrails_block_when_no_provider_is_healthy(): + guarded = apply_auto_update_guardrails( + { + "enabled": True, + "eligible": True, + "require_healthy_providers": True, + "max_unhealthy_providers": 2, + "blocked_reason": "", + }, + providers_healthy=0, + providers_unhealthy=2, + ) + + assert guarded["eligible"] is False + assert guarded["blocked_reason"] == "No healthy providers available" + + @pytest.mark.asyncio async def test_update_checker_reports_latest_release(): checker = UpdateChecker(