From 0f39b9637ef79e5924db96726f064b18ad8ef8ec Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Mon, 1 Jun 2026 22:18:03 +0200 Subject: [PATCH 01/22] Add Cloud Run ASGI runtime --- gcp/cloud_run/Dockerfile | 22 ++++++++++++++++++++ gcp/cloud_run/Dockerfile.dockerignore | 17 +++++++++++++++ gcp/cloud_run/start.sh | 30 +++++++++++++++++++++++++++ 3 files changed, 69 insertions(+) create mode 100644 gcp/cloud_run/Dockerfile create mode 100644 gcp/cloud_run/Dockerfile.dockerignore create mode 100755 gcp/cloud_run/start.sh diff --git a/gcp/cloud_run/Dockerfile b/gcp/cloud_run/Dockerfile new file mode 100644 index 000000000..d2a9191f9 --- /dev/null +++ b/gcp/cloud_run/Dockerfile @@ -0,0 +1,22 @@ +FROM python:3.12-slim + +RUN apt-get update \ + && apt-get install -y --no-install-recommends build-essential redis-server \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY pyproject.toml README.md ./ +COPY policyengine_api ./policyengine_api +COPY gcp/cloud_run/start.sh ./start.sh + +RUN chmod +x ./start.sh \ + && pip install --no-cache-dir --upgrade pip \ + && pip install --no-cache-dir -e . + +ENV GATEWAY_AUTH_REQUIRED=1 +ENV CACHE_REDIS_HOST=127.0.0.1 +ENV CACHE_REDIS_PORT=6379 +ENV CACHE_REDIS_DB=0 + +CMD ["/bin/sh", "/app/start.sh"] diff --git a/gcp/cloud_run/Dockerfile.dockerignore b/gcp/cloud_run/Dockerfile.dockerignore new file mode 100644 index 000000000..b14b5958e --- /dev/null +++ b/gcp/cloud_run/Dockerfile.dockerignore @@ -0,0 +1,17 @@ +# Cloud Run builds from the repository root, but the runtime image only needs +# the package and the entrypoint script. 
+* + +!README.md +!pyproject.toml +!policyengine_api/ +!policyengine_api/** +!gcp/ +!gcp/cloud_run/ +!gcp/cloud_run/start.sh + +**/__pycache__/ +**/*.py[cod] +**/.DS_Store +policyengine_api/data/*.db +policyengine_api/data/*.db-journal diff --git a/gcp/cloud_run/start.sh b/gcp/cloud_run/start.sh new file mode 100755 index 000000000..7fc4df859 --- /dev/null +++ b/gcp/cloud_run/start.sh @@ -0,0 +1,30 @@ +#!/bin/sh +set -eu + +PORT="${PORT:-8080}" +CACHE_REDIS_HOST="${CACHE_REDIS_HOST:-127.0.0.1}" +CACHE_REDIS_PORT="${CACHE_REDIS_PORT:-6379}" +CACHE_REDIS_DB="${CACHE_REDIS_DB:-0}" +WEB_CONCURRENCY="${WEB_CONCURRENCY:-1}" +export CACHE_REDIS_HOST CACHE_REDIS_PORT CACHE_REDIS_DB + +redis-server --bind "$CACHE_REDIS_HOST" \ + --port "$CACHE_REDIS_PORT" \ + --protected-mode yes \ + --maxclients 10000 \ + --timeout 0 & + +until redis-cli -h "$CACHE_REDIS_HOST" -p "$CACHE_REDIS_PORT" ping >/dev/null 2>&1; do + sleep 1 +done + +uvicorn policyengine_api.asgi:app \ + --host 0.0.0.0 \ + --port "$PORT" \ + --workers "$WEB_CONCURRENCY" \ + --proxy-headers \ + --forwarded-allow-ips '*' & + +trap "pkill -P $$; exit 1" INT TERM + +wait From 193554f88dbbbab8e1d9d0c19b900d8dd55f16f9 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Mon, 1 Jun 2026 22:18:35 +0200 Subject: [PATCH 02/22] Add Cloud Run candidate deployment helpers --- .github/scripts/build_cloud_run_image.sh | 31 +++++ .github/scripts/cloud_run_env.sh | 68 ++++++++++ .github/scripts/deploy_cloud_run_candidate.sh | 47 +++++++ .github/scripts/get_cloud_run_tag_url.sh | 31 +++++ .../scripts/validate_cloud_run_deploy_env.sh | 26 ++++ tests/unit/test_cloud_run_deploy_scripts.py | 118 ++++++++++++++++++ 6 files changed, 321 insertions(+) create mode 100755 .github/scripts/build_cloud_run_image.sh create mode 100755 .github/scripts/cloud_run_env.sh create mode 100755 .github/scripts/deploy_cloud_run_candidate.sh create mode 100755 .github/scripts/get_cloud_run_tag_url.sh create mode 100755 
.github/scripts/validate_cloud_run_deploy_env.sh create mode 100644 tests/unit/test_cloud_run_deploy_scripts.py diff --git a/.github/scripts/build_cloud_run_image.sh b/.github/scripts/build_cloud_run_image.sh new file mode 100755 index 000000000..27eeb7345 --- /dev/null +++ b/.github/scripts/build_cloud_run_image.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +set -euo pipefail + +source .github/scripts/cloud_run_env.sh +cloud_run_set_defaults + +if [[ "${CLOUD_RUN_DRY_RUN:-0}" == "1" ]]; then + cloud_run_run gcloud services enable run.googleapis.com artifactregistry.googleapis.com cloudbuild.googleapis.com --project "${CLOUD_RUN_PROJECT}" + cloud_run_run gcloud artifacts repositories describe "${CLOUD_RUN_ARTIFACT_REPOSITORY}" --project "${CLOUD_RUN_PROJECT}" --location "${CLOUD_RUN_REGION}" + cloud_run_run gcloud artifacts repositories create "${CLOUD_RUN_ARTIFACT_REPOSITORY}" --repository-format docker --location "${CLOUD_RUN_REGION}" --description "Docker repository for PolicyEngine API Cloud Run" + cloud_run_run gcloud auth configure-docker "${CLOUD_RUN_REGION}-docker.pkg.dev" --quiet + cloud_run_run docker build -f gcp/cloud_run/Dockerfile -t "${CLOUD_RUN_IMAGE_URI}" . + cloud_run_run docker push "${CLOUD_RUN_IMAGE_URI}" + exit 0 +fi + +gcloud services enable run.googleapis.com artifactregistry.googleapis.com cloudbuild.googleapis.com --project "${CLOUD_RUN_PROJECT}" + +if ! gcloud artifacts repositories describe "${CLOUD_RUN_ARTIFACT_REPOSITORY}" \ + --project "${CLOUD_RUN_PROJECT}" \ + --location "${CLOUD_RUN_REGION}" >/dev/null 2>&1; then + gcloud artifacts repositories create "${CLOUD_RUN_ARTIFACT_REPOSITORY}" \ + --repository-format docker \ + --location "${CLOUD_RUN_REGION}" \ + --description "Docker repository for PolicyEngine API Cloud Run" +fi + +gcloud auth configure-docker "${CLOUD_RUN_REGION}-docker.pkg.dev" --quiet +docker build -f gcp/cloud_run/Dockerfile -t "${CLOUD_RUN_IMAGE_URI}" . 
+docker push "${CLOUD_RUN_IMAGE_URI}" diff --git a/.github/scripts/cloud_run_env.sh b/.github/scripts/cloud_run_env.sh new file mode 100755 index 000000000..bcefe97aa --- /dev/null +++ b/.github/scripts/cloud_run_env.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash + +cloud_run_set_defaults() { + CLOUD_RUN_PROJECT="${CLOUD_RUN_PROJECT:-policyengine-api}" + CLOUD_RUN_REGION="${CLOUD_RUN_REGION:-us-central1}" + CLOUD_RUN_SERVICE="${CLOUD_RUN_SERVICE:-policyengine-api}" + CLOUD_RUN_ARTIFACT_REPOSITORY="${CLOUD_RUN_ARTIFACT_REPOSITORY:-policyengine-api}" + CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT="${CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT:-github-deployment@policyengine-api.iam.gserviceaccount.com}" + CLOUD_RUN_CLOUD_SQL_INSTANCE="${CLOUD_RUN_CLOUD_SQL_INSTANCE:-policyengine-api:us-central1:policyengine-api-data}" + CLOUD_RUN_CPU="${CLOUD_RUN_CPU:-4}" + CLOUD_RUN_MEMORY="${CLOUD_RUN_MEMORY:-16Gi}" + CLOUD_RUN_TIMEOUT="${CLOUD_RUN_TIMEOUT:-300}" + CLOUD_RUN_MIN_INSTANCES="${CLOUD_RUN_MIN_INSTANCES:-0}" + CLOUD_RUN_MAX_INSTANCES="${CLOUD_RUN_MAX_INSTANCES:-1}" + CLOUD_RUN_PORT="${CLOUD_RUN_PORT:-8080}" + + local sha + sha="${GITHUB_SHA:-local}" + CLOUD_RUN_IMAGE_TAG="${CLOUD_RUN_IMAGE_TAG:-${sha}}" + CLOUD_RUN_IMAGE_URI="${CLOUD_RUN_IMAGE_URI:-${CLOUD_RUN_REGION}-docker.pkg.dev/${CLOUD_RUN_PROJECT}/${CLOUD_RUN_ARTIFACT_REPOSITORY}/${CLOUD_RUN_SERVICE}:${CLOUD_RUN_IMAGE_TAG}}" + + local short_sha + short_sha="${sha:0:7}" + CLOUD_RUN_TAG="${CLOUD_RUN_TAG:-stage3-${GITHUB_RUN_NUMBER:-local}-${short_sha}}" + + export CLOUD_RUN_PROJECT + export CLOUD_RUN_REGION + export CLOUD_RUN_SERVICE + export CLOUD_RUN_ARTIFACT_REPOSITORY + export CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT + export CLOUD_RUN_CLOUD_SQL_INSTANCE + export CLOUD_RUN_CPU + export CLOUD_RUN_MEMORY + export CLOUD_RUN_TIMEOUT + export CLOUD_RUN_MIN_INSTANCES + export CLOUD_RUN_MAX_INSTANCES + export CLOUD_RUN_PORT + export CLOUD_RUN_IMAGE_TAG + export CLOUD_RUN_IMAGE_URI + export CLOUD_RUN_TAG +} + +cloud_run_require_env() { + local 
missing=() + local name + + for name in "$@"; do + if [[ -z "${!name:-}" ]]; then + missing+=("${name}") + fi + done + + if (( ${#missing[@]} > 0 )); then + echo "Missing required Cloud Run deployment configuration: ${missing[*]}" >&2 + return 1 + fi +} + +cloud_run_run() { + if [[ "${CLOUD_RUN_DRY_RUN:-0}" == "1" ]]; then + printf '+' + printf ' %q' "$@" + printf '\n' + return 0 + fi + + "$@" +} diff --git a/.github/scripts/deploy_cloud_run_candidate.sh b/.github/scripts/deploy_cloud_run_candidate.sh new file mode 100755 index 000000000..4bacf2ece --- /dev/null +++ b/.github/scripts/deploy_cloud_run_candidate.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash + +set -euo pipefail + +source .github/scripts/cloud_run_env.sh +cloud_run_set_defaults + +bash .github/scripts/validate_cloud_run_deploy_env.sh + +env_vars=( + "POLICYENGINE_DB_PASSWORD=${POLICYENGINE_DB_PASSWORD}" + "POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN=${POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN}" + "ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}" + "OPENAI_API_KEY=${OPENAI_API_KEY}" + "HUGGING_FACE_TOKEN=${HUGGING_FACE_TOKEN}" + "SIMULATION_API_URL=${SIMULATION_API_URL}" + "GATEWAY_AUTH_REQUIRED=1" + "GATEWAY_AUTH_ISSUER=${GATEWAY_AUTH_ISSUER}" + "GATEWAY_AUTH_AUDIENCE=${GATEWAY_AUTH_AUDIENCE}" + "GATEWAY_AUTH_CLIENT_ID=${GATEWAY_AUTH_CLIENT_ID}" + "GATEWAY_AUTH_CLIENT_SECRET_RESOURCE=${GATEWAY_AUTH_CLIENT_SECRET_RESOURCE}" + "API_HOST_BACKEND=cloud_run" + "SIM_FRONT_DOOR=old_gateway_direct" + "SIM_COMPUTE_ECONOMY=old_gateway" + "CLOUD_RUN_REVISION_TAG=${CLOUD_RUN_TAG}" +) + +set_env_vars="$(IFS='|'; echo "^|^${env_vars[*]}")" + +cloud_run_run gcloud run deploy "${CLOUD_RUN_SERVICE}" \ + --project "${CLOUD_RUN_PROJECT}" \ + --region "${CLOUD_RUN_REGION}" \ + --platform managed \ + --image "${CLOUD_RUN_IMAGE_URI}" \ + --tag "${CLOUD_RUN_TAG}" \ + --no-traffic \ + --allow-unauthenticated \ + --execution-environment gen2 \ + --service-account "${CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT}" \ + --add-cloudsql-instances 
"${CLOUD_RUN_CLOUD_SQL_INSTANCE}" \ + --port "${CLOUD_RUN_PORT}" \ + --cpu "${CLOUD_RUN_CPU}" \ + --memory "${CLOUD_RUN_MEMORY}" \ + --timeout "${CLOUD_RUN_TIMEOUT}" \ + --min-instances "${CLOUD_RUN_MIN_INSTANCES}" \ + --max-instances "${CLOUD_RUN_MAX_INSTANCES}" \ + --set-env-vars "${set_env_vars}" diff --git a/.github/scripts/get_cloud_run_tag_url.sh b/.github/scripts/get_cloud_run_tag_url.sh new file mode 100755 index 000000000..e91d91462 --- /dev/null +++ b/.github/scripts/get_cloud_run_tag_url.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +set -euo pipefail + +source .github/scripts/cloud_run_env.sh +cloud_run_set_defaults + +if [[ "${CLOUD_RUN_DRY_RUN:-0}" == "1" ]]; then + echo "https://${CLOUD_RUN_TAG}---${CLOUD_RUN_SERVICE}-dry-run.a.run.app" + exit 0 +fi + +gcloud run services describe "${CLOUD_RUN_SERVICE}" \ + --project "${CLOUD_RUN_PROJECT}" \ + --region "${CLOUD_RUN_REGION}" \ + --platform managed \ + --format json | python -c ' +import json +import os +import sys + +service = json.load(sys.stdin) +tag = os.environ["CLOUD_RUN_TAG"] +for traffic_target in service.get("status", {}).get("traffic", []): + if traffic_target.get("tag") == tag and traffic_target.get("url"): + print(traffic_target["url"]) + raise SystemExit(0) + +print(f"Failed to determine Cloud Run URL for tag {tag}", file=sys.stderr) +raise SystemExit(1) +' diff --git a/.github/scripts/validate_cloud_run_deploy_env.sh b/.github/scripts/validate_cloud_run_deploy_env.sh new file mode 100755 index 000000000..ab1141c37 --- /dev/null +++ b/.github/scripts/validate_cloud_run_deploy_env.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +set -euo pipefail + +source .github/scripts/cloud_run_env.sh +cloud_run_set_defaults + +cloud_run_require_env \ + CLOUD_RUN_PROJECT \ + CLOUD_RUN_REGION \ + CLOUD_RUN_SERVICE \ + CLOUD_RUN_ARTIFACT_REPOSITORY \ + CLOUD_RUN_IMAGE_URI \ + CLOUD_RUN_TAG \ + CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT \ + CLOUD_RUN_CLOUD_SQL_INSTANCE \ + POLICYENGINE_DB_PASSWORD \ + 
POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN \ + ANTHROPIC_API_KEY \ + OPENAI_API_KEY \ + HUGGING_FACE_TOKEN \ + SIMULATION_API_URL \ + GATEWAY_AUTH_ISSUER \ + GATEWAY_AUTH_AUDIENCE \ + GATEWAY_AUTH_CLIENT_ID \ + GATEWAY_AUTH_CLIENT_SECRET_RESOURCE diff --git a/tests/unit/test_cloud_run_deploy_scripts.py b/tests/unit/test_cloud_run_deploy_scripts.py new file mode 100644 index 000000000..0ca3f0f52 --- /dev/null +++ b/tests/unit/test_cloud_run_deploy_scripts.py @@ -0,0 +1,118 @@ +from __future__ import annotations + +import os +import subprocess +from pathlib import Path + + +REPO = Path(__file__).resolve().parents[2] + + +def _script_env(**overrides: str) -> dict[str, str]: + env = { + "HOME": os.environ.get("HOME", ""), + "PATH": os.environ["PATH"], + "CLOUD_RUN_DRY_RUN": "1", + } + env.update(overrides) + return env + + +def _required_runtime_env() -> dict[str, str]: + return { + "POLICYENGINE_DB_PASSWORD": "db-password", + "POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN": "github-token", + "ANTHROPIC_API_KEY": "anthropic-key", + "OPENAI_API_KEY": "openai-key", + "HUGGING_FACE_TOKEN": "hf-token", + "SIMULATION_API_URL": "https://simulation.example.test", + "GATEWAY_AUTH_ISSUER": "https://issuer.example.test", + "GATEWAY_AUTH_AUDIENCE": "simulation-gateway", + "GATEWAY_AUTH_CLIENT_ID": "client-id", + "GATEWAY_AUTH_CLIENT_SECRET_RESOURCE": ( + "projects/policyengine-api/secrets/gateway-client-secret/versions/latest" + ), + } + + +def _run_script(path: str, env: dict[str, str]) -> subprocess.CompletedProcess[str]: + return subprocess.run( + ["bash", path], + cwd=REPO, + env=env, + text=True, + capture_output=True, + check=False, + ) + + +def test_cloud_run_startup_uses_asgi_entrypoint(): + start_script = (REPO / "gcp/cloud_run/start.sh").read_text(encoding="utf-8") + + assert "policyengine_api.asgi:app" in start_script + assert "policyengine_api.api" not in start_script + + +def test_validate_cloud_run_deploy_env_reports_missing_runtime_config(): + result = _run_script( + 
".github/scripts/validate_cloud_run_deploy_env.sh", + _script_env(), + ) + + assert result.returncode == 1 + assert "Missing required Cloud Run deployment configuration" in result.stderr + assert "POLICYENGINE_DB_PASSWORD" in result.stderr + assert "GATEWAY_AUTH_CLIENT_SECRET_RESOURCE" in result.stderr + + +def test_build_cloud_run_image_dry_run_uses_cloud_run_dockerfile(): + dockerignore = REPO / "gcp/cloud_run/Dockerfile.dockerignore" + + assert dockerignore.exists() + assert "policyengine_api/data/*.db" in dockerignore.read_text(encoding="utf-8") + + result = _run_script( + ".github/scripts/build_cloud_run_image.sh", + _script_env( + GITHUB_SHA="1234567890abcdef", + GITHUB_RUN_NUMBER="42", + ), + ) + + assert result.returncode == 0, result.stderr + assert "gcp/cloud_run/Dockerfile" in result.stdout + assert "docker push" in result.stdout + assert ( + "us-central1-docker.pkg.dev/policyengine-api/policyengine-api/" + "policyengine-api:1234567890abcdef" + ) in result.stdout + + +def test_deploy_cloud_run_candidate_dry_run_never_shifts_traffic(): + result = _run_script( + ".github/scripts/deploy_cloud_run_candidate.sh", + _script_env( + **_required_runtime_env(), + CLOUD_RUN_IMAGE_URI="us-central1-docker.pkg.dev/project/repo/api:sha", + CLOUD_RUN_TAG="stage3-test", + ), + ) + + assert result.returncode == 0, result.stderr + assert "gcloud run deploy" in result.stdout + assert "--no-traffic" in result.stdout + assert "stage3-test" in result.stdout + assert "--to-latest" not in result.stdout + assert "update-traffic" not in result.stdout + + +def test_get_cloud_run_tag_url_dry_run_uses_candidate_tag(): + result = _run_script( + ".github/scripts/get_cloud_run_tag_url.sh", + _script_env(CLOUD_RUN_TAG="stage3-test", CLOUD_RUN_SERVICE="policyengine-api"), + ) + + assert result.returncode == 0, result.stderr + assert result.stdout.strip() == ( + "https://stage3-test---policyengine-api-dry-run.a.run.app" + ) From 01e73e47d0447e7539ca93a81f808f5f55d57ba9 Mon Sep 17 00:00:00 
2001 From: Anthony Volk Date: Mon, 1 Jun 2026 22:19:02 +0200 Subject: [PATCH 03/22] Add Cloud Run candidate smoke tests --- tests/integration/test_cloud_run_candidate.py | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 tests/integration/test_cloud_run_candidate.py diff --git a/tests/integration/test_cloud_run_candidate.py b/tests/integration/test_cloud_run_candidate.py new file mode 100644 index 000000000..59482f812 --- /dev/null +++ b/tests/integration/test_cloud_run_candidate.py @@ -0,0 +1,57 @@ +import os + + +def test_cloud_run_candidate_health_routes(api_client): + health_response = api_client.get("/health") + assert health_response.status_code == 200, health_response.text + assert health_response.json() == {"status": "healthy"} + + liveness_response = api_client.get("/liveness-check") + assert liveness_response.status_code == 200, liveness_response.text + assert liveness_response.text == "OK" + + readiness_response = api_client.get("/readiness-check") + assert readiness_response.status_code == 200, readiness_response.text + assert readiness_response.text == "OK" + + +def test_cloud_run_candidate_metadata_policy_and_household( + api_client, + integration_probe_id, +): + metadata_response = api_client.get("/us/metadata") + assert metadata_response.status_code == 200, metadata_response.text + metadata = metadata_response.json()["result"] + current_law_id = metadata["current_law_id"] + + policy_response = api_client.get(f"/us/policy/{current_law_id}") + assert policy_response.status_code == 200, policy_response.text + policy_payload = policy_response.json() + assert policy_payload["status"] == "ok" + assert policy_payload["result"]["id"] == current_law_id + + household_id = os.environ.get("CLOUD_RUN_SMOKE_HOUSEHOLD_ID") or None + if household_id is None: + create_household_response = api_client.post( + "/us/household", + json={ + "label": f"cloud-run-smoke-{integration_probe_id}", + "data": { + "people": { + "you": { + "age": {"2026": 
40}, + } + } + }, + }, + ) + assert create_household_response.status_code == 201, ( + create_household_response.text + ) + household_id = str(create_household_response.json()["result"]["household_id"]) + + household_response = api_client.get(f"/us/household/{household_id}") + assert household_response.status_code == 200, household_response.text + household_payload = household_response.json() + assert household_payload["status"] == "ok" + assert str(household_payload["result"]["id"]) == household_id From 7e36565b93e2e6bb210f56ee118777b7f750da8d Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Mon, 1 Jun 2026 22:19:31 +0200 Subject: [PATCH 04/22] Wire Cloud Run candidate into CI --- .github/workflows/pr.yml | 10 ++++++ .github/workflows/push.yml | 64 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 72269106e..265c83363 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -61,6 +61,16 @@ jobs: password: ${{ secrets.GITHUB_TOKEN }} - name: Build container run: docker build -t ghcr.io/policyengine/policyengine docker + test_cloud_run_container_builds: + name: Cloud Run container + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: Build Cloud Run container + run: docker build -f gcp/cloud_run/Dockerfile -t policyengine-api-cloud-run:test . 
test_env_vars: name: Test environment variables runs-on: ubuntu-latest diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index cb5289d74..644f05a97 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -329,6 +329,70 @@ jobs: env: APP_ENGINE_VERSION: ${{ steps.version.outputs.version }} + deploy-cloud-run-candidate: + name: Deploy Cloud Run candidate + runs-on: ubuntu-latest + needs: deploy-production + if: | + (github.repository == 'PolicyEngine/policyengine-api') + && (github.event.head_commit.message == 'Update PolicyEngine API') + environment: production + permissions: + contents: read + id-token: write + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: GCP authentication + uses: "google-github-actions/auth@v2" + with: + workload_identity_provider: "${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}" + service_account: "${{ secrets.GCP_DEPLOY_SERVICE_ACCOUNT }}" + - name: Set up GCloud + uses: "google-github-actions/setup-gcloud@v2" + - name: Compute Cloud Run candidate metadata + id: cloud_run + run: | + echo "image_tag=${GITHUB_SHA}" >> "$GITHUB_OUTPUT" + echo "revision_tag=stage3-${GITHUB_RUN_NUMBER}-${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT" + - name: Build and push Cloud Run image + run: bash .github/scripts/build_cloud_run_image.sh + env: + CLOUD_RUN_IMAGE_TAG: ${{ steps.cloud_run.outputs.image_tag }} + CLOUD_RUN_TAG: ${{ steps.cloud_run.outputs.revision_tag }} + - name: Deploy tagged Cloud Run candidate + run: bash .github/scripts/deploy_cloud_run_candidate.sh + env: + CLOUD_RUN_IMAGE_TAG: ${{ steps.cloud_run.outputs.image_tag }} + CLOUD_RUN_TAG: ${{ steps.cloud_run.outputs.revision_tag }} + CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT: ${{ secrets.GCP_DEPLOY_SERVICE_ACCOUNT }} + POLICYENGINE_DB_PASSWORD: ${{ secrets.POLICYENGINE_DB_PASSWORD }} + POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN: ${{ secrets.POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + OPENAI_API_KEY: 
${{ secrets.OPENAI_API_KEY }} + HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} + SIMULATION_API_URL: ${{ secrets.SIMULATION_API_URL }} + GATEWAY_AUTH_ISSUER: ${{ secrets.GATEWAY_AUTH_ISSUER }} + GATEWAY_AUTH_AUDIENCE: ${{ secrets.GATEWAY_AUTH_AUDIENCE }} + GATEWAY_AUTH_CLIENT_ID: ${{ secrets.GATEWAY_AUTH_CLIENT_ID }} + GATEWAY_AUTH_CLIENT_SECRET_RESOURCE: ${{ secrets.GATEWAY_AUTH_CLIENT_SECRET_RESOURCE }} + - name: Resolve Cloud Run candidate URL + id: cloud_run_url + run: | + url="$(bash .github/scripts/get_cloud_run_tag_url.sh)" + echo "url=${url}" >> "$GITHUB_OUTPUT" + echo "Cloud Run candidate URL: ${url}" + env: + CLOUD_RUN_TAG: ${{ steps.cloud_run.outputs.revision_tag }} + - name: Install Cloud Run smoke test dependencies + run: pip install pytest httpx + - name: Run Cloud Run candidate smoke tests + run: python -m pytest tests/integration/test_cloud_run_candidate.py -v + env: + API_BASE_URL: ${{ steps.cloud_run_url.outputs.url }} + STAGING_API_TEST_PROBE_ID: cloud-run-${{ steps.cloud_run.outputs.revision_tag }} + CLOUD_RUN_SMOKE_HOUSEHOLD_ID: ${{ vars.CLOUD_RUN_SMOKE_HOUSEHOLD_ID }} + docker: name: Docker runs-on: ubuntu-latest From 97d11c8584f2c11eec367f9cfff8c29e4ec4195d Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Mon, 1 Jun 2026 22:20:00 +0200 Subject: [PATCH 05/22] Document Cloud Run candidate stage --- ...migration-pr3-cloud-run-candidate.added.md | 1 + docs/engineering/skills/testing.md | 15 ++++ ...gration-pr3-cloud-run-candidate-runbook.md | 73 +++++++++++++++++++ 3 files changed, 89 insertions(+) create mode 100644 changelog.d/migration-pr3-cloud-run-candidate.added.md create mode 100644 docs/migration-pr3-cloud-run-candidate-runbook.md diff --git a/changelog.d/migration-pr3-cloud-run-candidate.added.md b/changelog.d/migration-pr3-cloud-run-candidate.added.md new file mode 100644 index 000000000..4e77c96f0 --- /dev/null +++ b/changelog.d/migration-pr3-cloud-run-candidate.added.md @@ -0,0 +1 @@ +Added a no-traffic Cloud Run candidate 
deployment path for the FastAPI shell. diff --git a/docs/engineering/skills/testing.md b/docs/engineering/skills/testing.md index 83b4b21a9..f0c7fcf26 100644 --- a/docs/engineering/skills/testing.md +++ b/docs/engineering/skills/testing.md @@ -41,5 +41,20 @@ Regenerate and review `docs/engineering/generated/migration_contracts.md` when route inventory, migration registry flags, or v1 contract expectations change. FastAPI shell-only fallback changes should not change the route catalog. +For PR 3 Cloud Run candidate deployment changes, verify the command-building +guards, ASGI compatibility, and container build: + +```bash +python -m pytest tests/unit/test_cloud_run_deploy_scripts.py tests/unit/test_asgi_factory.py -q +docker build -f gcp/cloud_run/Dockerfile -t policyengine-api-cloud-run:test . +``` + +Live Cloud Run candidate checks must be explicit deployed probes. They require +`API_BASE_URL` and should not run as part of ordinary local test commands: + +```bash +API_BASE_URL=https://candidate-url python -m pytest tests/integration/test_cloud_run_candidate.py -v +``` + Run `ruff format --check` and `ruff check` on changed Python files before handoff. diff --git a/docs/migration-pr3-cloud-run-candidate-runbook.md b/docs/migration-pr3-cloud-run-candidate-runbook.md new file mode 100644 index 000000000..5cf561315 --- /dev/null +++ b/docs/migration-pr3-cloud-run-candidate-runbook.md @@ -0,0 +1,73 @@ +# PR 3 Cloud Run Candidate Runbook + +PR 3 adds a production-configured Cloud Run candidate for the FastAPI ASGI +shell. It does not move user traffic. + +## Included + +- Cloud Run Docker runtime for `policyengine_api.asgi:app`. +- Tagged no-traffic Cloud Run revisions deployed after App Engine production + promotion. +- Runtime environment configuration for existing Cloud SQL and the existing + simulation gateway. +- Smoke tests against the tagged Cloud Run URL. + +## Not Included + +- No public API host traffic shift. +- No Cloud Run traffic ramp. 
+- No native FastAPI route migration beyond `/health`. +- No Supabase, Alembic, SQLAlchemy model, or Modal compute migration. +- No App Engine retirement. + +## Resource Defaults + +- Project: `policyengine-api` +- Region: `us-central1` +- Service: `policyengine-api` +- Artifact Registry repository: `policyengine-api` +- Cloud SQL instance: `policyengine-api:us-central1:policyengine-api-data` +- Revision tag: `stage3-${GITHUB_RUN_NUMBER}-${GITHUB_SHA::7}` + +## Post-Merge Flow + +The `Push` workflow still deploys and promotes App Engine production first. Only +after that succeeds, it builds and deploys a Cloud Run revision with: + +```bash +gcloud run deploy policyengine-api \ + --tag "$CLOUD_RUN_TAG" \ + --no-traffic +``` + +The workflow then resolves the tagged URL and runs: + +```bash +python -m pytest tests/integration/test_cloud_run_candidate.py -v +``` + +Failure marks the deployment workflow red, but App Engine remains the production +traffic target. + +## Manual Smoke + +After GitHub Actions prints the candidate URL: + +```bash +curl -i "$CLOUD_RUN_CANDIDATE_URL/health" +curl -i "$CLOUD_RUN_CANDIDATE_URL/readiness-check" +curl -i "$CLOUD_RUN_CANDIDATE_URL/liveness-check" +curl -i "$CLOUD_RUN_CANDIDATE_URL/us/metadata" +``` + +Expected behavior: + +- `/health` returns FastAPI JSON: `{"status":"healthy"}`. +- `/readiness-check` and `/liveness-check` return existing Flask text `OK`. +- `/us/metadata` returns the existing v1 metadata contract from Cloud SQL. + +## Rollback + +No user traffic is routed to the Cloud Run candidate in this PR. If the candidate +fails, leave App Engine as production-primary and fix the Cloud Run deploy path +in a follow-up commit. 
From 38c78f7589056f131e801fc16b390ef6a5fe7188 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Mon, 1 Jun 2026 22:42:31 +0200 Subject: [PATCH 06/22] Keep Cloud Run candidate off production DB --- .github/scripts/cloud_run_env.sh | 14 +++++++- .github/scripts/deploy_cloud_run_candidate.sh | 3 ++ .../scripts/validate_cloud_run_deploy_env.sh | 2 ++ .github/workflows/push.yml | 9 +++-- docs/engineering/skills/testing.md | 5 ++- ...gration-pr3-cloud-run-candidate-runbook.md | 25 ++++++++------ policyengine_api/data/data.py | 27 +++++++++++---- tests/integration/test_cloud_run_candidate.py | 22 ++++-------- .../unit/data/test_remote_database_config.py | 34 +++++++++++++++++++ tests/unit/test_cloud_run_deploy_scripts.py | 21 ++++++++++++ 10 files changed, 125 insertions(+), 37 deletions(-) create mode 100644 tests/unit/data/test_remote_database_config.py diff --git a/.github/scripts/cloud_run_env.sh b/.github/scripts/cloud_run_env.sh index bcefe97aa..a17835c68 100755 --- a/.github/scripts/cloud_run_env.sh +++ b/.github/scripts/cloud_run_env.sh @@ -6,7 +6,6 @@ cloud_run_set_defaults() { CLOUD_RUN_SERVICE="${CLOUD_RUN_SERVICE:-policyengine-api}" CLOUD_RUN_ARTIFACT_REPOSITORY="${CLOUD_RUN_ARTIFACT_REPOSITORY:-policyengine-api}" CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT="${CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT:-github-deployment@policyengine-api.iam.gserviceaccount.com}" - CLOUD_RUN_CLOUD_SQL_INSTANCE="${CLOUD_RUN_CLOUD_SQL_INSTANCE:-policyengine-api:us-central1:policyengine-api-data}" CLOUD_RUN_CPU="${CLOUD_RUN_CPU:-4}" CLOUD_RUN_MEMORY="${CLOUD_RUN_MEMORY:-16Gi}" CLOUD_RUN_TIMEOUT="${CLOUD_RUN_TIMEOUT:-300}" @@ -40,6 +39,19 @@ cloud_run_set_defaults() { export CLOUD_RUN_TAG } +cloud_run_reject_production_db() { + local production_instance="policyengine-api:us-central1:policyengine-api-data" + if [[ "${CLOUD_RUN_CLOUD_SQL_INSTANCE:-}" == "${production_instance}" ]]; then + echo "Cloud Run candidate must not use the production Cloud SQL instance: ${production_instance}" >&2 + return 1 + 
fi + + if [[ "${POLICYENGINE_DB_INSTANCE_CONNECTION_NAME:-}" == "${production_instance}" ]]; then + echo "Cloud Run candidate must not use the production DB instance connection name: ${production_instance}" >&2 + return 1 + fi +} + cloud_run_require_env() { local missing=() local name diff --git a/.github/scripts/deploy_cloud_run_candidate.sh b/.github/scripts/deploy_cloud_run_candidate.sh index 4bacf2ece..989990160 100755 --- a/.github/scripts/deploy_cloud_run_candidate.sh +++ b/.github/scripts/deploy_cloud_run_candidate.sh @@ -8,6 +8,9 @@ cloud_run_set_defaults bash .github/scripts/validate_cloud_run_deploy_env.sh env_vars=( + "POLICYENGINE_DB_INSTANCE_CONNECTION_NAME=${CLOUD_RUN_CLOUD_SQL_INSTANCE}" + "POLICYENGINE_DB_USER=${POLICYENGINE_DB_USER:-policyengine}" + "POLICYENGINE_DB_NAME=${POLICYENGINE_DB_NAME:-policyengine}" "POLICYENGINE_DB_PASSWORD=${POLICYENGINE_DB_PASSWORD}" "POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN=${POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN}" "ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}" diff --git a/.github/scripts/validate_cloud_run_deploy_env.sh b/.github/scripts/validate_cloud_run_deploy_env.sh index ab1141c37..b74df14d7 100755 --- a/.github/scripts/validate_cloud_run_deploy_env.sh +++ b/.github/scripts/validate_cloud_run_deploy_env.sh @@ -24,3 +24,5 @@ cloud_run_require_env \ GATEWAY_AUTH_AUDIENCE \ GATEWAY_AUTH_CLIENT_ID \ GATEWAY_AUTH_CLIENT_SECRET_RESOURCE + +cloud_run_reject_production_db diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 644f05a97..0d5d9310a 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -332,11 +332,11 @@ jobs: deploy-cloud-run-candidate: name: Deploy Cloud Run candidate runs-on: ubuntu-latest - needs: deploy-production + needs: integration-tests-staging if: | (github.repository == 'PolicyEngine/policyengine-api') && (github.event.head_commit.message == 'Update PolicyEngine API') - environment: production + environment: staging permissions: contents: read id-token: 
write @@ -360,12 +360,17 @@ jobs: env: CLOUD_RUN_IMAGE_TAG: ${{ steps.cloud_run.outputs.image_tag }} CLOUD_RUN_TAG: ${{ steps.cloud_run.outputs.revision_tag }} + CLOUD_RUN_CLOUD_SQL_INSTANCE: ${{ vars.CLOUD_RUN_CLOUD_SQL_INSTANCE }} - name: Deploy tagged Cloud Run candidate run: bash .github/scripts/deploy_cloud_run_candidate.sh env: CLOUD_RUN_IMAGE_TAG: ${{ steps.cloud_run.outputs.image_tag }} CLOUD_RUN_TAG: ${{ steps.cloud_run.outputs.revision_tag }} + CLOUD_RUN_CLOUD_SQL_INSTANCE: ${{ vars.CLOUD_RUN_CLOUD_SQL_INSTANCE }} CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT: ${{ secrets.GCP_DEPLOY_SERVICE_ACCOUNT }} + POLICYENGINE_DB_INSTANCE_CONNECTION_NAME: ${{ vars.CLOUD_RUN_CLOUD_SQL_INSTANCE }} + POLICYENGINE_DB_USER: ${{ vars.POLICYENGINE_DB_USER }} + POLICYENGINE_DB_NAME: ${{ vars.POLICYENGINE_DB_NAME }} POLICYENGINE_DB_PASSWORD: ${{ secrets.POLICYENGINE_DB_PASSWORD }} POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN: ${{ secrets.POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} diff --git a/docs/engineering/skills/testing.md b/docs/engineering/skills/testing.md index f0c7fcf26..d29580831 100644 --- a/docs/engineering/skills/testing.md +++ b/docs/engineering/skills/testing.md @@ -50,7 +50,10 @@ docker build -f gcp/cloud_run/Dockerfile -t policyengine-api-cloud-run:test . ``` Live Cloud Run candidate checks must be explicit deployed probes. They require -`API_BASE_URL` and should not run as part of ordinary local test commands: +`API_BASE_URL` and `CLOUD_RUN_SMOKE_HOUSEHOLD_ID`, and should not run as part of +ordinary local test commands. 
`CLOUD_RUN_SMOKE_HOUSEHOLD_ID` must point to a +pre-existing non-production household fixture; smoke tests must not create +households or point at the production Cloud SQL instance: ```bash API_BASE_URL=https://candidate-url python -m pytest tests/integration/test_cloud_run_candidate.py -v diff --git a/docs/migration-pr3-cloud-run-candidate-runbook.md b/docs/migration-pr3-cloud-run-candidate-runbook.md index 5cf561315..dac822a7c 100644 --- a/docs/migration-pr3-cloud-run-candidate-runbook.md +++ b/docs/migration-pr3-cloud-run-candidate-runbook.md @@ -1,15 +1,15 @@ # PR 3 Cloud Run Candidate Runbook -PR 3 adds a production-configured Cloud Run candidate for the FastAPI ASGI -shell. It does not move user traffic. +PR 3 adds a Cloud Run candidate for the FastAPI ASGI shell. It uses staging +data-plane configuration and does not move user traffic. ## Included - Cloud Run Docker runtime for `policyengine_api.asgi:app`. -- Tagged no-traffic Cloud Run revisions deployed after App Engine production - promotion. -- Runtime environment configuration for existing Cloud SQL and the existing - simulation gateway. +- Tagged no-traffic Cloud Run revisions deployed after staging integration + tests pass. +- Runtime environment configuration for a non-production Cloud SQL instance and + the existing simulation gateway. - Smoke tests against the tagged Cloud Run URL. ## Not Included @@ -26,13 +26,14 @@ shell. It does not move user traffic. - Region: `us-central1` - Service: `policyengine-api` - Artifact Registry repository: `policyengine-api` -- Cloud SQL instance: `policyengine-api:us-central1:policyengine-api-data` +- Cloud SQL instance: supplied by staging `CLOUD_RUN_CLOUD_SQL_INSTANCE`; this + must not be `policyengine-api:us-central1:policyengine-api-data`. - Revision tag: `stage3-${GITHUB_RUN_NUMBER}-${GITHUB_SHA::7}` ## Post-Merge Flow -The `Push` workflow still deploys and promotes App Engine production first. 
Only -after that succeeds, it builds and deploys a Cloud Run revision with: +The `Push` workflow deploys and tests App Engine staging first. Only after +staging integration tests pass, it builds and deploys a Cloud Run revision with: ```bash gcloud run deploy policyengine-api \ @@ -47,7 +48,8 @@ python -m pytest tests/integration/test_cloud_run_candidate.py -v ``` Failure marks the deployment workflow red, but App Engine remains the production -traffic target. +traffic target. The Cloud Run candidate must use staging DB credentials and a +non-production Cloud SQL instance. ## Manual Smoke @@ -64,7 +66,8 @@ Expected behavior: - `/health` returns FastAPI JSON: `{"status":"healthy"}`. - `/readiness-check` and `/liveness-check` return existing Flask text `OK`. -- `/us/metadata` returns the existing v1 metadata contract from Cloud SQL. +- `/us/metadata` returns the existing v1 metadata contract from the + non-production Cloud SQL instance. ## Rollback diff --git a/policyengine_api/data/data.py b/policyengine_api/data/data.py index 6b16e713e..958021985 100644 --- a/policyengine_api/data/data.py +++ b/policyengine_api/data/data.py @@ -12,6 +12,23 @@ load_dotenv() +DEFAULT_REMOTE_DB_INSTANCE_CONNECTION_NAME = ( + "policyengine-api:us-central1:policyengine-api-data" +) +DEFAULT_REMOTE_DB_USER = "policyengine" +DEFAULT_REMOTE_DB_NAME = "policyengine" + + +def get_remote_database_config() -> dict[str, str]: + return { + "instance_connection_name": os.environ.get( + "POLICYENGINE_DB_INSTANCE_CONNECTION_NAME", + DEFAULT_REMOTE_DB_INSTANCE_CONNECTION_NAME, + ), + "db_user": os.environ.get("POLICYENGINE_DB_USER", DEFAULT_REMOTE_DB_USER), + "db_name": os.environ.get("POLICYENGINE_DB_NAME", DEFAULT_REMOTE_DB_NAME), + } + class _ResultProxy: """Lightweight wrapper that eagerly fetches results from a @@ -97,19 +114,17 @@ def __init__( self.initialize() def _create_pool(self): - instance_connection_name = "policyengine-api:us-central1:policyengine-api-data" + db_config = 
get_remote_database_config() self.connector = Connector() - db_user = "policyengine" db_pass = os.environ["POLICYENGINE_DB_PASSWORD"] if db_pass == ".dbpw": with open(".dbpw") as f: db_pass = f.read().strip() - db_name = "policyengine" conn = self.connector.connect( - instance_connection_string=instance_connection_name, + instance_connection_string=db_config["instance_connection_name"], driver="pymysql", - db=db_name, - user=db_user, + db=db_config["db_name"], + user=db_config["db_user"], password=db_pass, ) self.pool = sqlalchemy.create_engine( diff --git a/tests/integration/test_cloud_run_candidate.py b/tests/integration/test_cloud_run_candidate.py index 59482f812..91e687b62 100644 --- a/tests/integration/test_cloud_run_candidate.py +++ b/tests/integration/test_cloud_run_candidate.py @@ -1,5 +1,7 @@ import os +import pytest + def test_cloud_run_candidate_health_routes(api_client): health_response = api_client.get("/health") @@ -32,23 +34,11 @@ def test_cloud_run_candidate_metadata_policy_and_household( household_id = os.environ.get("CLOUD_RUN_SMOKE_HOUSEHOLD_ID") or None if household_id is None: - create_household_response = api_client.post( - "/us/household", - json={ - "label": f"cloud-run-smoke-{integration_probe_id}", - "data": { - "people": { - "you": { - "age": {"2026": 40}, - } - } - }, - }, - ) - assert create_household_response.status_code == 201, ( - create_household_response.text + pytest.fail( + "CLOUD_RUN_SMOKE_HOUSEHOLD_ID must be set to a pre-existing " + "non-production household fixture. Cloud Run smoke tests must not " + "create households." 
) - household_id = str(create_household_response.json()["result"]["household_id"]) household_response = api_client.get(f"/us/household/{household_id}") assert household_response.status_code == 200, household_response.text diff --git a/tests/unit/data/test_remote_database_config.py b/tests/unit/data/test_remote_database_config.py new file mode 100644 index 000000000..33ce245a4 --- /dev/null +++ b/tests/unit/data/test_remote_database_config.py @@ -0,0 +1,34 @@ +import os + +os.environ.setdefault("FLASK_DEBUG", "1") + +from policyengine_api.data.data import get_remote_database_config + + +def test_remote_database_config_defaults_to_current_production_values(monkeypatch): + monkeypatch.delenv("POLICYENGINE_DB_INSTANCE_CONNECTION_NAME", raising=False) + monkeypatch.delenv("POLICYENGINE_DB_USER", raising=False) + monkeypatch.delenv("POLICYENGINE_DB_NAME", raising=False) + + assert get_remote_database_config() == { + "instance_connection_name": "policyengine-api:us-central1:policyengine-api-data", + "db_user": "policyengine", + "db_name": "policyengine", + } + + +def test_remote_database_config_can_target_non_production_db(monkeypatch): + monkeypatch.setenv( + "POLICYENGINE_DB_INSTANCE_CONNECTION_NAME", + "policyengine-api-staging:us-central1:policyengine-api-data-staging", + ) + monkeypatch.setenv("POLICYENGINE_DB_USER", "policyengine_staging") + monkeypatch.setenv("POLICYENGINE_DB_NAME", "policyengine_staging") + + assert get_remote_database_config() == { + "instance_connection_name": ( + "policyengine-api-staging:us-central1:policyengine-api-data-staging" + ), + "db_user": "policyengine_staging", + "db_name": "policyengine_staging", + } diff --git a/tests/unit/test_cloud_run_deploy_scripts.py b/tests/unit/test_cloud_run_deploy_scripts.py index 0ca3f0f52..6d459445b 100644 --- a/tests/unit/test_cloud_run_deploy_scripts.py +++ b/tests/unit/test_cloud_run_deploy_scripts.py @@ -6,6 +6,7 @@ REPO = Path(__file__).resolve().parents[2] +PRODUCTION_CLOUD_SQL_INSTANCE = 
"policyengine-api:us-central1:policyengine-api-data" def _script_env(**overrides: str) -> dict[str, str]: @@ -20,6 +21,9 @@ def _script_env(**overrides: str) -> dict[str, str]: def _required_runtime_env() -> dict[str, str]: return { + "CLOUD_RUN_CLOUD_SQL_INSTANCE": ( + "policyengine-api-staging:us-central1:policyengine-api-data-staging" + ), "POLICYENGINE_DB_PASSWORD": "db-password", "POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN": "github-token", "ANTHROPIC_API_KEY": "anthropic-key", @@ -62,9 +66,23 @@ def test_validate_cloud_run_deploy_env_reports_missing_runtime_config(): assert result.returncode == 1 assert "Missing required Cloud Run deployment configuration" in result.stderr assert "POLICYENGINE_DB_PASSWORD" in result.stderr + assert "CLOUD_RUN_CLOUD_SQL_INSTANCE" in result.stderr assert "GATEWAY_AUTH_CLIENT_SECRET_RESOURCE" in result.stderr +def test_validate_cloud_run_deploy_env_rejects_production_db(): + env = _required_runtime_env() + env["CLOUD_RUN_CLOUD_SQL_INSTANCE"] = PRODUCTION_CLOUD_SQL_INSTANCE + + result = _run_script( + ".github/scripts/validate_cloud_run_deploy_env.sh", + _script_env(**env), + ) + + assert result.returncode == 1 + assert "must not use the production Cloud SQL instance" in result.stderr + + def test_build_cloud_run_image_dry_run_uses_cloud_run_dockerfile(): dockerignore = REPO / "gcp/cloud_run/Dockerfile.dockerignore" @@ -81,6 +99,7 @@ def test_build_cloud_run_image_dry_run_uses_cloud_run_dockerfile(): assert result.returncode == 0, result.stderr assert "gcp/cloud_run/Dockerfile" in result.stdout + assert PRODUCTION_CLOUD_SQL_INSTANCE not in result.stdout assert "docker push" in result.stdout assert ( "us-central1-docker.pkg.dev/policyengine-api/policyengine-api/" @@ -102,6 +121,8 @@ def test_deploy_cloud_run_candidate_dry_run_never_shifts_traffic(): assert "gcloud run deploy" in result.stdout assert "--no-traffic" in result.stdout assert "stage3-test" in result.stdout + assert "POLICYENGINE_DB_INSTANCE_CONNECTION_NAME=" in 
result.stdout + assert PRODUCTION_CLOUD_SQL_INSTANCE not in result.stdout assert "--to-latest" not in result.stdout assert "update-traffic" not in result.stdout From bc772f79c9b3d19432df2d8baadcd23ef76bf42f Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Mon, 1 Jun 2026 22:57:24 +0200 Subject: [PATCH 07/22] Add Cloud Run simulation gateway smoke probe --- .github/scripts/deploy_cloud_run_candidate.sh | 1 + ...gration-pr3-cloud-run-candidate-runbook.md | 6 ++- policyengine_api/asgi_factory.py | 46 ++++++++++++++++++- tests/integration/test_cloud_run_candidate.py | 9 ++++ tests/unit/test_asgi_factory.py | 44 ++++++++++++++++++ tests/unit/test_cloud_run_deploy_scripts.py | 1 + 6 files changed, 105 insertions(+), 2 deletions(-) diff --git a/.github/scripts/deploy_cloud_run_candidate.sh b/.github/scripts/deploy_cloud_run_candidate.sh index 989990160..b7641ee0f 100755 --- a/.github/scripts/deploy_cloud_run_candidate.sh +++ b/.github/scripts/deploy_cloud_run_candidate.sh @@ -22,6 +22,7 @@ env_vars=( "GATEWAY_AUTH_AUDIENCE=${GATEWAY_AUTH_AUDIENCE}" "GATEWAY_AUTH_CLIENT_ID=${GATEWAY_AUTH_CLIENT_ID}" "GATEWAY_AUTH_CLIENT_SECRET_RESOURCE=${GATEWAY_AUTH_CLIENT_SECRET_RESOURCE}" + "CLOUD_RUN_INTERNAL_PROBES=1" "API_HOST_BACKEND=cloud_run" "SIM_FRONT_DOOR=old_gateway_direct" "SIM_COMPUTE_ECONOMY=old_gateway" diff --git a/docs/migration-pr3-cloud-run-candidate-runbook.md b/docs/migration-pr3-cloud-run-candidate-runbook.md index dac822a7c..5764f2c66 100644 --- a/docs/migration-pr3-cloud-run-candidate-runbook.md +++ b/docs/migration-pr3-cloud-run-candidate-runbook.md @@ -10,7 +10,8 @@ data-plane configuration and does not move user traffic. tests pass. - Runtime environment configuration for a non-production Cloud SQL instance and the existing simulation gateway. -- Smoke tests against the tagged Cloud Run URL. +- Smoke tests against the tagged Cloud Run URL, including an internal + simulation-gateway health probe. 
## Not Included @@ -59,12 +60,15 @@ After GitHub Actions prints the candidate URL: curl -i "$CLOUD_RUN_CANDIDATE_URL/health" curl -i "$CLOUD_RUN_CANDIDATE_URL/readiness-check" curl -i "$CLOUD_RUN_CANDIDATE_URL/liveness-check" +curl -i "$CLOUD_RUN_CANDIDATE_URL/health/simulation-gateway" curl -i "$CLOUD_RUN_CANDIDATE_URL/us/metadata" ``` Expected behavior: - `/health` returns FastAPI JSON: `{"status":"healthy"}`. +- `/health/simulation-gateway` returns FastAPI JSON confirming the existing + simulation gateway client can initialize and reach the gateway health check. - `/readiness-check` and `/liveness-check` return existing Flask text `OK`. - `/us/metadata` returns the existing v1 metadata contract from the non-production Cloud SQL instance. diff --git a/policyengine_api/asgi_factory.py b/policyengine_api/asgi_factory.py index b94838eaa..2fc0e3728 100644 --- a/policyengine_api/asgi_factory.py +++ b/policyengine_api/asgi_factory.py @@ -2,10 +2,11 @@ from __future__ import annotations +import os from typing import Literal from a2wsgi import WSGIMiddleware -from fastapi import FastAPI +from fastapi import FastAPI, HTTPException from pydantic import BaseModel from policyengine_api.constants import VERSION @@ -15,6 +16,19 @@ class HealthResponse(BaseModel): status: Literal["healthy"] +class SimulationGatewayHealthResponse(BaseModel): + status: Literal["healthy"] + simulation_gateway: Literal["healthy"] + + +def _internal_probes_enabled() -> bool: + return os.environ.get("CLOUD_RUN_INTERNAL_PROBES", "").lower() in { + "1", + "true", + "yes", + } + + def _add_vary_origin(response) -> None: vary = response.headers.get("Vary") if vary is None: @@ -48,5 +62,35 @@ async def add_cors_for_native_routes(request, call_next): def health() -> HealthResponse: return HealthResponse(status="healthy") + @app.get( + "/health/simulation-gateway", + response_model=SimulationGatewayHealthResponse, + include_in_schema=False, + ) + def simulation_gateway_health() -> 
SimulationGatewayHealthResponse: + if not _internal_probes_enabled(): + raise HTTPException(status_code=404, detail="Not found") + + from policyengine_api.libs.simulation_api_modal import SimulationAPIModal + + try: + gateway_healthy = SimulationAPIModal().health_check() + except Exception as error: + raise HTTPException( + status_code=503, + detail="Simulation gateway client initialization failed", + ) from error + + if not gateway_healthy: + raise HTTPException( + status_code=503, + detail="Simulation gateway health check failed", + ) + + return SimulationGatewayHealthResponse( + status="healthy", + simulation_gateway="healthy", + ) + app.mount("/", WSGIMiddleware(wsgi_app)) return app diff --git a/tests/integration/test_cloud_run_candidate.py b/tests/integration/test_cloud_run_candidate.py index 91e687b62..1ee4c618b 100644 --- a/tests/integration/test_cloud_run_candidate.py +++ b/tests/integration/test_cloud_run_candidate.py @@ -16,6 +16,15 @@ def test_cloud_run_candidate_health_routes(api_client): assert readiness_response.status_code == 200, readiness_response.text assert readiness_response.text == "OK" + simulation_gateway_response = api_client.get("/health/simulation-gateway") + assert simulation_gateway_response.status_code == 200, ( + simulation_gateway_response.text + ) + assert simulation_gateway_response.json() == { + "status": "healthy", + "simulation_gateway": "healthy", + } + def test_cloud_run_candidate_metadata_policy_and_household( api_client, diff --git a/tests/unit/test_asgi_factory.py b/tests/unit/test_asgi_factory.py index 015b62ae7..c767be913 100644 --- a/tests/unit/test_asgi_factory.py +++ b/tests/unit/test_asgi_factory.py @@ -1,6 +1,7 @@ import importlib import json import sys +from unittest.mock import patch import pytest from fastapi.testclient import TestClient @@ -161,6 +162,49 @@ def test_health_route_uses_same_reflected_cors_policy(): assert response.headers["vary"] == "Origin" +def 
test_simulation_gateway_health_probe_is_disabled_by_default(monkeypatch): + monkeypatch.delenv("CLOUD_RUN_INTERNAL_PROBES", raising=False) + client = TestClient(create_asgi_app(create_test_wsgi_app())) + + response = client.get("/health/simulation-gateway") + + assert response.status_code == 404 + + +def test_simulation_gateway_health_probe_checks_gateway(monkeypatch): + monkeypatch.setenv("CLOUD_RUN_INTERNAL_PROBES", "1") + client = TestClient(create_asgi_app(create_test_wsgi_app())) + + with patch( + "policyengine_api.libs.simulation_api_modal.SimulationAPIModal" + ) as simulation_api: + simulation_api.return_value.health_check.return_value = True + + response = client.get("/health/simulation-gateway") + + assert response.status_code == 200 + assert response.json() == { + "status": "healthy", + "simulation_gateway": "healthy", + } + simulation_api.assert_called_once_with() + simulation_api.return_value.health_check.assert_called_once_with() + + +def test_simulation_gateway_health_probe_reports_failure(monkeypatch): + monkeypatch.setenv("CLOUD_RUN_INTERNAL_PROBES", "1") + client = TestClient(create_asgi_app(create_test_wsgi_app())) + + with patch( + "policyengine_api.libs.simulation_api_modal.SimulationAPIModal" + ) as simulation_api: + simulation_api.return_value.health_check.return_value = False + + response = client.get("/health/simulation-gateway") + + assert response.status_code == 503 + + def test_existing_health_and_specification_paths_fall_back_to_flask(): client = TestClient(create_asgi_app(create_test_wsgi_app())) diff --git a/tests/unit/test_cloud_run_deploy_scripts.py b/tests/unit/test_cloud_run_deploy_scripts.py index 6d459445b..41d8be320 100644 --- a/tests/unit/test_cloud_run_deploy_scripts.py +++ b/tests/unit/test_cloud_run_deploy_scripts.py @@ -122,6 +122,7 @@ def test_deploy_cloud_run_candidate_dry_run_never_shifts_traffic(): assert "--no-traffic" in result.stdout assert "stage3-test" in result.stdout assert 
"POLICYENGINE_DB_INSTANCE_CONNECTION_NAME=" in result.stdout + assert "CLOUD_RUN_INTERNAL_PROBES=1" in result.stdout assert PRODUCTION_CLOUD_SQL_INSTANCE not in result.stdout assert "--to-latest" not in result.stdout assert "update-traffic" not in result.stdout From e2088a978b4390904c68bb911278e44ca97a7074 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 2 Jun 2026 15:54:29 +0200 Subject: [PATCH 08/22] Connect Cloud Run candidate to production DB --- .github/scripts/cloud_run_env.sh | 14 +---------- .../scripts/validate_cloud_run_deploy_env.sh | 2 -- .github/workflows/push.yml | 7 ++---- docs/engineering/skills/testing.md | 4 +-- ...gration-pr3-cloud-run-candidate-runbook.md | 23 ++++++++--------- tests/integration/test_cloud_run_candidate.py | 5 ++-- tests/unit/test_cloud_run_deploy_scripts.py | 25 ++++--------------- 7 files changed, 22 insertions(+), 58 deletions(-) diff --git a/.github/scripts/cloud_run_env.sh b/.github/scripts/cloud_run_env.sh index a17835c68..bcefe97aa 100755 --- a/.github/scripts/cloud_run_env.sh +++ b/.github/scripts/cloud_run_env.sh @@ -6,6 +6,7 @@ cloud_run_set_defaults() { CLOUD_RUN_SERVICE="${CLOUD_RUN_SERVICE:-policyengine-api}" CLOUD_RUN_ARTIFACT_REPOSITORY="${CLOUD_RUN_ARTIFACT_REPOSITORY:-policyengine-api}" CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT="${CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT:-github-deployment@policyengine-api.iam.gserviceaccount.com}" + CLOUD_RUN_CLOUD_SQL_INSTANCE="${CLOUD_RUN_CLOUD_SQL_INSTANCE:-policyengine-api:us-central1:policyengine-api-data}" CLOUD_RUN_CPU="${CLOUD_RUN_CPU:-4}" CLOUD_RUN_MEMORY="${CLOUD_RUN_MEMORY:-16Gi}" CLOUD_RUN_TIMEOUT="${CLOUD_RUN_TIMEOUT:-300}" @@ -39,19 +40,6 @@ cloud_run_set_defaults() { export CLOUD_RUN_TAG } -cloud_run_reject_production_db() { - local production_instance="policyengine-api:us-central1:policyengine-api-data" - if [[ "${CLOUD_RUN_CLOUD_SQL_INSTANCE:-}" == "${production_instance}" ]]; then - echo "Cloud Run candidate must not use the production Cloud SQL instance: 
${production_instance}" >&2 - return 1 - fi - - if [[ "${POLICYENGINE_DB_INSTANCE_CONNECTION_NAME:-}" == "${production_instance}" ]]; then - echo "Cloud Run candidate must not use the production DB instance connection name: ${production_instance}" >&2 - return 1 - fi -} - cloud_run_require_env() { local missing=() local name diff --git a/.github/scripts/validate_cloud_run_deploy_env.sh b/.github/scripts/validate_cloud_run_deploy_env.sh index b74df14d7..ab1141c37 100755 --- a/.github/scripts/validate_cloud_run_deploy_env.sh +++ b/.github/scripts/validate_cloud_run_deploy_env.sh @@ -24,5 +24,3 @@ cloud_run_require_env \ GATEWAY_AUTH_AUDIENCE \ GATEWAY_AUTH_CLIENT_ID \ GATEWAY_AUTH_CLIENT_SECRET_RESOURCE - -cloud_run_reject_production_db diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 0d5d9310a..6721d849d 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -332,11 +332,11 @@ jobs: deploy-cloud-run-candidate: name: Deploy Cloud Run candidate runs-on: ubuntu-latest - needs: integration-tests-staging + needs: deploy-production if: | (github.repository == 'PolicyEngine/policyengine-api') && (github.event.head_commit.message == 'Update PolicyEngine API') - environment: staging + environment: production permissions: contents: read id-token: write @@ -360,15 +360,12 @@ jobs: env: CLOUD_RUN_IMAGE_TAG: ${{ steps.cloud_run.outputs.image_tag }} CLOUD_RUN_TAG: ${{ steps.cloud_run.outputs.revision_tag }} - CLOUD_RUN_CLOUD_SQL_INSTANCE: ${{ vars.CLOUD_RUN_CLOUD_SQL_INSTANCE }} - name: Deploy tagged Cloud Run candidate run: bash .github/scripts/deploy_cloud_run_candidate.sh env: CLOUD_RUN_IMAGE_TAG: ${{ steps.cloud_run.outputs.image_tag }} CLOUD_RUN_TAG: ${{ steps.cloud_run.outputs.revision_tag }} - CLOUD_RUN_CLOUD_SQL_INSTANCE: ${{ vars.CLOUD_RUN_CLOUD_SQL_INSTANCE }} CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT: ${{ secrets.GCP_DEPLOY_SERVICE_ACCOUNT }} - POLICYENGINE_DB_INSTANCE_CONNECTION_NAME: ${{ vars.CLOUD_RUN_CLOUD_SQL_INSTANCE }} 
POLICYENGINE_DB_USER: ${{ vars.POLICYENGINE_DB_USER }} POLICYENGINE_DB_NAME: ${{ vars.POLICYENGINE_DB_NAME }} POLICYENGINE_DB_PASSWORD: ${{ secrets.POLICYENGINE_DB_PASSWORD }} diff --git a/docs/engineering/skills/testing.md b/docs/engineering/skills/testing.md index d29580831..6397c1683 100644 --- a/docs/engineering/skills/testing.md +++ b/docs/engineering/skills/testing.md @@ -52,8 +52,8 @@ docker build -f gcp/cloud_run/Dockerfile -t policyengine-api-cloud-run:test . Live Cloud Run candidate checks must be explicit deployed probes. They require `API_BASE_URL` and `CLOUD_RUN_SMOKE_HOUSEHOLD_ID`, and should not run as part of ordinary local test commands. `CLOUD_RUN_SMOKE_HOUSEHOLD_ID` must point to a -pre-existing non-production household fixture; smoke tests must not create -households or point at the production Cloud SQL instance: +pre-existing read-only household fixture; smoke tests must not create or update +households: ```bash API_BASE_URL=https://candidate-url python -m pytest tests/integration/test_cloud_run_candidate.py -v diff --git a/docs/migration-pr3-cloud-run-candidate-runbook.md b/docs/migration-pr3-cloud-run-candidate-runbook.md index 5764f2c66..61aaa2db8 100644 --- a/docs/migration-pr3-cloud-run-candidate-runbook.md +++ b/docs/migration-pr3-cloud-run-candidate-runbook.md @@ -1,14 +1,14 @@ # PR 3 Cloud Run Candidate Runbook -PR 3 adds a Cloud Run candidate for the FastAPI ASGI shell. It uses staging -data-plane configuration and does not move user traffic. +PR 3 adds a production-configured Cloud Run candidate for the FastAPI ASGI +shell. It does not move user traffic. ## Included - Cloud Run Docker runtime for `policyengine_api.asgi:app`. -- Tagged no-traffic Cloud Run revisions deployed after staging integration - tests pass. -- Runtime environment configuration for a non-production Cloud SQL instance and +- Tagged no-traffic Cloud Run revisions deployed after App Engine production + promotion. 
+- Runtime environment configuration for the production Cloud SQL instance and the existing simulation gateway. - Smoke tests against the tagged Cloud Run URL, including an internal simulation-gateway health probe. @@ -27,14 +27,13 @@ data-plane configuration and does not move user traffic. - Region: `us-central1` - Service: `policyengine-api` - Artifact Registry repository: `policyengine-api` -- Cloud SQL instance: supplied by staging `CLOUD_RUN_CLOUD_SQL_INSTANCE`; this - must not be `policyengine-api:us-central1:policyengine-api-data`. +- Cloud SQL instance: `policyengine-api:us-central1:policyengine-api-data` - Revision tag: `stage3-${GITHUB_RUN_NUMBER}-${GITHUB_SHA::7}` ## Post-Merge Flow -The `Push` workflow deploys and tests App Engine staging first. Only after -staging integration tests pass, it builds and deploys a Cloud Run revision with: +The `Push` workflow still deploys and promotes App Engine production first. Only +after that succeeds, it builds and deploys a Cloud Run revision with: ```bash gcloud run deploy policyengine-api \ @@ -49,8 +48,7 @@ python -m pytest tests/integration/test_cloud_run_candidate.py -v ``` Failure marks the deployment workflow red, but App Engine remains the production -traffic target. The Cloud Run candidate must use staging DB credentials and a -non-production Cloud SQL instance. +traffic target. Smoke tests against the candidate must be read-only. ## Manual Smoke @@ -70,8 +68,7 @@ Expected behavior: - `/health/simulation-gateway` returns FastAPI JSON confirming the existing simulation gateway client can initialize and reach the gateway health check. - `/readiness-check` and `/liveness-check` return existing Flask text `OK`. -- `/us/metadata` returns the existing v1 metadata contract from the - non-production Cloud SQL instance. +- `/us/metadata` returns the existing v1 metadata contract from Cloud SQL. 
## Rollback diff --git a/tests/integration/test_cloud_run_candidate.py b/tests/integration/test_cloud_run_candidate.py index 1ee4c618b..d8733493c 100644 --- a/tests/integration/test_cloud_run_candidate.py +++ b/tests/integration/test_cloud_run_candidate.py @@ -28,7 +28,6 @@ def test_cloud_run_candidate_health_routes(api_client): def test_cloud_run_candidate_metadata_policy_and_household( api_client, - integration_probe_id, ): metadata_response = api_client.get("/us/metadata") assert metadata_response.status_code == 200, metadata_response.text @@ -45,8 +44,8 @@ def test_cloud_run_candidate_metadata_policy_and_household( if household_id is None: pytest.fail( "CLOUD_RUN_SMOKE_HOUSEHOLD_ID must be set to a pre-existing " - "non-production household fixture. Cloud Run smoke tests must not " - "create households." + "read-only household fixture. Cloud Run smoke tests must not " + "create or update households." ) household_response = api_client.get(f"/us/household/{household_id}") diff --git a/tests/unit/test_cloud_run_deploy_scripts.py b/tests/unit/test_cloud_run_deploy_scripts.py index 41d8be320..cb85015a7 100644 --- a/tests/unit/test_cloud_run_deploy_scripts.py +++ b/tests/unit/test_cloud_run_deploy_scripts.py @@ -21,9 +21,6 @@ def _script_env(**overrides: str) -> dict[str, str]: def _required_runtime_env() -> dict[str, str]: return { - "CLOUD_RUN_CLOUD_SQL_INSTANCE": ( - "policyengine-api-staging:us-central1:policyengine-api-data-staging" - ), "POLICYENGINE_DB_PASSWORD": "db-password", "POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN": "github-token", "ANTHROPIC_API_KEY": "anthropic-key", @@ -66,23 +63,9 @@ def test_validate_cloud_run_deploy_env_reports_missing_runtime_config(): assert result.returncode == 1 assert "Missing required Cloud Run deployment configuration" in result.stderr assert "POLICYENGINE_DB_PASSWORD" in result.stderr - assert "CLOUD_RUN_CLOUD_SQL_INSTANCE" in result.stderr assert "GATEWAY_AUTH_CLIENT_SECRET_RESOURCE" in result.stderr -def 
test_validate_cloud_run_deploy_env_rejects_production_db(): - env = _required_runtime_env() - env["CLOUD_RUN_CLOUD_SQL_INSTANCE"] = PRODUCTION_CLOUD_SQL_INSTANCE - - result = _run_script( - ".github/scripts/validate_cloud_run_deploy_env.sh", - _script_env(**env), - ) - - assert result.returncode == 1 - assert "must not use the production Cloud SQL instance" in result.stderr - - def test_build_cloud_run_image_dry_run_uses_cloud_run_dockerfile(): dockerignore = REPO / "gcp/cloud_run/Dockerfile.dockerignore" @@ -99,7 +82,6 @@ def test_build_cloud_run_image_dry_run_uses_cloud_run_dockerfile(): assert result.returncode == 0, result.stderr assert "gcp/cloud_run/Dockerfile" in result.stdout - assert PRODUCTION_CLOUD_SQL_INSTANCE not in result.stdout assert "docker push" in result.stdout assert ( "us-central1-docker.pkg.dev/policyengine-api/policyengine-api/" @@ -121,9 +103,12 @@ def test_deploy_cloud_run_candidate_dry_run_never_shifts_traffic(): assert "gcloud run deploy" in result.stdout assert "--no-traffic" in result.stdout assert "stage3-test" in result.stdout - assert "POLICYENGINE_DB_INSTANCE_CONNECTION_NAME=" in result.stdout + assert f"--add-cloudsql-instances {PRODUCTION_CLOUD_SQL_INSTANCE}" in result.stdout + assert ( + f"POLICYENGINE_DB_INSTANCE_CONNECTION_NAME={PRODUCTION_CLOUD_SQL_INSTANCE}" + in result.stdout + ) assert "CLOUD_RUN_INTERNAL_PROBES=1" in result.stdout - assert PRODUCTION_CLOUD_SQL_INSTANCE not in result.stdout assert "--to-latest" not in result.stdout assert "update-traffic" not in result.stdout From 3cf9a6a254d1975e739aed3fcc66362cd33e39fd Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 2 Jun 2026 16:57:42 +0200 Subject: [PATCH 09/22] Run Cloud Run candidate through staging track --- .github/workflows/push.yml | 104 ++++++++++++++++-- docs/engineering/skills/testing.md | 14 ++- ...gration-pr3-cloud-run-candidate-runbook.md | 51 +++++++-- tests/unit/test_cloud_run_deploy_scripts.py | 61 ++++++++++ 4 files changed, 202 insertions(+), 28 
deletions(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 6721d849d..177da02a9 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -209,8 +209,73 @@ jobs: - name: Wait for staging version health run: bash .github/scripts/health_check.sh "${{ steps.version_url.outputs.url }}/readiness-check" + deploy-cloud-run-staging: + name: Deploy staging Cloud Run candidate + runs-on: ubuntu-latest + needs: + - ensure-staging-model-version-aligns-with-sim-api + - publish-git-tag + if: | + (github.repository == 'PolicyEngine/policyengine-api') + && (github.event.head_commit.message == 'Update PolicyEngine API') + environment: staging + permissions: + contents: read + id-token: write + outputs: + tag: ${{ steps.cloud_run.outputs.revision_tag }} + url: ${{ steps.cloud_run_url.outputs.url }} + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: GCP authentication + uses: "google-github-actions/auth@v2" + with: + workload_identity_provider: "${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}" + service_account: "${{ secrets.GCP_DEPLOY_SERVICE_ACCOUNT }}" + - name: Set up GCloud + uses: "google-github-actions/setup-gcloud@v2" + - name: Compute Cloud Run staging metadata + id: cloud_run + run: | + echo "image_tag=${GITHUB_SHA}" >> "$GITHUB_OUTPUT" + echo "revision_tag=stage3-staging-${GITHUB_RUN_NUMBER}-${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT" + - name: Build and push Cloud Run image + run: bash .github/scripts/build_cloud_run_image.sh + env: + CLOUD_RUN_IMAGE_TAG: ${{ steps.cloud_run.outputs.image_tag }} + CLOUD_RUN_TAG: ${{ steps.cloud_run.outputs.revision_tag }} + - name: Deploy tagged Cloud Run staging candidate + run: bash .github/scripts/deploy_cloud_run_candidate.sh + env: + CLOUD_RUN_IMAGE_TAG: ${{ steps.cloud_run.outputs.image_tag }} + CLOUD_RUN_TAG: ${{ steps.cloud_run.outputs.revision_tag }} + CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT: ${{ secrets.GCP_DEPLOY_SERVICE_ACCOUNT }} + POLICYENGINE_DB_USER: ${{ 
vars.POLICYENGINE_DB_USER }} + POLICYENGINE_DB_NAME: ${{ vars.POLICYENGINE_DB_NAME }} + POLICYENGINE_DB_PASSWORD: ${{ secrets.POLICYENGINE_DB_PASSWORD }} + POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN: ${{ secrets.POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} + SIMULATION_API_URL: ${{ secrets.SIMULATION_API_URL }} + GATEWAY_AUTH_ISSUER: ${{ secrets.GATEWAY_AUTH_ISSUER }} + GATEWAY_AUTH_AUDIENCE: ${{ secrets.GATEWAY_AUTH_AUDIENCE }} + GATEWAY_AUTH_CLIENT_ID: ${{ secrets.GATEWAY_AUTH_CLIENT_ID }} + GATEWAY_AUTH_CLIENT_SECRET_RESOURCE: ${{ secrets.GATEWAY_AUTH_CLIENT_SECRET_RESOURCE }} + - name: Resolve Cloud Run staging URL + id: cloud_run_url + run: | + url="$(bash .github/scripts/get_cloud_run_tag_url.sh)" + echo "url=${url}" >> "$GITHUB_OUTPUT" + echo "Cloud Run staging URL: ${url}" + env: + CLOUD_RUN_TAG: ${{ steps.cloud_run.outputs.revision_tag }} + - name: Wait for Cloud Run staging health + run: bash .github/scripts/health_check.sh "${{ steps.cloud_run_url.outputs.url }}/readiness-check" + integration-tests-staging: - name: Run staging integration tests + name: Run App Engine staging integration tests runs-on: ubuntu-latest needs: deploy-staging if: | @@ -231,10 +296,34 @@ jobs: API_BASE_URL: ${{ needs.deploy-staging.outputs.url }} STAGING_API_TEST_PROBE_ID: ${{ needs.deploy-staging.outputs.version }} + integration-tests-staging-cloud-run: + name: Run Cloud Run staging integration tests + runs-on: ubuntu-latest + needs: deploy-cloud-run-staging + if: | + (github.repository == 'PolicyEngine/policyengine-api') + && (github.event.head_commit.message == 'Update PolicyEngine API') + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Install staging test dependencies + run: pip install pytest httpx + - name: Run 
staging smoke test + run: python -m pytest tests/integration/test_live_calculate.py tests/integration/test_live_economy.py tests/integration/test_live_budget_window_cache.py -v + env: + API_BASE_URL: ${{ needs.deploy-cloud-run-staging.outputs.url }} + STAGING_API_TEST_PROBE_ID: cloud-run-${{ needs.deploy-cloud-run-staging.outputs.tag }} + ensure-production-model-version-aligns-with-sim-api: name: Ensure production model version aligns with simulation API runs-on: ubuntu-latest - needs: integration-tests-staging + needs: + - integration-tests-staging + - integration-tests-staging-cloud-run if: | (github.repository == 'PolicyEngine/policyengine-api') && (github.event.head_commit.message == 'Update PolicyEngine API') @@ -330,9 +419,9 @@ jobs: APP_ENGINE_VERSION: ${{ steps.version.outputs.version }} deploy-cloud-run-candidate: - name: Deploy Cloud Run candidate + name: Deploy production Cloud Run candidate runs-on: ubuntu-latest - needs: deploy-production + needs: ensure-production-model-version-aligns-with-sim-api if: | (github.repository == 'PolicyEngine/policyengine-api') && (github.event.head_commit.message == 'Update PolicyEngine API') @@ -354,12 +443,7 @@ jobs: id: cloud_run run: | echo "image_tag=${GITHUB_SHA}" >> "$GITHUB_OUTPUT" - echo "revision_tag=stage3-${GITHUB_RUN_NUMBER}-${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT" - - name: Build and push Cloud Run image - run: bash .github/scripts/build_cloud_run_image.sh - env: - CLOUD_RUN_IMAGE_TAG: ${{ steps.cloud_run.outputs.image_tag }} - CLOUD_RUN_TAG: ${{ steps.cloud_run.outputs.revision_tag }} + echo "revision_tag=stage3-prod-${GITHUB_RUN_NUMBER}-${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT" - name: Deploy tagged Cloud Run candidate run: bash .github/scripts/deploy_cloud_run_candidate.sh env: diff --git a/docs/engineering/skills/testing.md b/docs/engineering/skills/testing.md index 6397c1683..36b743b75 100644 --- a/docs/engineering/skills/testing.md +++ b/docs/engineering/skills/testing.md @@ -42,18 +42,20 @@ route inventory, 
migration registry flags, or v1 contract expectations change. FastAPI shell-only fallback changes should not change the route catalog. For PR 3 Cloud Run candidate deployment changes, verify the command-building -guards, ASGI compatibility, and container build: +guards, workflow track structure, ASGI compatibility, and container build: ```bash python -m pytest tests/unit/test_cloud_run_deploy_scripts.py tests/unit/test_asgi_factory.py -q docker build -f gcp/cloud_run/Dockerfile -t policyengine-api-cloud-run:test . ``` -Live Cloud Run candidate checks must be explicit deployed probes. They require -`API_BASE_URL` and `CLOUD_RUN_SMOKE_HOUSEHOLD_ID`, and should not run as part of -ordinary local test commands. `CLOUD_RUN_SMOKE_HOUSEHOLD_ID` must point to a -pre-existing read-only household fixture; smoke tests must not create or update -households: +Staging deployment checks should run the same live integration suite against +both the App Engine staging URL and the tagged Cloud Run staging URL. Live Cloud +Run candidate checks must be explicit deployed probes. Production candidate +smoke tests require `API_BASE_URL` and `CLOUD_RUN_SMOKE_HOUSEHOLD_ID`, and +should not run as part of ordinary local test commands. +`CLOUD_RUN_SMOKE_HOUSEHOLD_ID` must point to a pre-existing read-only household +fixture; smoke tests must not create or update households: ```bash API_BASE_URL=https://candidate-url python -m pytest tests/integration/test_cloud_run_candidate.py -v diff --git a/docs/migration-pr3-cloud-run-candidate-runbook.md b/docs/migration-pr3-cloud-run-candidate-runbook.md index 61aaa2db8..3ab6b3638 100644 --- a/docs/migration-pr3-cloud-run-candidate-runbook.md +++ b/docs/migration-pr3-cloud-run-candidate-runbook.md @@ -6,12 +6,14 @@ shell. It does not move user traffic. ## Included - Cloud Run Docker runtime for `policyengine_api.asgi:app`. -- Tagged no-traffic Cloud Run revisions deployed after App Engine production - promotion. 
+- Tagged no-traffic Cloud Run revisions deployed on both the staging and + production tracks. - Runtime environment configuration for the production Cloud SQL instance and the existing simulation gateway. -- Smoke tests against the tagged Cloud Run URL, including an internal - simulation-gateway health probe. +- The same live staging integration suite against both the App Engine staging + URL and the tagged Cloud Run staging URL. +- Production smoke tests against the tagged Cloud Run URL, including an + internal simulation-gateway health probe. ## Not Included @@ -28,12 +30,35 @@ shell. It does not move user traffic. - Service: `policyengine-api` - Artifact Registry repository: `policyengine-api` - Cloud SQL instance: `policyengine-api:us-central1:policyengine-api-data` -- Revision tag: `stage3-${GITHUB_RUN_NUMBER}-${GITHUB_SHA::7}` +- Staging revision tag: `stage3-staging-${GITHUB_RUN_NUMBER}-${GITHUB_SHA::7}` +- Production revision tag: `stage3-prod-${GITHUB_RUN_NUMBER}-${GITHUB_SHA::7}` ## Post-Merge Flow -The `Push` workflow still deploys and promotes App Engine production first. Only -after that succeeds, it builds and deploys a Cloud Run revision with: +The `Push` workflow now uses two deployment tracks. + +Staging: + +1. Deploy an App Engine staging version. +2. Build and deploy a tagged Cloud Run staging revision with no traffic. +3. Run the same live integration suite against both URLs in parallel: + +```bash +python -m pytest \ + tests/integration/test_live_calculate.py \ + tests/integration/test_live_economy.py \ + tests/integration/test_live_budget_window_cache.py \ + -v +``` + +Production: + +1. After both staging integration jobs pass, run the production model-version + alignment check. +2. Deploy/promote the App Engine production version. +3. Deploy a tagged Cloud Run production revision with no traffic. 
+ +The Cloud Run deploy command still uses: ```bash gcloud run deploy policyengine-api \ @@ -41,14 +66,15 @@ gcloud run deploy policyengine-api \ --no-traffic ``` -The workflow then resolves the tagged URL and runs: +The production Cloud Run job resolves the tagged URL and runs: ```bash python -m pytest tests/integration/test_cloud_run_candidate.py -v ``` Failure marks the deployment workflow red, but App Engine remains the production -traffic target. Smoke tests against the candidate must be read-only. +traffic target because Cloud Run is not assigned traffic and the public URL is +not migrated. Smoke tests against the production candidate must be read-only. ## Manual Smoke @@ -72,6 +98,7 @@ Expected behavior: ## Rollback -No user traffic is routed to the Cloud Run candidate in this PR. If the candidate -fails, leave App Engine as production-primary and fix the Cloud Run deploy path -in a follow-up commit. +No user traffic is routed to the Cloud Run candidate in this PR. If the staging +Cloud Run track fails, production deployment is blocked. If the production Cloud +Run candidate fails, leave App Engine as production-primary and fix the Cloud Run +deploy path in a follow-up commit. 
diff --git a/tests/unit/test_cloud_run_deploy_scripts.py b/tests/unit/test_cloud_run_deploy_scripts.py index cb85015a7..5a2587f6a 100644 --- a/tests/unit/test_cloud_run_deploy_scripts.py +++ b/tests/unit/test_cloud_run_deploy_scripts.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import re import subprocess from pathlib import Path @@ -47,6 +48,20 @@ def _run_script(path: str, env: dict[str, str]) -> subprocess.CompletedProcess[s ) +def _push_workflow() -> str: + return (REPO / ".github/workflows/push.yml").read_text(encoding="utf-8") + + +def _workflow_job_block(workflow: str, job_name: str) -> str: + match = re.search( + rf"^ {re.escape(job_name)}:\n(?P<body>.*?)(?=^ [a-zA-Z0-9_-]+:|\Z)", + workflow, + flags=re.MULTILINE | re.DOTALL, + ) + assert match is not None, f"Missing workflow job {job_name}" + return match.group("body") + + def test_cloud_run_startup_uses_asgi_entrypoint(): start_script = (REPO / "gcp/cloud_run/start.sh").read_text(encoding="utf-8") @@ -123,3 +138,49 @@ def test_get_cloud_run_tag_url_dry_run_uses_candidate_tag(): assert result.stdout.strip() == ( "https://stage3-test---policyengine-api-dry-run.a.run.app" ) + + +def test_push_workflow_tests_app_engine_and_cloud_run_staging_tracks(): + workflow = _push_workflow() + app_engine_tests = _workflow_job_block(workflow, "integration-tests-staging") + cloud_run_tests = _workflow_job_block( + workflow, + "integration-tests-staging-cloud-run", + ) + production_gate = _workflow_job_block( + workflow, + "ensure-production-model-version-aligns-with-sim-api", + ) + live_test_command = ( + "python -m pytest tests/integration/test_live_calculate.py " + "tests/integration/test_live_economy.py " + "tests/integration/test_live_budget_window_cache.py -v" + ) + + assert live_test_command in app_engine_tests + assert live_test_command in cloud_run_tests + assert "API_BASE_URL: ${{ needs.deploy-staging.outputs.url }}" in app_engine_tests + assert ( + "API_BASE_URL: ${{ 
needs.deploy-cloud-run-staging.outputs.url }}" + in cloud_run_tests + ) + assert "- integration-tests-staging" in production_gate + assert "- integration-tests-staging-cloud-run" in production_gate + + +def test_push_workflow_deploys_production_tracks_in_parallel(): + workflow = _push_workflow() + app_engine_production = _workflow_job_block(workflow, "deploy-production") + cloud_run_production = _workflow_job_block(workflow, "deploy-cloud-run-candidate") + + assert ( + "needs: ensure-production-model-version-aligns-with-sim-api" + in app_engine_production + ) + assert ( + "needs: ensure-production-model-version-aligns-with-sim-api" + in cloud_run_production + ) + assert "needs: deploy-production" not in cloud_run_production + assert "stage3-prod-" in cloud_run_production + assert "Build and push Cloud Run image" not in cloud_run_production From 7038d79b2c8c3b632cf65a4fa52003a3c3f85f17 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 2 Jun 2026 17:56:19 +0200 Subject: [PATCH 10/22] Promote tested Cloud Run candidates --- .github/scripts/get_cloud_run_service_url.sh | 17 ++++++ .github/scripts/promote_cloud_run_tag.sh | 18 +++++++ .github/workflows/push.yml | 53 ++++++++++++++++++- docs/engineering/skills/testing.md | 11 ++-- ...gration-pr3-cloud-run-candidate-runbook.md | 39 ++++++++++---- tests/unit/test_cloud_run_deploy_scripts.py | 44 ++++++++++++++- 6 files changed, 165 insertions(+), 17 deletions(-) create mode 100644 .github/scripts/get_cloud_run_service_url.sh create mode 100644 .github/scripts/promote_cloud_run_tag.sh diff --git a/.github/scripts/get_cloud_run_service_url.sh b/.github/scripts/get_cloud_run_service_url.sh new file mode 100644 index 000000000..1dc193cc3 --- /dev/null +++ b/.github/scripts/get_cloud_run_service_url.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -euo pipefail + +source .github/scripts/cloud_run_env.sh +cloud_run_set_defaults + +if [[ "${CLOUD_RUN_DRY_RUN:-0}" == "1" ]]; then + echo 
"https://${CLOUD_RUN_SERVICE}-dry-run.a.run.app" + exit 0 +fi + +gcloud run services describe "${CLOUD_RUN_SERVICE}" \ + --project "${CLOUD_RUN_PROJECT}" \ + --region "${CLOUD_RUN_REGION}" \ + --platform managed \ + --format 'value(status.url)' diff --git a/.github/scripts/promote_cloud_run_tag.sh b/.github/scripts/promote_cloud_run_tag.sh new file mode 100644 index 000000000..751cc6b26 --- /dev/null +++ b/.github/scripts/promote_cloud_run_tag.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +set -euo pipefail + +source .github/scripts/cloud_run_env.sh +cloud_run_set_defaults + +cloud_run_require_env \ + CLOUD_RUN_PROJECT \ + CLOUD_RUN_REGION \ + CLOUD_RUN_SERVICE \ + CLOUD_RUN_TAG + +cloud_run_run gcloud run services update-traffic "${CLOUD_RUN_SERVICE}" \ + --project "${CLOUD_RUN_PROJECT}" \ + --region "${CLOUD_RUN_REGION}" \ + --platform managed \ + --to-tags "${CLOUD_RUN_TAG}=100" diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 177da02a9..60c6dc52c 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -318,12 +318,51 @@ jobs: API_BASE_URL: ${{ needs.deploy-cloud-run-staging.outputs.url }} STAGING_API_TEST_PROBE_ID: cloud-run-${{ needs.deploy-cloud-run-staging.outputs.tag }} + promote-cloud-run-staging: + name: Promote staging Cloud Run traffic + runs-on: ubuntu-latest + needs: + - deploy-cloud-run-staging + - integration-tests-staging + - integration-tests-staging-cloud-run + if: | + (github.repository == 'PolicyEngine/policyengine-api') + && (github.event.head_commit.message == 'Update PolicyEngine API') + environment: staging + permissions: + contents: read + id-token: write + outputs: + url: ${{ steps.cloud_run_service_url.outputs.url }} + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: GCP authentication + uses: "google-github-actions/auth@v2" + with: + workload_identity_provider: "${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}" + service_account: "${{ secrets.GCP_DEPLOY_SERVICE_ACCOUNT }}" + - 
name: Set up GCloud + uses: "google-github-actions/setup-gcloud@v2" + - name: Promote Cloud Run staging candidate + run: bash .github/scripts/promote_cloud_run_tag.sh + env: + CLOUD_RUN_TAG: ${{ needs.deploy-cloud-run-staging.outputs.tag }} + - name: Resolve Cloud Run staging service URL + id: cloud_run_service_url + run: | + url="$(bash .github/scripts/get_cloud_run_service_url.sh)" + echo "url=${url}" >> "$GITHUB_OUTPUT" + echo "Cloud Run staging service URL: ${url}" + - name: Wait for Cloud Run staging service health + run: bash .github/scripts/health_check.sh "${{ steps.cloud_run_service_url.outputs.url }}/readiness-check" + ensure-production-model-version-aligns-with-sim-api: name: Ensure production model version aligns with simulation API runs-on: ubuntu-latest needs: - integration-tests-staging - - integration-tests-staging-cloud-run + - promote-cloud-run-staging if: | (github.repository == 'PolicyEngine/policyengine-api') && (github.event.head_commit.message == 'Update PolicyEngine API') @@ -478,6 +517,18 @@ jobs: API_BASE_URL: ${{ steps.cloud_run_url.outputs.url }} STAGING_API_TEST_PROBE_ID: cloud-run-${{ steps.cloud_run.outputs.revision_tag }} CLOUD_RUN_SMOKE_HOUSEHOLD_ID: ${{ vars.CLOUD_RUN_SMOKE_HOUSEHOLD_ID }} + - name: Promote Cloud Run production candidate + run: bash .github/scripts/promote_cloud_run_tag.sh + env: + CLOUD_RUN_TAG: ${{ steps.cloud_run.outputs.revision_tag }} + - name: Resolve Cloud Run production service URL + id: cloud_run_service_url + run: | + url="$(bash .github/scripts/get_cloud_run_service_url.sh)" + echo "url=${url}" >> "$GITHUB_OUTPUT" + echo "Cloud Run production service URL: ${url}" + - name: Wait for Cloud Run production service health + run: bash .github/scripts/health_check.sh "${{ steps.cloud_run_service_url.outputs.url }}/readiness-check" docker: name: Docker diff --git a/docs/engineering/skills/testing.md b/docs/engineering/skills/testing.md index 36b743b75..40939ef1a 100644 --- a/docs/engineering/skills/testing.md 
+++ b/docs/engineering/skills/testing.md @@ -50,10 +50,13 @@ docker build -f gcp/cloud_run/Dockerfile -t policyengine-api-cloud-run:test . ``` Staging deployment checks should run the same live integration suite against -both the App Engine staging URL and the tagged Cloud Run staging URL. Live Cloud -Run candidate checks must be explicit deployed probes. Production candidate -smoke tests require `API_BASE_URL` and `CLOUD_RUN_SMOKE_HOUSEHOLD_ID`, and -should not run as part of ordinary local test commands. +both the App Engine staging URL and the tagged Cloud Run staging URL before +promoting the tested Cloud Run tag to the service URL. Production Cloud Run +promotion should happen only after tagged candidate smoke tests pass, and should +health-check the Cloud Run service URL after promotion. Live Cloud Run candidate +checks must be explicit deployed probes. Production candidate smoke tests +require `API_BASE_URL` and `CLOUD_RUN_SMOKE_HOUSEHOLD_ID`, and should not run as +part of ordinary local test commands. `CLOUD_RUN_SMOKE_HOUSEHOLD_ID` must point to a pre-existing read-only household fixture; smoke tests must not create or update households: diff --git a/docs/migration-pr3-cloud-run-candidate-runbook.md b/docs/migration-pr3-cloud-run-candidate-runbook.md index 3ab6b3638..28cc1dfbd 100644 --- a/docs/migration-pr3-cloud-run-candidate-runbook.md +++ b/docs/migration-pr3-cloud-run-candidate-runbook.md @@ -1,13 +1,14 @@ # PR 3 Cloud Run Candidate Runbook PR 3 adds a production-configured Cloud Run candidate for the FastAPI ASGI -shell. It does not move user traffic. +shell. It makes the Cloud Run service URL live after staged validation, but it +does not migrate the public App Engine/custom API URL. ## Included - Cloud Run Docker runtime for `policyengine_api.asgi:app`. - Tagged no-traffic Cloud Run revisions deployed on both the staging and - production tracks. + production tracks, then promoted to the Cloud Run service URL after tests. 
- Runtime environment configuration for the production Cloud SQL instance and the existing simulation gateway. - The same live staging integration suite against both the App Engine staging @@ -17,8 +18,9 @@ shell. It does not move user traffic. ## Not Included -- No public API host traffic shift. -- No Cloud Run traffic ramp. +- No public App Engine/custom API host traffic shift. +- No percent-based Cloud Run traffic ramp; the tested tag is promoted to 100% + of the Cloud Run service URL. - No native FastAPI route migration beyond `/health`. - No Supabase, Alembic, SQLAlchemy model, or Modal compute migration. - No App Engine retirement. @@ -50,6 +52,8 @@ python -m pytest \ tests/integration/test_live_budget_window_cache.py \ -v ``` +4. Promote the tested Cloud Run staging tag to 100% of the Cloud Run service + URL and health-check that service URL. Production: @@ -57,6 +61,9 @@ Production: alignment check. 2. Deploy/promote the App Engine production version. 3. Deploy a tagged Cloud Run production revision with no traffic. +4. Smoke-test the tagged Cloud Run production URL. +5. Promote the tested production tag to 100% of the Cloud Run service URL and + health-check that service URL. The Cloud Run deploy command still uses: @@ -72,9 +79,16 @@ The production Cloud Run job resolves the tagged URL and runs: python -m pytest tests/integration/test_cloud_run_candidate.py -v ``` -Failure marks the deployment workflow red, but App Engine remains the production -traffic target because Cloud Run is not assigned traffic and the public URL is -not migrated. Smoke tests against the production candidate must be read-only. +Then it assigns Cloud Run service traffic to the tested tag: + +```bash +gcloud run services update-traffic policyengine-api \ + --to-tags "$CLOUD_RUN_TAG=100" +``` + +Failure marks the deployment workflow red. App Engine remains the public +production traffic target because the public URL is not migrated to Cloud Run. 
+Smoke tests against the production candidate must be read-only. ## Manual Smoke @@ -98,7 +112,10 @@ Expected behavior: ## Rollback -No user traffic is routed to the Cloud Run candidate in this PR. If the staging -Cloud Run track fails, production deployment is blocked. If the production Cloud -Run candidate fails, leave App Engine as production-primary and fix the Cloud Run -deploy path in a follow-up commit. +The public App Engine/custom API URL is not routed to the Cloud Run candidate in +this PR. If the staging Cloud Run track fails, production deployment is blocked. +If the production Cloud Run candidate fails before promotion, leave App Engine +as production-primary and fix the Cloud Run deploy path in a follow-up commit. +If the production Cloud Run service URL is promoted and later regresses, deploy +a fixed tagged revision and promote that tag, or manually shift the Cloud Run +service URL back to a prior healthy revision. diff --git a/tests/unit/test_cloud_run_deploy_scripts.py b/tests/unit/test_cloud_run_deploy_scripts.py index 5a2587f6a..79736867c 100644 --- a/tests/unit/test_cloud_run_deploy_scripts.py +++ b/tests/unit/test_cloud_run_deploy_scripts.py @@ -140,6 +140,28 @@ def test_get_cloud_run_tag_url_dry_run_uses_candidate_tag(): ) +def test_get_cloud_run_service_url_dry_run_uses_service_url(): + result = _run_script( + ".github/scripts/get_cloud_run_service_url.sh", + _script_env(CLOUD_RUN_SERVICE="policyengine-api"), + ) + + assert result.returncode == 0, result.stderr + assert result.stdout.strip() == "https://policyengine-api-dry-run.a.run.app" + + +def test_promote_cloud_run_tag_dry_run_shifts_service_traffic_to_tag(): + result = _run_script( + ".github/scripts/promote_cloud_run_tag.sh", + _script_env(CLOUD_RUN_TAG="stage3-test", CLOUD_RUN_SERVICE="policyengine-api"), + ) + + assert result.returncode == 0, result.stderr + assert "gcloud run services update-traffic policyengine-api" in result.stdout + assert "--to-tags stage3-test=100" in 
result.stdout + assert "--to-latest" not in result.stdout + + def test_push_workflow_tests_app_engine_and_cloud_run_staging_tracks(): workflow = _push_workflow() app_engine_tests = _workflow_job_block(workflow, "integration-tests-staging") @@ -147,6 +169,7 @@ def test_push_workflow_tests_app_engine_and_cloud_run_staging_tracks(): workflow, "integration-tests-staging-cloud-run", ) + cloud_run_promotion = _workflow_job_block(workflow, "promote-cloud-run-staging") production_gate = _workflow_job_block( workflow, "ensure-production-model-version-aligns-with-sim-api", @@ -165,7 +188,12 @@ def test_push_workflow_tests_app_engine_and_cloud_run_staging_tracks(): in cloud_run_tests ) assert "- integration-tests-staging" in production_gate - assert "- integration-tests-staging-cloud-run" in production_gate + assert "- promote-cloud-run-staging" in production_gate + assert "- integration-tests-staging-cloud-run" not in production_gate + assert "- integration-tests-staging" in cloud_run_promotion + assert "- integration-tests-staging-cloud-run" in cloud_run_promotion + assert "bash .github/scripts/promote_cloud_run_tag.sh" in cloud_run_promotion + assert "bash .github/scripts/get_cloud_run_service_url.sh" in cloud_run_promotion def test_push_workflow_deploys_production_tracks_in_parallel(): @@ -184,3 +212,17 @@ def test_push_workflow_deploys_production_tracks_in_parallel(): assert "needs: deploy-production" not in cloud_run_production assert "stage3-prod-" in cloud_run_production assert "Build and push Cloud Run image" not in cloud_run_production + + +def test_push_workflow_promotes_production_cloud_run_after_candidate_smoke(): + workflow = _push_workflow() + cloud_run_production = _workflow_job_block(workflow, "deploy-cloud-run-candidate") + smoke_index = cloud_run_production.index( + "python -m pytest tests/integration/test_cloud_run_candidate.py -v" + ) + promote_index = cloud_run_production.index( + "bash .github/scripts/promote_cloud_run_tag.sh" + ) + + assert smoke_index 
< promote_index + assert "bash .github/scripts/get_cloud_run_service_url.sh" in cloud_run_production From 1c22eb8a37f19049706ba263c5d67f32fd9faae4 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 2 Jun 2026 19:39:31 +0200 Subject: [PATCH 11/22] Harden Cloud Run startup supervision --- docs/engineering/skills/testing.md | 6 ++ ...gration-pr3-cloud-run-candidate-runbook.md | 4 ++ gcp/cloud_run/Dockerfile | 2 +- gcp/cloud_run/start.sh | 66 +++++++++++++++++-- tests/unit/test_cloud_run_deploy_scripts.py | 38 +++++++++++ 5 files changed, 111 insertions(+), 5 deletions(-) diff --git a/docs/engineering/skills/testing.md b/docs/engineering/skills/testing.md index 40939ef1a..71ce0d56f 100644 --- a/docs/engineering/skills/testing.md +++ b/docs/engineering/skills/testing.md @@ -49,6 +49,12 @@ python -m pytest tests/unit/test_cloud_run_deploy_scripts.py tests/unit/test_asg docker build -f gcp/cloud_run/Dockerfile -t policyengine-api-cloud-run:test . ``` +If the Cloud Run container startup script changes, keep the script syntax and +child-process supervision assertions in `tests/unit/test_cloud_run_deploy_scripts.py` +updated. The tier 1 Redis path keeps Redis local to the container, so tests +should verify the bash entrypoint, explicit Redis/Uvicorn PID tracking, and +fail-fast behavior rather than any managed Redis integration. + Staging deployment checks should run the same live integration suite against both the App Engine staging URL and the tagged Cloud Run staging URL before promoting the tested Cloud Run tag to the service URL. Production Cloud Run diff --git a/docs/migration-pr3-cloud-run-candidate-runbook.md b/docs/migration-pr3-cloud-run-candidate-runbook.md index 28cc1dfbd..d02993dbc 100644 --- a/docs/migration-pr3-cloud-run-candidate-runbook.md +++ b/docs/migration-pr3-cloud-run-candidate-runbook.md @@ -15,6 +15,9 @@ does not migrate the public App Engine/custom API URL. URL and the tagged Cloud Run staging URL. 
- Production smoke tests against the tagged Cloud Run URL, including an internal simulation-gateway health probe. +- Tier 1 Cloud Run startup supervision: the container still runs local Redis, + but the bash startup script tracks Redis and Uvicorn child PIDs explicitly and + exits if either process dies. ## Not Included @@ -23,6 +26,7 @@ does not migrate the public App Engine/custom API URL. of the Cloud Run service URL. - No native FastAPI route migration beyond `/health`. - No Supabase, Alembic, SQLAlchemy model, or Modal compute migration. +- No managed Redis, Redis Memorystore, or API v2-alpha-style cache replacement. - No App Engine retirement. ## Resource Defaults diff --git a/gcp/cloud_run/Dockerfile b/gcp/cloud_run/Dockerfile index d2a9191f9..f2a38279c 100644 --- a/gcp/cloud_run/Dockerfile +++ b/gcp/cloud_run/Dockerfile @@ -19,4 +19,4 @@ ENV CACHE_REDIS_HOST=127.0.0.1 ENV CACHE_REDIS_PORT=6379 ENV CACHE_REDIS_DB=0 -CMD ["/bin/sh", "/app/start.sh"] +CMD ["/bin/bash", "/app/start.sh"] diff --git a/gcp/cloud_run/start.sh b/gcp/cloud_run/start.sh index 7fc4df859..35a27a455 100755 --- a/gcp/cloud_run/start.sh +++ b/gcp/cloud_run/start.sh @@ -1,20 +1,60 @@ -#!/bin/sh -set -eu +#!/usr/bin/env bash +set -euo pipefail PORT="${PORT:-8080}" CACHE_REDIS_HOST="${CACHE_REDIS_HOST:-127.0.0.1}" CACHE_REDIS_PORT="${CACHE_REDIS_PORT:-6379}" CACHE_REDIS_DB="${CACHE_REDIS_DB:-0}" WEB_CONCURRENCY="${WEB_CONCURRENCY:-1}" +REDIS_READY_MAX_ATTEMPTS="${REDIS_READY_MAX_ATTEMPTS:-30}" export CACHE_REDIS_HOST CACHE_REDIS_PORT CACHE_REDIS_DB +redis_pid="" +uvicorn_pid="" + +shutdown() { + trap - INT TERM + + if [ -n "$uvicorn_pid" ] && kill -0 "$uvicorn_pid" 2>/dev/null; then + kill "$uvicorn_pid" 2>/dev/null || true + fi + + if [ -n "$redis_pid" ] && kill -0 "$redis_pid" 2>/dev/null; then + kill "$redis_pid" 2>/dev/null || true + fi + + if [ -n "$uvicorn_pid" ]; then + wait "$uvicorn_pid" 2>/dev/null || true + fi + + if [ -n "$redis_pid" ]; then + wait "$redis_pid" 2>/dev/null || true 
+ fi +} + +trap 'shutdown; exit 143' INT TERM + redis-server --bind "$CACHE_REDIS_HOST" \ --port "$CACHE_REDIS_PORT" \ --protected-mode yes \ --maxclients 10000 \ --timeout 0 & +redis_pid="$!" +redis_ready_attempts=0 until redis-cli -h "$CACHE_REDIS_HOST" -p "$CACHE_REDIS_PORT" ping >/dev/null 2>&1; do + redis_ready_attempts=$((redis_ready_attempts + 1)) + if ! kill -0 "$redis_pid" 2>/dev/null; then + echo "Redis exited before becoming ready" >&2 + shutdown + exit 1 + fi + + if [ "$redis_ready_attempts" -ge "$REDIS_READY_MAX_ATTEMPTS" ]; then + echo "Redis did not become ready after $redis_ready_attempts attempts" >&2 + shutdown + exit 1 + fi sleep 1 done @@ -24,7 +64,25 @@ uvicorn policyengine_api.asgi:app \ --workers "$WEB_CONCURRENCY" \ --proxy-headers \ --forwarded-allow-ips '*' & +uvicorn_pid="$!" + +set +e +wait -n "$redis_pid" "$uvicorn_pid" +status="$?" +set -e + +if ! kill -0 "$redis_pid" 2>/dev/null; then + echo "Redis exited; stopping Cloud Run container" >&2 +elif ! kill -0 "$uvicorn_pid" 2>/dev/null; then + echo "Uvicorn exited; stopping Cloud Run container" >&2 +else + echo "A supervised Cloud Run process exited; stopping container" >&2 +fi + +shutdown -trap "pkill -P $$; exit 1" INT TERM +if [ "$status" -eq 0 ]; then + exit 1 +fi -wait +exit "$status" diff --git a/tests/unit/test_cloud_run_deploy_scripts.py b/tests/unit/test_cloud_run_deploy_scripts.py index 79736867c..5288eab2e 100644 --- a/tests/unit/test_cloud_run_deploy_scripts.py +++ b/tests/unit/test_cloud_run_deploy_scripts.py @@ -69,6 +69,44 @@ def test_cloud_run_startup_uses_asgi_entrypoint(): assert "policyengine_api.api" not in start_script +def test_cloud_run_startup_script_is_shell_syntax_valid(): + result = subprocess.run( + ["bash", "-n", "gcp/cloud_run/start.sh"], + cwd=REPO, + text=True, + capture_output=True, + check=False, + ) + + assert result.returncode == 0, result.stderr + + +def test_cloud_run_dockerfile_runs_startup_with_bash(): + dockerfile = (REPO / 
"gcp/cloud_run/Dockerfile").read_text(encoding="utf-8") + + assert 'CMD ["/bin/bash", "/app/start.sh"]' in dockerfile + assert 'CMD ["/bin/sh", "/app/start.sh"]' not in dockerfile + + +def test_cloud_run_startup_supervises_redis_and_uvicorn_children(): + start_script = (REPO / "gcp/cloud_run/start.sh").read_text(encoding="utf-8") + + assert "#!/usr/bin/env bash" in start_script + assert 'redis_pid="$!"' in start_script + assert 'uvicorn_pid="$!"' in start_script + assert "REDIS_READY_MAX_ATTEMPTS" in start_script + assert "Redis exited before becoming ready" in start_script + assert "Redis did not become ready" in start_script + assert "Redis exited; stopping Cloud Run container" in start_script + assert "Uvicorn exited; stopping Cloud Run container" in start_script + assert 'wait -n "$redis_pid" "$uvicorn_pid"' in start_script + assert 'kill -0 "$redis_pid"' in start_script + assert 'kill -0 "$uvicorn_pid"' in start_script + assert "trap 'shutdown; exit 143' INT TERM" in start_script + assert "pkill" not in start_script + assert re.search(r"(?m)^ *wait 2>/dev/null", start_script) is None + + def test_validate_cloud_run_deploy_env_reports_missing_runtime_config(): result = _run_script( ".github/scripts/validate_cloud_run_deploy_env.sh", From be1d6adadd8d145767cea57bff78d9cb362575cd Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 2 Jun 2026 22:17:54 +0200 Subject: [PATCH 12/22] Log FastAPI native migration requests --- policyengine_api/asgi_factory.py | 24 +++++++ policyengine_api/migration_flags.py | 7 ++- policyengine_api/migration_logging.py | 63 ++++++++++++------- .../routes/test_migration_context_logging.py | 55 ++++++++++++++++ tests/unit/test_migration_flags.py | 2 + 5 files changed, 129 insertions(+), 22 deletions(-) diff --git a/policyengine_api/asgi_factory.py b/policyengine_api/asgi_factory.py index 2fc0e3728..7fb6166cf 100644 --- a/policyengine_api/asgi_factory.py +++ b/policyengine_api/asgi_factory.py @@ -3,6 +3,8 @@ from __future__ import 
annotations import os +import time +import uuid from typing import Literal from a2wsgi import WSGIMiddleware @@ -10,6 +12,15 @@ from pydantic import BaseModel from policyengine_api.constants import VERSION +from policyengine_api.migration_logging import log_migration_request + + +FASTAPI_NATIVE_LOGGED_PATHS = frozenset( + { + "/health", + "/health/simulation-gateway", + } +) class HealthResponse(BaseModel): @@ -51,11 +62,24 @@ def create_asgi_app(wsgi_app) -> FastAPI: @app.middleware("http") async def add_cors_for_native_routes(request, call_next): + started_at = time.time() + request_id = request.headers.get("X-Request-ID") or uuid.uuid4().hex response = await call_next(request) origin = request.headers.get("origin") if origin and "access-control-allow-origin" not in response.headers: response.headers["Access-Control-Allow-Origin"] = origin _add_vary_origin(response) + if request.url.path in FASTAPI_NATIVE_LOGGED_PATHS: + try: + log_migration_request( + request_id=request_id, + method=request.method, + path=request.url.path, + status_code=response.status_code, + started_at=started_at, + ) + except Exception: + pass return response @app.get("/health", response_model=HealthResponse) diff --git a/policyengine_api/migration_flags.py b/policyengine_api/migration_flags.py index c67ddda70..0075ab9bf 100644 --- a/policyengine_api/migration_flags.py +++ b/policyengine_api/migration_flags.py @@ -69,7 +69,12 @@ def infer_route_group(path: str) -> str: """Infer a migration route group from a request path.""" if path in {"/", ""}: return "home" - if path in {"/liveness-check", "/readiness-check"}: + if path in { + "/health", + "/health/simulation-gateway", + "/liveness-check", + "/readiness-check", + }: return "health" if path == "/specification": return "specification" diff --git a/policyengine_api/migration_logging.py b/policyengine_api/migration_logging.py index 8bbb31dc5..5230502d8 100644 --- a/policyengine_api/migration_logging.py +++ 
b/policyengine_api/migration_logging.py @@ -27,27 +27,15 @@ def set_request_migration_context(): @app.after_request def log_request_migration_context(response): try: - route_group = infer_route_group(flask.request.path) - migration_context = get_migration_log_context(route_group) - elapsed_ms = None - started_at = getattr(flask.g, "request_started_at", None) - if started_at is not None: - elapsed_ms = round((time.time() - started_at) * 1000, 2) - - logger.log_struct( - { - "message": "API request served", - "request_id": getattr(flask.g, "request_id", None), - "method": flask.request.method, - "path": flask.request.path, - "status_code": response.status_code, - "latency_ms": elapsed_ms, - "country_id": flask.request.view_args.get("country_id") - if flask.request.view_args - else None, - "migration": migration_context, - }, - severity="INFO" if response.status_code < 500 else "ERROR", + log_migration_request( + request_id=getattr(flask.g, "request_id", None), + method=flask.request.method, + path=flask.request.path, + status_code=response.status_code, + started_at=getattr(flask.g, "request_started_at", None), + country_id=flask.request.view_args.get("country_id") + if flask.request.view_args + else None, ) except Exception: try: @@ -55,3 +43,36 @@ def log_request_migration_context(response): except Exception: pass return response + + +def log_migration_request( + *, + request_id: str | None, + method: str, + path: str, + status_code: int, + started_at: float | None, + country_id: str | None = None, +) -> None: + """Log a migration-aware API request in the shared structured format.""" + + elapsed_ms = None + if started_at is not None: + elapsed_ms = round((time.time() - started_at) * 1000, 2) + + route_group = infer_route_group(path) + migration_context = get_migration_log_context(route_group) + + logger.log_struct( + { + "message": "API request served", + "request_id": request_id, + "method": method, + "path": path, + "status_code": status_code, + "latency_ms": 
elapsed_ms, + "country_id": country_id, + "migration": migration_context, + }, + severity="INFO" if status_code < 500 else "ERROR", + ) diff --git a/tests/unit/routes/test_migration_context_logging.py b/tests/unit/routes/test_migration_context_logging.py index 204b284eb..29f32a72d 100644 --- a/tests/unit/routes/test_migration_context_logging.py +++ b/tests/unit/routes/test_migration_context_logging.py @@ -19,6 +19,17 @@ def readiness_check(): return app +def _app_without_migration_logging(): + app = Flask(__name__) + app.config["TESTING"] = True + + @app.route("/fallback") + def fallback(): + return Response("fallback", status=200, mimetype="text/plain") + + return app + + def test_request_logging_includes_migration_context(): with patch("policyengine_api.migration_logging.logger") as mock_logger: response = _app().test_client().get("/readiness-check") @@ -53,3 +64,47 @@ def test_request_logging_runs_for_asgi_fallback_routes(): log_payload = mock_logger.log_struct.call_args.args[0] assert log_payload["path"] == "/readiness-check" assert log_payload["migration"]["route_group"] == "health" + + +def test_request_logging_runs_for_fastapi_native_health_routes(monkeypatch): + monkeypatch.setenv("API_HOST_BACKEND", "cloud_run") + + with patch("policyengine_api.migration_logging.logger") as mock_logger: + response = TestClient(create_asgi_app(_app())).get( + "/health", + headers={"X-Request-ID": "request-123"}, + ) + + assert response.status_code == 200 + assert response.json() == {"status": "healthy"} + log_payload = mock_logger.log_struct.call_args.args[0] + assert log_payload["message"] == "API request served" + assert log_payload["request_id"] == "request-123" + assert log_payload["path"] == "/health" + assert log_payload["status_code"] == 200 + assert log_payload["country_id"] is None + assert log_payload["migration"]["route_group"] == "health" + assert log_payload["migration"]["api_host_backend"] == "cloud_run" + assert log_payload["migration"]["route_impl"] == 
"flask_fallback" + + +def test_fastapi_native_logging_failure_does_not_change_response(): + with patch( + "policyengine_api.migration_logging.logger.log_struct", + side_effect=RuntimeError("logging failed"), + ): + response = TestClient(create_asgi_app(_app())).get("/health") + + assert response.status_code == 200 + assert response.json() == {"status": "healthy"} + + +def test_asgi_shell_does_not_log_unregistered_flask_fallback_routes(): + with patch("policyengine_api.migration_logging.logger") as mock_logger: + response = TestClient(create_asgi_app(_app_without_migration_logging())).get( + "/fallback" + ) + + assert response.status_code == 200 + assert response.content == b"fallback" + mock_logger.log_struct.assert_not_called() diff --git a/tests/unit/test_migration_flags.py b/tests/unit/test_migration_flags.py index b44a4edf0..58f073337 100644 --- a/tests/unit/test_migration_flags.py +++ b/tests/unit/test_migration_flags.py @@ -57,6 +57,8 @@ def test_invalid_migration_flag_raises(monkeypatch): ("path", "expected_group"), [ ("/", "home"), + ("/health", "health"), + ("/health/simulation-gateway", "health"), ("/readiness-check", "health"), ("/us/metadata", "metadata"), ("/us/policy/1", "policy"), From 45e4d7d96c4a7d5e710233449c72ed87aeac7dfb Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 2 Jun 2026 23:00:42 +0200 Subject: [PATCH 13/22] Use dedicated Cloud Run runtime service account --- .github/scripts/cloud_run_env.sh | 2 +- .github/workflows/push.yml | 4 +-- ...gration-pr3-cloud-run-candidate-runbook.md | 19 +++++++++++++ tests/unit/test_cloud_run_deploy_scripts.py | 27 +++++++++++++++++++ 4 files changed, 49 insertions(+), 3 deletions(-) diff --git a/.github/scripts/cloud_run_env.sh b/.github/scripts/cloud_run_env.sh index bcefe97aa..cfcb62b7e 100755 --- a/.github/scripts/cloud_run_env.sh +++ b/.github/scripts/cloud_run_env.sh @@ -5,7 +5,7 @@ cloud_run_set_defaults() { CLOUD_RUN_REGION="${CLOUD_RUN_REGION:-us-central1}" 
CLOUD_RUN_SERVICE="${CLOUD_RUN_SERVICE:-policyengine-api}" CLOUD_RUN_ARTIFACT_REPOSITORY="${CLOUD_RUN_ARTIFACT_REPOSITORY:-policyengine-api}" - CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT="${CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT:-github-deployment@policyengine-api.iam.gserviceaccount.com}" + CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT="${CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT:-policyengine-api-cr-runtime@policyengine-api.iam.gserviceaccount.com}" CLOUD_RUN_CLOUD_SQL_INSTANCE="${CLOUD_RUN_CLOUD_SQL_INSTANCE:-policyengine-api:us-central1:policyengine-api-data}" CLOUD_RUN_CPU="${CLOUD_RUN_CPU:-4}" CLOUD_RUN_MEMORY="${CLOUD_RUN_MEMORY:-16Gi}" diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 60c6dc52c..6153c0324 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -250,7 +250,7 @@ jobs: env: CLOUD_RUN_IMAGE_TAG: ${{ steps.cloud_run.outputs.image_tag }} CLOUD_RUN_TAG: ${{ steps.cloud_run.outputs.revision_tag }} - CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT: ${{ secrets.GCP_DEPLOY_SERVICE_ACCOUNT }} + CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT: ${{ secrets.GCP_CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT }} POLICYENGINE_DB_USER: ${{ vars.POLICYENGINE_DB_USER }} POLICYENGINE_DB_NAME: ${{ vars.POLICYENGINE_DB_NAME }} POLICYENGINE_DB_PASSWORD: ${{ secrets.POLICYENGINE_DB_PASSWORD }} @@ -488,7 +488,7 @@ jobs: env: CLOUD_RUN_IMAGE_TAG: ${{ steps.cloud_run.outputs.image_tag }} CLOUD_RUN_TAG: ${{ steps.cloud_run.outputs.revision_tag }} - CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT: ${{ secrets.GCP_DEPLOY_SERVICE_ACCOUNT }} + CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT: ${{ secrets.GCP_CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT }} POLICYENGINE_DB_USER: ${{ vars.POLICYENGINE_DB_USER }} POLICYENGINE_DB_NAME: ${{ vars.POLICYENGINE_DB_NAME }} POLICYENGINE_DB_PASSWORD: ${{ secrets.POLICYENGINE_DB_PASSWORD }} diff --git a/docs/migration-pr3-cloud-run-candidate-runbook.md b/docs/migration-pr3-cloud-run-candidate-runbook.md index d02993dbc..fe70ae417 100644 --- a/docs/migration-pr3-cloud-run-candidate-runbook.md +++ 
b/docs/migration-pr3-cloud-run-candidate-runbook.md @@ -11,6 +11,8 @@ does not migrate the public App Engine/custom API URL. production tracks, then promoted to the Cloud Run service URL after tests. - Runtime environment configuration for the production Cloud SQL instance and the existing simulation gateway. +- A dedicated Cloud Run runtime service account, separate from the GitHub deploy + service account used to run `gcloud`. - The same live staging integration suite against both the App Engine staging URL and the tagged Cloud Run staging URL. - Production smoke tests against the tagged Cloud Run URL, including an @@ -35,10 +37,27 @@ does not migrate the public App Engine/custom API URL. - Region: `us-central1` - Service: `policyengine-api` - Artifact Registry repository: `policyengine-api` +- Cloud Run runtime service account: + `policyengine-api-cr-runtime@policyengine-api.iam.gserviceaccount.com` - Cloud SQL instance: `policyengine-api:us-central1:policyengine-api-data` - Staging revision tag: `stage3-staging-${GITHUB_RUN_NUMBER}-${GITHUB_SHA::7}` - Production revision tag: `stage3-prod-${GITHUB_RUN_NUMBER}-${GITHUB_SHA::7}` +## Required Runtime IAM + +GitHub Actions still authenticates as `${{ secrets.GCP_DEPLOY_SERVICE_ACCOUNT }}` +to deploy App Engine and Cloud Run. Cloud Run itself runs as +`${{ secrets.GCP_CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT }}`. + +The runtime service account must be: + +- granted Cloud SQL client access for + `policyengine-api:us-central1:policyengine-api-data`; +- allowed to read the Secret Manager secret referenced by + `GATEWAY_AUTH_CLIENT_SECRET_RESOURCE`; +- allowed as a service account user for the GitHub deploy service account, so the + workflow can deploy revisions using that runtime identity. + ## Post-Merge Flow The `Push` workflow now uses two deployment tracks. 
diff --git a/tests/unit/test_cloud_run_deploy_scripts.py b/tests/unit/test_cloud_run_deploy_scripts.py index 5288eab2e..f05c926be 100644 --- a/tests/unit/test_cloud_run_deploy_scripts.py +++ b/tests/unit/test_cloud_run_deploy_scripts.py @@ -8,6 +8,9 @@ REPO = Path(__file__).resolve().parents[2] PRODUCTION_CLOUD_SQL_INSTANCE = "policyengine-api:us-central1:policyengine-api-data" +DEDICATED_CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT = ( + "policyengine-api-cr-runtime@policyengine-api.iam.gserviceaccount.com" +) def _script_env(**overrides: str) -> dict[str, str]: @@ -156,6 +159,10 @@ def test_deploy_cloud_run_candidate_dry_run_never_shifts_traffic(): assert "gcloud run deploy" in result.stdout assert "--no-traffic" in result.stdout assert "stage3-test" in result.stdout + assert ( + f"--service-account {DEDICATED_CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT}" + in result.stdout + ) assert f"--add-cloudsql-instances {PRODUCTION_CLOUD_SQL_INSTANCE}" in result.stdout assert ( f"POLICYENGINE_DB_INSTANCE_CONNECTION_NAME={PRODUCTION_CLOUD_SQL_INSTANCE}" @@ -252,6 +259,26 @@ def test_push_workflow_deploys_production_tracks_in_parallel(): assert "Build and push Cloud Run image" not in cloud_run_production +def test_push_workflow_uses_dedicated_cloud_run_runtime_service_account(): + workflow = _push_workflow() + cloud_run_staging = _workflow_job_block(workflow, "deploy-cloud-run-staging") + cloud_run_production = _workflow_job_block(workflow, "deploy-cloud-run-candidate") + + runtime_account_secret = ( + "CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT: " + "${{ secrets.GCP_CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT }}" + ) + deploy_account_secret = ( + "CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT: " + "${{ secrets.GCP_DEPLOY_SERVICE_ACCOUNT }}" + ) + + assert runtime_account_secret in cloud_run_staging + assert runtime_account_secret in cloud_run_production + assert deploy_account_secret not in cloud_run_staging + assert deploy_account_secret not in cloud_run_production + + def 
test_push_workflow_promotes_production_cloud_run_after_candidate_smoke(): workflow = _push_workflow() cloud_run_production = _workflow_job_block(workflow, "deploy-cloud-run-candidate") From f81ac0e73f5bb98a53ae5bf6ce7e381a699e5c71 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 2 Jun 2026 23:19:07 +0200 Subject: [PATCH 14/22] Treat simulation gateway probe as public --- .github/scripts/deploy_cloud_run_candidate.sh | 1 - docs/migration-pr3-cloud-run-candidate-runbook.md | 4 ++-- policyengine_api/asgi_factory.py | 12 ------------ tests/unit/test_asgi_factory.py | 15 ++------------- tests/unit/test_cloud_run_deploy_scripts.py | 2 +- 5 files changed, 5 insertions(+), 29 deletions(-) diff --git a/.github/scripts/deploy_cloud_run_candidate.sh b/.github/scripts/deploy_cloud_run_candidate.sh index b7641ee0f..989990160 100755 --- a/.github/scripts/deploy_cloud_run_candidate.sh +++ b/.github/scripts/deploy_cloud_run_candidate.sh @@ -22,7 +22,6 @@ env_vars=( "GATEWAY_AUTH_AUDIENCE=${GATEWAY_AUTH_AUDIENCE}" "GATEWAY_AUTH_CLIENT_ID=${GATEWAY_AUTH_CLIENT_ID}" "GATEWAY_AUTH_CLIENT_SECRET_RESOURCE=${GATEWAY_AUTH_CLIENT_SECRET_RESOURCE}" - "CLOUD_RUN_INTERNAL_PROBES=1" "API_HOST_BACKEND=cloud_run" "SIM_FRONT_DOOR=old_gateway_direct" "SIM_COMPUTE_ECONOMY=old_gateway" diff --git a/docs/migration-pr3-cloud-run-candidate-runbook.md b/docs/migration-pr3-cloud-run-candidate-runbook.md index fe70ae417..bc84b25da 100644 --- a/docs/migration-pr3-cloud-run-candidate-runbook.md +++ b/docs/migration-pr3-cloud-run-candidate-runbook.md @@ -15,8 +15,8 @@ does not migrate the public App Engine/custom API URL. service account used to run `gcloud`. - The same live staging integration suite against both the App Engine staging URL and the tagged Cloud Run staging URL. -- Production smoke tests against the tagged Cloud Run URL, including an - internal simulation-gateway health probe. 
+- Production smoke tests against the tagged Cloud Run URL, including the public + simulation-gateway health probe. - Tier 1 Cloud Run startup supervision: the container still runs local Redis, but the bash startup script tracks Redis and Uvicorn child PIDs explicitly and exits if either process dies. diff --git a/policyengine_api/asgi_factory.py b/policyengine_api/asgi_factory.py index 7fb6166cf..3b81ee608 100644 --- a/policyengine_api/asgi_factory.py +++ b/policyengine_api/asgi_factory.py @@ -2,7 +2,6 @@ from __future__ import annotations -import os import time import uuid from typing import Literal @@ -32,14 +31,6 @@ class SimulationGatewayHealthResponse(BaseModel): simulation_gateway: Literal["healthy"] -def _internal_probes_enabled() -> bool: - return os.environ.get("CLOUD_RUN_INTERNAL_PROBES", "").lower() in { - "1", - "true", - "yes", - } - - def _add_vary_origin(response) -> None: vary = response.headers.get("Vary") if vary is None: @@ -92,9 +83,6 @@ def health() -> HealthResponse: include_in_schema=False, ) def simulation_gateway_health() -> SimulationGatewayHealthResponse: - if not _internal_probes_enabled(): - raise HTTPException(status_code=404, detail="Not found") - from policyengine_api.libs.simulation_api_modal import SimulationAPIModal try: diff --git a/tests/unit/test_asgi_factory.py b/tests/unit/test_asgi_factory.py index c767be913..07adb7e29 100644 --- a/tests/unit/test_asgi_factory.py +++ b/tests/unit/test_asgi_factory.py @@ -162,17 +162,7 @@ def test_health_route_uses_same_reflected_cors_policy(): assert response.headers["vary"] == "Origin" -def test_simulation_gateway_health_probe_is_disabled_by_default(monkeypatch): - monkeypatch.delenv("CLOUD_RUN_INTERNAL_PROBES", raising=False) - client = TestClient(create_asgi_app(create_test_wsgi_app())) - - response = client.get("/health/simulation-gateway") - - assert response.status_code == 404 - - -def test_simulation_gateway_health_probe_checks_gateway(monkeypatch): - 
monkeypatch.setenv("CLOUD_RUN_INTERNAL_PROBES", "1") +def test_public_simulation_gateway_health_probe_checks_gateway(): client = TestClient(create_asgi_app(create_test_wsgi_app())) with patch( @@ -191,8 +181,7 @@ def test_simulation_gateway_health_probe_checks_gateway(monkeypatch): simulation_api.return_value.health_check.assert_called_once_with() -def test_simulation_gateway_health_probe_reports_failure(monkeypatch): - monkeypatch.setenv("CLOUD_RUN_INTERNAL_PROBES", "1") +def test_public_simulation_gateway_health_probe_reports_failure(): client = TestClient(create_asgi_app(create_test_wsgi_app())) with patch( diff --git a/tests/unit/test_cloud_run_deploy_scripts.py b/tests/unit/test_cloud_run_deploy_scripts.py index f05c926be..c57344ded 100644 --- a/tests/unit/test_cloud_run_deploy_scripts.py +++ b/tests/unit/test_cloud_run_deploy_scripts.py @@ -168,7 +168,7 @@ def test_deploy_cloud_run_candidate_dry_run_never_shifts_traffic(): f"POLICYENGINE_DB_INSTANCE_CONNECTION_NAME={PRODUCTION_CLOUD_SQL_INSTANCE}" in result.stdout ) - assert "CLOUD_RUN_INTERNAL_PROBES=1" in result.stdout + assert "CLOUD_RUN_INTERNAL_PROBES" not in result.stdout assert "--to-latest" not in result.stdout assert "update-traffic" not in result.stdout From f13cbd5b07717b03854527a9ae7f2732b322b485 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 2 Jun 2026 23:34:39 +0200 Subject: [PATCH 15/22] Require AI harness lint before commits --- docs/engineering/skills/github-prs.md | 14 ++++++++++++++ docs/engineering/skills/testing.md | 12 ++++++++++-- tests/unit/test_cloud_run_deploy_scripts.py | 3 +-- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/docs/engineering/skills/github-prs.md b/docs/engineering/skills/github-prs.md index 94b6cce60..ddbc83d9a 100644 --- a/docs/engineering/skills/github-prs.md +++ b/docs/engineering/skills/github-prs.md @@ -24,3 +24,17 @@ For migration work, identify: - what is newly prepared for FastAPI, SQLAlchemy/Alembic, Supabase, Cloud Run, or Modal 
migration; - which user-visible API contract changes are intentionally introduced. + +## Commit Hygiene + +AI agents must run formatting and lint checks before committing their own code +changes: + +```bash +make format +ruff check +``` + +Commit only after formatting succeeds and changed Python files pass lint. If a +broader repo-wide lint command fails on unrelated pre-existing issues, include +that result in the handoff instead of hiding it. diff --git a/docs/engineering/skills/testing.md b/docs/engineering/skills/testing.md index 71ce0d56f..907635fef 100644 --- a/docs/engineering/skills/testing.md +++ b/docs/engineering/skills/testing.md @@ -70,5 +70,13 @@ fixture; smoke tests must not create or update households: API_BASE_URL=https://candidate-url python -m pytest tests/integration/test_cloud_run_candidate.py -v ``` -Run `ruff format --check` and `ruff check` on changed Python files before -handoff. +Before committing AI-authored code changes, run repository formatting and lint: + +```bash +make format +ruff check +``` + +Commit only after formatting succeeds and changed Python files pass lint. If a +broader repo-wide lint command fails on unrelated pre-existing issues, include +that result in the handoff instead of hiding it. 
diff --git a/tests/unit/test_cloud_run_deploy_scripts.py b/tests/unit/test_cloud_run_deploy_scripts.py index c57344ded..101b24f81 100644 --- a/tests/unit/test_cloud_run_deploy_scripts.py +++ b/tests/unit/test_cloud_run_deploy_scripts.py @@ -269,8 +269,7 @@ def test_push_workflow_uses_dedicated_cloud_run_runtime_service_account(): "${{ secrets.GCP_CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT }}" ) deploy_account_secret = ( - "CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT: " - "${{ secrets.GCP_DEPLOY_SERVICE_ACCOUNT }}" + "CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT: ${{ secrets.GCP_DEPLOY_SERVICE_ACCOUNT }}" ) assert runtime_account_secret in cloud_run_staging From f5899cdaec34daf7d93d2d74c3f467d49e3c028d Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 3 Jun 2026 22:41:32 +0200 Subject: [PATCH 16/22] Use Secret Manager for Cloud Run runtime secrets --- .github/scripts/cloud_run_env.sh | 10 +++ .github/scripts/deploy_cloud_run_candidate.sh | 17 ++-- .../scripts/validate_cloud_run_deploy_env.sh | 10 +-- .github/workflows/push.yml | 10 --- .github/workflows/sync-cloud-run-secrets.yml | 89 ++++++++++++++++++ ...gration-pr3-cloud-run-candidate-runbook.md | 37 ++++++++ tests/unit/test_cloud_run_deploy_scripts.py | 90 +++++++++++++++++-- 7 files changed, 236 insertions(+), 27 deletions(-) create mode 100644 .github/workflows/sync-cloud-run-secrets.yml diff --git a/.github/scripts/cloud_run_env.sh b/.github/scripts/cloud_run_env.sh index cfcb62b7e..66ca3c5f1 100755 --- a/.github/scripts/cloud_run_env.sh +++ b/.github/scripts/cloud_run_env.sh @@ -13,6 +13,11 @@ cloud_run_set_defaults() { CLOUD_RUN_MIN_INSTANCES="${CLOUD_RUN_MIN_INSTANCES:-0}" CLOUD_RUN_MAX_INSTANCES="${CLOUD_RUN_MAX_INSTANCES:-1}" CLOUD_RUN_PORT="${CLOUD_RUN_PORT:-8080}" + CLOUD_RUN_POLICYENGINE_DB_PASSWORD_SECRET="${CLOUD_RUN_POLICYENGINE_DB_PASSWORD_SECRET:-policyengine-api-prod-db-password:latest}" + 
CLOUD_RUN_GITHUB_MICRODATA_TOKEN_SECRET="${CLOUD_RUN_GITHUB_MICRODATA_TOKEN_SECRET:-policyengine-api-prod-github-microdata-token:latest}" + CLOUD_RUN_ANTHROPIC_API_KEY_SECRET="${CLOUD_RUN_ANTHROPIC_API_KEY_SECRET:-policyengine-api-prod-anthropic-api-key:latest}" + CLOUD_RUN_OPENAI_API_KEY_SECRET="${CLOUD_RUN_OPENAI_API_KEY_SECRET:-policyengine-api-prod-openai-api-key:latest}" + CLOUD_RUN_HUGGING_FACE_TOKEN_SECRET="${CLOUD_RUN_HUGGING_FACE_TOKEN_SECRET:-policyengine-api-prod-hugging-face-token:latest}" local sha sha="${GITHUB_SHA:-local}" @@ -35,6 +40,11 @@ cloud_run_set_defaults() { export CLOUD_RUN_MIN_INSTANCES export CLOUD_RUN_MAX_INSTANCES export CLOUD_RUN_PORT + export CLOUD_RUN_POLICYENGINE_DB_PASSWORD_SECRET + export CLOUD_RUN_GITHUB_MICRODATA_TOKEN_SECRET + export CLOUD_RUN_ANTHROPIC_API_KEY_SECRET + export CLOUD_RUN_OPENAI_API_KEY_SECRET + export CLOUD_RUN_HUGGING_FACE_TOKEN_SECRET export CLOUD_RUN_IMAGE_TAG export CLOUD_RUN_IMAGE_URI export CLOUD_RUN_TAG diff --git a/.github/scripts/deploy_cloud_run_candidate.sh b/.github/scripts/deploy_cloud_run_candidate.sh index 989990160..f97dd9a29 100755 --- a/.github/scripts/deploy_cloud_run_candidate.sh +++ b/.github/scripts/deploy_cloud_run_candidate.sh @@ -11,11 +11,6 @@ env_vars=( "POLICYENGINE_DB_INSTANCE_CONNECTION_NAME=${CLOUD_RUN_CLOUD_SQL_INSTANCE}" "POLICYENGINE_DB_USER=${POLICYENGINE_DB_USER:-policyengine}" "POLICYENGINE_DB_NAME=${POLICYENGINE_DB_NAME:-policyengine}" - "POLICYENGINE_DB_PASSWORD=${POLICYENGINE_DB_PASSWORD}" - "POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN=${POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN}" - "ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}" - "OPENAI_API_KEY=${OPENAI_API_KEY}" - "HUGGING_FACE_TOKEN=${HUGGING_FACE_TOKEN}" "SIMULATION_API_URL=${SIMULATION_API_URL}" "GATEWAY_AUTH_REQUIRED=1" "GATEWAY_AUTH_ISSUER=${GATEWAY_AUTH_ISSUER}" @@ -28,7 +23,16 @@ env_vars=( "CLOUD_RUN_REVISION_TAG=${CLOUD_RUN_TAG}" ) +secret_vars=( + "POLICYENGINE_DB_PASSWORD=${CLOUD_RUN_POLICYENGINE_DB_PASSWORD_SECRET}" 
+ "POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN=${CLOUD_RUN_GITHUB_MICRODATA_TOKEN_SECRET}" + "ANTHROPIC_API_KEY=${CLOUD_RUN_ANTHROPIC_API_KEY_SECRET}" + "OPENAI_API_KEY=${CLOUD_RUN_OPENAI_API_KEY_SECRET}" + "HUGGING_FACE_TOKEN=${CLOUD_RUN_HUGGING_FACE_TOKEN_SECRET}" +) + set_env_vars="$(IFS='|'; echo "^|^${env_vars[*]}")" +set_secret_vars="$(IFS='|'; echo "^|^${secret_vars[*]}")" cloud_run_run gcloud run deploy "${CLOUD_RUN_SERVICE}" \ --project "${CLOUD_RUN_PROJECT}" \ @@ -47,4 +51,5 @@ cloud_run_run gcloud run deploy "${CLOUD_RUN_SERVICE}" \ --timeout "${CLOUD_RUN_TIMEOUT}" \ --min-instances "${CLOUD_RUN_MIN_INSTANCES}" \ --max-instances "${CLOUD_RUN_MAX_INSTANCES}" \ - --set-env-vars "${set_env_vars}" + --set-env-vars "${set_env_vars}" \ + --set-secrets "${set_secret_vars}" diff --git a/.github/scripts/validate_cloud_run_deploy_env.sh b/.github/scripts/validate_cloud_run_deploy_env.sh index ab1141c37..fe549db34 100755 --- a/.github/scripts/validate_cloud_run_deploy_env.sh +++ b/.github/scripts/validate_cloud_run_deploy_env.sh @@ -14,11 +14,11 @@ cloud_run_require_env \ CLOUD_RUN_TAG \ CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT \ CLOUD_RUN_CLOUD_SQL_INSTANCE \ - POLICYENGINE_DB_PASSWORD \ - POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN \ - ANTHROPIC_API_KEY \ - OPENAI_API_KEY \ - HUGGING_FACE_TOKEN \ + CLOUD_RUN_POLICYENGINE_DB_PASSWORD_SECRET \ + CLOUD_RUN_GITHUB_MICRODATA_TOKEN_SECRET \ + CLOUD_RUN_ANTHROPIC_API_KEY_SECRET \ + CLOUD_RUN_OPENAI_API_KEY_SECRET \ + CLOUD_RUN_HUGGING_FACE_TOKEN_SECRET \ SIMULATION_API_URL \ GATEWAY_AUTH_ISSUER \ GATEWAY_AUTH_AUDIENCE \ diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 6153c0324..05b320cbc 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -253,11 +253,6 @@ jobs: CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT: ${{ secrets.GCP_CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT }} POLICYENGINE_DB_USER: ${{ vars.POLICYENGINE_DB_USER }} POLICYENGINE_DB_NAME: ${{ vars.POLICYENGINE_DB_NAME }} - 
POLICYENGINE_DB_PASSWORD: ${{ secrets.POLICYENGINE_DB_PASSWORD }} - POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN: ${{ secrets.POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN }} - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} SIMULATION_API_URL: ${{ secrets.SIMULATION_API_URL }} GATEWAY_AUTH_ISSUER: ${{ secrets.GATEWAY_AUTH_ISSUER }} GATEWAY_AUTH_AUDIENCE: ${{ secrets.GATEWAY_AUTH_AUDIENCE }} @@ -491,11 +486,6 @@ jobs: CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT: ${{ secrets.GCP_CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT }} POLICYENGINE_DB_USER: ${{ vars.POLICYENGINE_DB_USER }} POLICYENGINE_DB_NAME: ${{ vars.POLICYENGINE_DB_NAME }} - POLICYENGINE_DB_PASSWORD: ${{ secrets.POLICYENGINE_DB_PASSWORD }} - POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN: ${{ secrets.POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN }} - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} SIMULATION_API_URL: ${{ secrets.SIMULATION_API_URL }} GATEWAY_AUTH_ISSUER: ${{ secrets.GATEWAY_AUTH_ISSUER }} GATEWAY_AUTH_AUDIENCE: ${{ secrets.GATEWAY_AUTH_AUDIENCE }} diff --git a/.github/workflows/sync-cloud-run-secrets.yml b/.github/workflows/sync-cloud-run-secrets.yml new file mode 100644 index 000000000..2cc691182 --- /dev/null +++ b/.github/workflows/sync-cloud-run-secrets.yml @@ -0,0 +1,89 @@ +name: Sync Cloud Run secrets + +on: + workflow_dispatch: + +concurrency: + group: cloud-run-secret-sync + +jobs: + sync-cloud-run-secrets: + name: Sync GitHub secrets to Secret Manager + runs-on: ubuntu-latest + environment: production + permissions: + contents: read + id-token: write + steps: + - name: Require master branch + if: github.ref != 'refs/heads/master' + run: | + echo "::error::Cloud Run secret sync must run from master." 
+ exit 1 + - name: Checkout repo + uses: actions/checkout@v4 + - name: GCP authentication + uses: "google-github-actions/auth@v2" + with: + workload_identity_provider: "${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}" + service_account: "${{ secrets.GCP_DEPLOY_SERVICE_ACCOUNT }}" + - name: Set up GCloud + uses: "google-github-actions/setup-gcloud@v2" + - name: Sync runtime secrets + env: + CLOUD_RUN_PROJECT: policyengine-api + CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT: ${{ secrets.GCP_CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT }} + POLICYENGINE_DB_PASSWORD: ${{ secrets.POLICYENGINE_DB_PASSWORD }} + POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN: ${{ secrets.POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} + run: | + set -euo pipefail + set +x + + require_env() { + local env_name="$1" + if [[ -z "${!env_name:-}" ]]; then + echo "::error::Missing required workflow environment ${env_name}." + exit 1 + fi + } + + sync_secret() { + local env_name="$1" + local secret_name="$2" + local secret_value="${!env_name:-}" + + if [[ -z "${secret_value}" ]]; then + echo "::error::Missing required GitHub secret ${env_name}." + exit 1 + fi + + if ! gcloud secrets describe "${secret_name}" \ + --project "${CLOUD_RUN_PROJECT}" >/dev/null 2>&1; then + gcloud secrets create "${secret_name}" \ + --project "${CLOUD_RUN_PROJECT}" \ + --replication-policy automatic + fi + + printf '%s' "${secret_value}" | gcloud secrets versions add \ + "${secret_name}" \ + --project "${CLOUD_RUN_PROJECT}" \ + --data-file=- >/dev/null + + gcloud secrets add-iam-policy-binding "${secret_name}" \ + --project "${CLOUD_RUN_PROJECT}" \ + --member "serviceAccount:${CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT}" \ + --role roles/secretmanager.secretAccessor >/dev/null + + echo "Synced ${env_name} to Secret Manager secret ${secret_name}." 
+ } + + require_env CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT + + sync_secret POLICYENGINE_DB_PASSWORD policyengine-api-prod-db-password + sync_secret POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN policyengine-api-prod-github-microdata-token + sync_secret ANTHROPIC_API_KEY policyengine-api-prod-anthropic-api-key + sync_secret OPENAI_API_KEY policyengine-api-prod-openai-api-key + sync_secret HUGGING_FACE_TOKEN policyengine-api-prod-hugging-face-token diff --git a/docs/migration-pr3-cloud-run-candidate-runbook.md b/docs/migration-pr3-cloud-run-candidate-runbook.md index bc84b25da..a4a4193bb 100644 --- a/docs/migration-pr3-cloud-run-candidate-runbook.md +++ b/docs/migration-pr3-cloud-run-candidate-runbook.md @@ -11,6 +11,8 @@ does not migrate the public App Engine/custom API URL. production tracks, then promoted to the Cloud Run service URL after tests. - Runtime environment configuration for the production Cloud SQL instance and the existing simulation gateway. +- Secret Manager-backed Cloud Run runtime credentials, synced manually from + existing GitHub Actions secrets. - A dedicated Cloud Run runtime service account, separate from the GitHub deploy service account used to run `gcloud`. - The same live staging integration suite against both the App Engine staging @@ -29,6 +31,8 @@ does not migrate the public App Engine/custom API URL. - No native FastAPI route migration beyond `/health`. - No Supabase, Alembic, SQLAlchemy model, or Modal compute migration. - No managed Redis, Redis Memorystore, or API v2-alpha-style cache replacement. +- No App Engine secret-handling migration; App Engine deploys still use the + existing transitional bundle path. - No App Engine retirement. ## Resource Defaults @@ -42,6 +46,12 @@ does not migrate the public App Engine/custom API URL. 
- Cloud SQL instance: `policyengine-api:us-central1:policyengine-api-data` - Staging revision tag: `stage3-staging-${GITHUB_RUN_NUMBER}-${GITHUB_SHA::7}` - Production revision tag: `stage3-prod-${GITHUB_RUN_NUMBER}-${GITHUB_SHA::7}` +- Secret Manager secrets: + - `policyengine-api-prod-db-password` + - `policyengine-api-prod-github-microdata-token` + - `policyengine-api-prod-anthropic-api-key` + - `policyengine-api-prod-openai-api-key` + - `policyengine-api-prod-hugging-face-token` ## Required Runtime IAM @@ -53,11 +63,38 @@ The runtime service account must be: - granted Cloud SQL client access for `policyengine-api:us-central1:policyengine-api-data`; +- allowed to read the five Cloud Run runtime secrets listed above; - allowed to read the Secret Manager secret referenced by `GATEWAY_AUTH_CLIENT_SECRET_RESOURCE`; - allowed as a service account user for the GitHub deploy service account, so the workflow can deploy revisions using that runtime identity. +The manual `Sync Cloud Run secrets` workflow authenticates through Workload +Identity Federation as `${{ secrets.GCP_DEPLOY_SERVICE_ACCOUNT }}`. That deploy +service account must be able to create the five secrets if missing, add secret +versions, and grant the Cloud Run runtime service account Secret Manager access +on those secrets. + +## Secret Sync + +Run `.github/workflows/sync-cloud-run-secrets.yml` manually from `master` before +the first Cloud Run deployment that uses Secret Manager references, and again +whenever one of the source GitHub secrets is rotated. + +The workflow copies these existing GitHub secrets into Secret Manager: + +- `POLICYENGINE_DB_PASSWORD` +- `POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN` +- `ANTHROPIC_API_KEY` +- `OPENAI_API_KEY` +- `HUGGING_FACE_TOKEN` + +The workflow writes secret payloads to `gcloud secrets versions add` through +stdin and does not print secret values. GitHub Actions remains the temporary +source of truth in PR 3. 
The long-term target is to create, rotate, and manage +these credentials directly in Secret Manager, with GitHub Actions only deploying +Secret Manager references. + ## Post-Merge Flow The `Push` workflow now uses two deployment tracks. diff --git a/tests/unit/test_cloud_run_deploy_scripts.py b/tests/unit/test_cloud_run_deploy_scripts.py index 101b24f81..719a02d22 100644 --- a/tests/unit/test_cloud_run_deploy_scripts.py +++ b/tests/unit/test_cloud_run_deploy_scripts.py @@ -11,6 +11,22 @@ DEDICATED_CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT = ( "policyengine-api-cr-runtime@policyengine-api.iam.gserviceaccount.com" ) +CLOUD_RUN_SECRET_MAPPINGS = { + "POLICYENGINE_DB_PASSWORD": "policyengine-api-prod-db-password:latest", + "POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN": ( + "policyengine-api-prod-github-microdata-token:latest" + ), + "ANTHROPIC_API_KEY": "policyengine-api-prod-anthropic-api-key:latest", + "OPENAI_API_KEY": "policyengine-api-prod-openai-api-key:latest", + "HUGGING_FACE_TOKEN": "policyengine-api-prod-hugging-face-token:latest", +} +RAW_CLOUD_RUN_SECRET_VALUES = ( + "raw-db-secret-value", + "raw-github-secret-value", + "raw-anthropic-secret-value", + "raw-openai-secret-value", + "raw-hf-secret-value", +) def _script_env(**overrides: str) -> dict[str, str]: @@ -25,11 +41,11 @@ def _script_env(**overrides: str) -> dict[str, str]: def _required_runtime_env() -> dict[str, str]: return { - "POLICYENGINE_DB_PASSWORD": "db-password", - "POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN": "github-token", - "ANTHROPIC_API_KEY": "anthropic-key", - "OPENAI_API_KEY": "openai-key", - "HUGGING_FACE_TOKEN": "hf-token", + "POLICYENGINE_DB_PASSWORD": "raw-db-secret-value", + "POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN": ("raw-github-secret-value"), + "ANTHROPIC_API_KEY": "raw-anthropic-secret-value", + "OPENAI_API_KEY": "raw-openai-secret-value", + "HUGGING_FACE_TOKEN": "raw-hf-secret-value", "SIMULATION_API_URL": "https://simulation.example.test", "GATEWAY_AUTH_ISSUER": 
"https://issuer.example.test", "GATEWAY_AUTH_AUDIENCE": "simulation-gateway", @@ -55,6 +71,12 @@ def _push_workflow() -> str: return (REPO / ".github/workflows/push.yml").read_text(encoding="utf-8") +def _sync_secrets_workflow() -> str: + return (REPO / ".github/workflows/sync-cloud-run-secrets.yml").read_text( + encoding="utf-8" + ) + + def _workflow_job_block(workflow: str, job_name: str) -> str: match = re.search( rf"^ {re.escape(job_name)}:\n(?P.*?)(?=^ [a-zA-Z0-9_-]+:|\Z)", @@ -118,8 +140,9 @@ def test_validate_cloud_run_deploy_env_reports_missing_runtime_config(): assert result.returncode == 1 assert "Missing required Cloud Run deployment configuration" in result.stderr - assert "POLICYENGINE_DB_PASSWORD" in result.stderr + assert "SIMULATION_API_URL" in result.stderr assert "GATEWAY_AUTH_CLIENT_SECRET_RESOURCE" in result.stderr + assert "POLICYENGINE_DB_PASSWORD" not in result.stderr def test_build_cloud_run_image_dry_run_uses_cloud_run_dockerfile(): @@ -168,6 +191,11 @@ def test_deploy_cloud_run_candidate_dry_run_never_shifts_traffic(): f"POLICYENGINE_DB_INSTANCE_CONNECTION_NAME={PRODUCTION_CLOUD_SQL_INSTANCE}" in result.stdout ) + assert "--set-secrets" in result.stdout + for env_name, secret_ref in CLOUD_RUN_SECRET_MAPPINGS.items(): + assert f"{env_name}={secret_ref}" in result.stdout + for raw_secret_value in RAW_CLOUD_RUN_SECRET_VALUES: + assert raw_secret_value not in result.stdout assert "CLOUD_RUN_INTERNAL_PROBES" not in result.stdout assert "--to-latest" not in result.stdout assert "update-traffic" not in result.stdout @@ -278,6 +306,56 @@ def test_push_workflow_uses_dedicated_cloud_run_runtime_service_account(): assert deploy_account_secret not in cloud_run_production +def test_push_workflow_does_not_pass_raw_secrets_to_cloud_run_deploy_jobs(): + workflow = _push_workflow() + cloud_run_staging = _workflow_job_block(workflow, "deploy-cloud-run-staging") + cloud_run_production = _workflow_job_block(workflow, "deploy-cloud-run-candidate") + 
raw_secret_envs = ( + "POLICYENGINE_DB_PASSWORD: ${{ secrets.POLICYENGINE_DB_PASSWORD }}", + ( + "POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN: " + "${{ secrets.POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN }}" + ), + "ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}", + "OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}", + "HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}", + ) + + for raw_secret_env in raw_secret_envs: + assert raw_secret_env not in cloud_run_staging + assert raw_secret_env not in cloud_run_production + + +def test_sync_cloud_run_secrets_workflow_is_manual_and_environment_gated(): + workflow = _sync_secrets_workflow() + + assert "workflow_dispatch:" in workflow + assert "pull_request:" not in workflow + assert "push:" not in workflow + assert "environment: production" in workflow + assert "id-token: write" in workflow + assert "github.ref != 'refs/heads/master'" in workflow + assert "google-github-actions/auth@v2" in workflow + assert ( + 'workload_identity_provider: "${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}"' + in workflow + ) + assert 'service_account: "${{ secrets.GCP_DEPLOY_SERVICE_ACCOUNT }}"' in workflow + + +def test_sync_cloud_run_secrets_workflow_writes_expected_secret_versions(): + workflow = _sync_secrets_workflow() + + assert "set +x" in workflow + assert "--data-file=-" in workflow + assert "gcloud secrets add-iam-policy-binding" in workflow + assert "roles/secretmanager.secretAccessor" in workflow + for env_name, secret_ref in CLOUD_RUN_SECRET_MAPPINGS.items(): + secret_name = secret_ref.removesuffix(":latest") + assert f"{env_name}: ${{{{ secrets.{env_name} }}}}" in workflow + assert f"sync_secret {env_name} {secret_name}" in workflow + + def test_push_workflow_promotes_production_cloud_run_after_candidate_smoke(): workflow = _push_workflow() cloud_run_production = _workflow_job_block(workflow, "deploy-cloud-run-candidate") From cc2125b1f5730438bdb297d3bcabe8a384633bd4 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Thu, 4 Jun 
2026 17:06:28 +0200 Subject: [PATCH 17/22] Temporarily allow PR secret sync --- .github/workflows/sync-cloud-run-secrets.yml | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/.github/workflows/sync-cloud-run-secrets.yml b/.github/workflows/sync-cloud-run-secrets.yml index 2cc691182..6ae4e1d2e 100644 --- a/.github/workflows/sync-cloud-run-secrets.yml +++ b/.github/workflows/sync-cloud-run-secrets.yml @@ -2,6 +2,13 @@ name: Sync Cloud Run secrets on: workflow_dispatch: + pull_request: + branches: + - master + paths: + - .github/workflows/sync-cloud-run-secrets.yml + types: + - synchronize concurrency: group: cloud-run-secret-sync @@ -15,10 +22,17 @@ jobs: contents: read id-token: write steps: - - name: Require master branch - if: github.ref != 'refs/heads/master' + - name: Require approved sync source + if: >- + github.ref != 'refs/heads/master' && + !( + github.event_name == 'pull_request' && + github.event.pull_request.number == 3649 && + github.event.pull_request.head.repo.full_name == github.repository && + github.event.pull_request.head.ref == 'migration-pr3-cloud-run-candidate' + ) run: | - echo "::error::Cloud Run secret sync must run from master." + echo "::error::Cloud Run secret sync must run from master or the approved temporary PR branch." exit 1 - name: Checkout repo uses: actions/checkout@v4 From ea8227f5e1070d991ef3fc4c712e794c947aae85 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Thu, 4 Jun 2026 17:08:29 +0200 Subject: [PATCH 18/22] Revert "Temporarily allow PR secret sync" This reverts commit cc2125b1f5730438bdb297d3bcabe8a384633bd4. 
--- .github/workflows/sync-cloud-run-secrets.yml | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/.github/workflows/sync-cloud-run-secrets.yml b/.github/workflows/sync-cloud-run-secrets.yml index 6ae4e1d2e..2cc691182 100644 --- a/.github/workflows/sync-cloud-run-secrets.yml +++ b/.github/workflows/sync-cloud-run-secrets.yml @@ -2,13 +2,6 @@ name: Sync Cloud Run secrets on: workflow_dispatch: - pull_request: - branches: - - master - paths: - - .github/workflows/sync-cloud-run-secrets.yml - types: - - synchronize concurrency: group: cloud-run-secret-sync @@ -22,17 +15,10 @@ jobs: contents: read id-token: write steps: - - name: Require approved sync source - if: >- - github.ref != 'refs/heads/master' && - !( - github.event_name == 'pull_request' && - github.event.pull_request.number == 3649 && - github.event.pull_request.head.repo.full_name == github.repository && - github.event.pull_request.head.ref == 'migration-pr3-cloud-run-candidate' - ) + - name: Require master branch + if: github.ref != 'refs/heads/master' run: | - echo "::error::Cloud Run secret sync must run from master or the approved temporary PR branch." + echo "::error::Cloud Run secret sync must run from master." 
exit 1 - name: Checkout repo uses: actions/checkout@v4 From a8a8e987c0b3a4af5e893d8945adf46f0d15f0dd Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Thu, 4 Jun 2026 17:58:19 +0200 Subject: [PATCH 19/22] Remove Cloud Run household smoke fixture --- .github/workflows/push.yml | 1 - docs/engineering/skills/testing.md | 7 +++---- tests/integration/test_cloud_run_candidate.py | 21 +------------------ 3 files changed, 4 insertions(+), 25 deletions(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 05b320cbc..a4eeeda37 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -506,7 +506,6 @@ jobs: env: API_BASE_URL: ${{ steps.cloud_run_url.outputs.url }} STAGING_API_TEST_PROBE_ID: cloud-run-${{ steps.cloud_run.outputs.revision_tag }} - CLOUD_RUN_SMOKE_HOUSEHOLD_ID: ${{ vars.CLOUD_RUN_SMOKE_HOUSEHOLD_ID }} - name: Promote Cloud Run production candidate run: bash .github/scripts/promote_cloud_run_tag.sh env: diff --git a/docs/engineering/skills/testing.md b/docs/engineering/skills/testing.md index 907635fef..6434151f1 100644 --- a/docs/engineering/skills/testing.md +++ b/docs/engineering/skills/testing.md @@ -61,10 +61,9 @@ promoting the tested Cloud Run tag to the service URL. Production Cloud Run promotion should happen only after tagged candidate smoke tests pass, and should health-check the Cloud Run service URL after promotion. Live Cloud Run candidate checks must be explicit deployed probes. Production candidate smoke tests -require `API_BASE_URL` and `CLOUD_RUN_SMOKE_HOUSEHOLD_ID`, and should not run as -part of ordinary local test commands. -`CLOUD_RUN_SMOKE_HOUSEHOLD_ID` must point to a pre-existing read-only household -fixture; smoke tests must not create or update households: +require `API_BASE_URL` and should not run as part of ordinary local test +commands. 
These checks should stay read-only and avoid depending on specific +production data fixtures: ```bash API_BASE_URL=https://candidate-url python -m pytest tests/integration/test_cloud_run_candidate.py -v diff --git a/tests/integration/test_cloud_run_candidate.py b/tests/integration/test_cloud_run_candidate.py index d8733493c..eae25f3d4 100644 --- a/tests/integration/test_cloud_run_candidate.py +++ b/tests/integration/test_cloud_run_candidate.py @@ -1,8 +1,3 @@ -import os - -import pytest - - def test_cloud_run_candidate_health_routes(api_client): health_response = api_client.get("/health") assert health_response.status_code == 200, health_response.text @@ -26,7 +21,7 @@ def test_cloud_run_candidate_health_routes(api_client): } -def test_cloud_run_candidate_metadata_policy_and_household( +def test_cloud_run_candidate_metadata_and_policy( api_client, ): metadata_response = api_client.get("/us/metadata") @@ -39,17 +34,3 @@ def test_cloud_run_candidate_metadata_policy_and_household( policy_payload = policy_response.json() assert policy_payload["status"] == "ok" assert policy_payload["result"]["id"] == current_law_id - - household_id = os.environ.get("CLOUD_RUN_SMOKE_HOUSEHOLD_ID") or None - if household_id is None: - pytest.fail( - "CLOUD_RUN_SMOKE_HOUSEHOLD_ID must be set to a pre-existing " - "read-only household fixture. Cloud Run smoke tests must not " - "create or update households." 
- ) - - household_response = api_client.get(f"/us/household/{household_id}") - assert household_response.status_code == 200, household_response.text - household_payload = household_response.json() - assert household_payload["status"] == "ok" - assert str(household_payload["result"]["id"]) == household_id From 696b8c0a268c04bc9a812bf839b69948120dc24a Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Thu, 4 Jun 2026 19:31:01 +0200 Subject: [PATCH 20/22] Rename simulation gateway health probe --- docs/migration-pr3-cloud-run-candidate-runbook.md | 4 ++-- policyengine_api/asgi_factory.py | 4 ++-- policyengine_api/migration_flags.py | 2 +- tests/integration/test_cloud_run_candidate.py | 2 +- tests/unit/test_asgi_factory.py | 4 ++-- tests/unit/test_migration_flags.py | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/migration-pr3-cloud-run-candidate-runbook.md b/docs/migration-pr3-cloud-run-candidate-runbook.md index a4a4193bb..44bc7fa1d 100644 --- a/docs/migration-pr3-cloud-run-candidate-runbook.md +++ b/docs/migration-pr3-cloud-run-candidate-runbook.md @@ -158,14 +158,14 @@ After GitHub Actions prints the candidate URL: curl -i "$CLOUD_RUN_CANDIDATE_URL/health" curl -i "$CLOUD_RUN_CANDIDATE_URL/readiness-check" curl -i "$CLOUD_RUN_CANDIDATE_URL/liveness-check" -curl -i "$CLOUD_RUN_CANDIDATE_URL/health/simulation-gateway" +curl -i "$CLOUD_RUN_CANDIDATE_URL/simulation-gateway-check" curl -i "$CLOUD_RUN_CANDIDATE_URL/us/metadata" ``` Expected behavior: - `/health` returns FastAPI JSON: `{"status":"healthy"}`. -- `/health/simulation-gateway` returns FastAPI JSON confirming the existing +- `/simulation-gateway-check` returns FastAPI JSON confirming the existing simulation gateway client can initialize and reach the gateway health check. - `/readiness-check` and `/liveness-check` return existing Flask text `OK`. - `/us/metadata` returns the existing v1 metadata contract from Cloud SQL. 
diff --git a/policyengine_api/asgi_factory.py b/policyengine_api/asgi_factory.py index 3b81ee608..e81588aaa 100644 --- a/policyengine_api/asgi_factory.py +++ b/policyengine_api/asgi_factory.py @@ -17,7 +17,7 @@ FASTAPI_NATIVE_LOGGED_PATHS = frozenset( { "/health", - "/health/simulation-gateway", + "/simulation-gateway-check", } ) @@ -78,7 +78,7 @@ def health() -> HealthResponse: return HealthResponse(status="healthy") @app.get( - "/health/simulation-gateway", + "/simulation-gateway-check", response_model=SimulationGatewayHealthResponse, include_in_schema=False, ) diff --git a/policyengine_api/migration_flags.py b/policyengine_api/migration_flags.py index 0075ab9bf..0d659eb36 100644 --- a/policyengine_api/migration_flags.py +++ b/policyengine_api/migration_flags.py @@ -71,7 +71,7 @@ def infer_route_group(path: str) -> str: return "home" if path in { "/health", - "/health/simulation-gateway", + "/simulation-gateway-check", "/liveness-check", "/readiness-check", }: diff --git a/tests/integration/test_cloud_run_candidate.py b/tests/integration/test_cloud_run_candidate.py index eae25f3d4..401a7bea3 100644 --- a/tests/integration/test_cloud_run_candidate.py +++ b/tests/integration/test_cloud_run_candidate.py @@ -11,7 +11,7 @@ def test_cloud_run_candidate_health_routes(api_client): assert readiness_response.status_code == 200, readiness_response.text assert readiness_response.text == "OK" - simulation_gateway_response = api_client.get("/health/simulation-gateway") + simulation_gateway_response = api_client.get("/simulation-gateway-check") assert simulation_gateway_response.status_code == 200, ( simulation_gateway_response.text ) diff --git a/tests/unit/test_asgi_factory.py b/tests/unit/test_asgi_factory.py index 07adb7e29..35503edbe 100644 --- a/tests/unit/test_asgi_factory.py +++ b/tests/unit/test_asgi_factory.py @@ -170,7 +170,7 @@ def test_public_simulation_gateway_health_probe_checks_gateway(): ) as simulation_api: simulation_api.return_value.health_check.return_value 
= True - response = client.get("/health/simulation-gateway") + response = client.get("/simulation-gateway-check") assert response.status_code == 200 assert response.json() == { @@ -189,7 +189,7 @@ def test_public_simulation_gateway_health_probe_reports_failure(): ) as simulation_api: simulation_api.return_value.health_check.return_value = False - response = client.get("/health/simulation-gateway") + response = client.get("/simulation-gateway-check") assert response.status_code == 503 diff --git a/tests/unit/test_migration_flags.py b/tests/unit/test_migration_flags.py index 58f073337..5367a4494 100644 --- a/tests/unit/test_migration_flags.py +++ b/tests/unit/test_migration_flags.py @@ -58,7 +58,7 @@ def test_invalid_migration_flag_raises(monkeypatch): [ ("/", "home"), ("/health", "health"), - ("/health/simulation-gateway", "health"), + ("/simulation-gateway-check", "health"), ("/readiness-check", "health"), ("/us/metadata", "metadata"), ("/us/policy/1", "policy"), From 60996b09bbd439239e5fe14e9f54c2ff17125343 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Thu, 4 Jun 2026 22:31:02 +0200 Subject: [PATCH 21/22] Shorten staging checks and predeploy prod candidate --- .github/workflows/push.yml | 37 ++++++++++++-- docs/engineering/skills/testing.md | 5 +- ...gration-pr3-cloud-run-candidate-runbook.md | 18 ++++--- tests/integration/test_live_economy.py | 49 +------------------ tests/unit/test_cloud_run_deploy_scripts.py | 23 +++++++-- 5 files changed, 68 insertions(+), 64 deletions(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index a4eeeda37..6255e276b 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -380,10 +380,10 @@ jobs: env: SIMULATION_API_URL: ${{ secrets.SIMULATION_API_URL }} - deploy-production: - name: Deploy production App Engine version + deploy-production-candidate: + name: Deploy production App Engine candidate runs-on: ubuntu-latest - needs: ensure-production-model-version-aligns-with-sim-api + needs: 
deploy-staging if: | (github.repository == 'PolicyEngine/policyengine-api') && (github.event.head_commit.message == 'Update PolicyEngine API') @@ -391,6 +391,9 @@ jobs: permissions: contents: read id-token: write + outputs: + version: ${{ steps.version.outputs.version }} + url: ${{ steps.version_url.outputs.url }} steps: - name: Checkout repo uses: actions/checkout@v4 @@ -447,10 +450,34 @@ jobs: APP_ENGINE_VERSION: ${{ steps.version.outputs.version }} - name: Wait for production version health run: bash .github/scripts/health_check.sh "${{ steps.version_url.outputs.url }}/readiness-check" + + promote-production: + name: Promote production App Engine candidate + runs-on: ubuntu-latest + needs: + - deploy-production-candidate + - ensure-production-model-version-aligns-with-sim-api + if: | + (github.repository == 'PolicyEngine/policyengine-api') + && (github.event.head_commit.message == 'Update PolicyEngine API') + environment: production + permissions: + contents: read + id-token: write + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: GCP authentication + uses: "google-github-actions/auth@v2" + with: + workload_identity_provider: "${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}" + service_account: "${{ secrets.GCP_DEPLOY_SERVICE_ACCOUNT }}" + - name: Set up GCloud + uses: "google-github-actions/setup-gcloud@v2" - name: Promote production version run: bash .github/scripts/promote_app_engine_version.sh env: - APP_ENGINE_VERSION: ${{ steps.version.outputs.version }} + APP_ENGINE_VERSION: ${{ needs.deploy-production-candidate.outputs.version }} deploy-cloud-run-candidate: name: Deploy production Cloud Run candidate @@ -522,7 +549,7 @@ jobs: docker: name: Docker runs-on: ubuntu-latest - needs: deploy-production + needs: promote-production if: | (github.repository == 'PolicyEngine/policyengine-api') && (github.event.head_commit.message == 'Update PolicyEngine API') diff --git a/docs/engineering/skills/testing.md b/docs/engineering/skills/testing.md 
index 6434151f1..caa332527 100644 --- a/docs/engineering/skills/testing.md +++ b/docs/engineering/skills/testing.md @@ -57,7 +57,10 @@ fail-fast behavior rather than any managed Redis integration. Staging deployment checks should run the same live integration suite against both the App Engine staging URL and the tagged Cloud Run staging URL before -promoting the tested Cloud Run tag to the service URL. Production Cloud Run +promoting the tested Cloud Run tag to the service URL. App Engine production +candidate deploys may run before the staging integration jobs finish, but must +use `APP_ENGINE_PROMOTE=0`; the traffic promotion job must remain gated on the +staging checks and production model-version alignment. Production Cloud Run promotion should happen only after tagged candidate smoke tests pass, and should health-check the Cloud Run service URL after promotion. Live Cloud Run candidate checks must be explicit deployed probes. Production candidate smoke tests diff --git a/docs/migration-pr3-cloud-run-candidate-runbook.md b/docs/migration-pr3-cloud-run-candidate-runbook.md index 44bc7fa1d..9aac27f3c 100644 --- a/docs/migration-pr3-cloud-run-candidate-runbook.md +++ b/docs/migration-pr3-cloud-run-candidate-runbook.md @@ -117,12 +117,18 @@ python -m pytest \ Production: -1. After both staging integration jobs pass, run the production model-version - alignment check. -2. Deploy/promote the App Engine production version. -3. Deploy a tagged Cloud Run production revision with no traffic. -4. Smoke-test the tagged Cloud Run production URL. -5. Promote the tested production tag to 100% of the Cloud Run service URL and +1. After the App Engine staging version is healthy, deploy the App Engine + production candidate with `APP_ENGINE_PROMOTE=0` and health-check its version + URL. This version must not receive production traffic yet. +2. In parallel, run the staging integration jobs and promote the tested Cloud + Run staging tag to the Cloud Run service URL. +3. 
After the staging gates pass, run the production model-version alignment + check. +4. Promote the already-deployed App Engine production candidate to receive + public production traffic. +5. Deploy a tagged Cloud Run production revision with no traffic. +6. Smoke-test the tagged Cloud Run production URL. +7. Promote the tested production tag to 100% of the Cloud Run service URL and health-check that service URL. The Cloud Run deploy command still uses: diff --git a/tests/integration/test_live_economy.py b/tests/integration/test_live_economy.py index c204c34d3..ac1f76d8f 100644 --- a/tests/integration/test_live_economy.py +++ b/tests/integration/test_live_economy.py @@ -14,13 +14,6 @@ def _load_reform_payload(filename: str) -> dict: ) -def _pick_region(metadata: dict) -> str: - for region in metadata["economy_options"]["region"]: - if region["name"] == "us": - return "us" - return metadata["economy_options"]["region"][0]["name"] - - def _pick_time_period(metadata: dict) -> str: period_names = [ str(period["name"]) for period in metadata["economy_options"]["time_period"] @@ -44,20 +37,12 @@ def _pick_time_period(metadata: dict) -> str: return period_names[0] -def test_live_economy_smoke(api_client, integration_probe_id, poll_live_endpoint): - liveness_response = api_client.get("/liveness-check") - assert liveness_response.status_code == 200 - - readiness_response = api_client.get("/readiness-check") - assert readiness_response.status_code == 200 - +def test_live_utah_macro_reform(api_client, integration_probe_id, poll_live_endpoint): metadata_response = api_client.get("/us/metadata") metadata_response.raise_for_status() metadata = metadata_response.json()["result"] - current_law_id = metadata["current_law_id"] - region = _pick_region(metadata) - time_period = _pick_time_period(metadata) + test_year = _pick_time_period(metadata) policy_response = api_client.post( "/us/policy", @@ -69,36 +54,6 @@ def test_live_economy_smoke(api_client, integration_probe_id, 
poll_live_endpoint payload = poll_live_endpoint( api_client, f"/us/economy/{policy_id}/over/{current_law_id}", - { - "region": region, - "time_period": time_period, - "staging_probe": f"{integration_probe_id}-smoke", - }, - route_name="economy", - ) - - assert payload["status"] == "ok", payload - assert payload["result"] is not None, payload - assert "budget" in payload["result"], payload - - -def test_live_utah_macro_reform(api_client, integration_probe_id, poll_live_endpoint): - default_policy_id = 2 - - metadata_response = api_client.get("/us/metadata") - metadata_response.raise_for_status() - test_year = _pick_time_period(metadata_response.json()["result"]) - - policy_response = api_client.post( - "/us/policy", - json=_load_reform_payload("utah_reform.json"), - ) - assert policy_response.status_code in (200, 201) - policy_id = policy_response.json()["result"]["policy_id"] - - payload = poll_live_endpoint( - api_client, - f"/us/economy/{policy_id}/over/{default_policy_id}", { "region": "ut", "time_period": test_year, diff --git a/tests/unit/test_cloud_run_deploy_scripts.py b/tests/unit/test_cloud_run_deploy_scripts.py index 719a02d22..c6c1c9e82 100644 --- a/tests/unit/test_cloud_run_deploy_scripts.py +++ b/tests/unit/test_cloud_run_deploy_scripts.py @@ -269,20 +269,33 @@ def test_push_workflow_tests_app_engine_and_cloud_run_staging_tracks(): assert "bash .github/scripts/get_cloud_run_service_url.sh" in cloud_run_promotion -def test_push_workflow_deploys_production_tracks_in_parallel(): +def test_push_workflow_deploys_app_engine_production_candidate_before_staging_gate(): workflow = _push_workflow() - app_engine_production = _workflow_job_block(workflow, "deploy-production") + app_engine_candidate = _workflow_job_block(workflow, "deploy-production-candidate") + app_engine_promotion = _workflow_job_block(workflow, "promote-production") + docker_publish = _workflow_job_block(workflow, "docker") cloud_run_production = _workflow_job_block(workflow, 
"deploy-cloud-run-candidate") + assert "needs: deploy-staging" in app_engine_candidate + assert 'APP_ENGINE_PROMOTE: "0"' in app_engine_candidate assert ( - "needs: ensure-production-model-version-aligns-with-sim-api" - in app_engine_production + "bash .github/scripts/promote_app_engine_version.sh" not in app_engine_candidate + ) + assert "- deploy-production-candidate" in app_engine_promotion + assert ( + "- ensure-production-model-version-aligns-with-sim-api" in app_engine_promotion + ) + assert "bash .github/scripts/promote_app_engine_version.sh" in app_engine_promotion + assert ( + "APP_ENGINE_VERSION: " + "${{ needs.deploy-production-candidate.outputs.version }}" + in app_engine_promotion ) assert ( "needs: ensure-production-model-version-aligns-with-sim-api" in cloud_run_production ) - assert "needs: deploy-production" not in cloud_run_production + assert "needs: promote-production" in docker_publish assert "stage3-prod-" in cloud_run_production assert "Build and push Cloud Run image" not in cloud_run_production From cf629b58eae483c69c76c861f86209034fdc9943 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Thu, 4 Jun 2026 23:40:19 +0200 Subject: [PATCH 22/22] Move long workflow commands into scripts --- .github/scripts/check_changelog_fragment.sh | 11 +++ .github/scripts/sync_cloud_run_secrets.sh | 52 ++++++++++++++ .github/workflows/pr.yml | 9 +-- .github/workflows/sync-cloud-run-secrets.yml | 50 +------------ tests/unit/test_cloud_run_deploy_scripts.py | 76 ++++++++++++++++++-- 5 files changed, 136 insertions(+), 62 deletions(-) create mode 100644 .github/scripts/check_changelog_fragment.sh create mode 100644 .github/scripts/sync_cloud_run_secrets.sh diff --git a/.github/scripts/check_changelog_fragment.sh b/.github/scripts/check_changelog_fragment.sh new file mode 100644 index 000000000..280e9eb0e --- /dev/null +++ b/.github/scripts/check_changelog_fragment.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +set -euo pipefail + +fragments="$(find changelog.d -type 
f ! -name '.gitkeep' | wc -l)" +if [[ "${fragments}" -eq 0 ]]; then + echo "::error::No changelog fragment found in changelog.d/" + echo "Add one with: echo 'Description.' > changelog.d/\$(git branch --show-current).<type>.md" + echo "Types: added, changed, fixed, removed, breaking" + exit 1 +fi diff --git a/.github/scripts/sync_cloud_run_secrets.sh b/.github/scripts/sync_cloud_run_secrets.sh new file mode 100644 index 000000000..75f3325e6 --- /dev/null +++ b/.github/scripts/sync_cloud_run_secrets.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash + +set -euo pipefail +set +x + +CLOUD_RUN_PROJECT="${CLOUD_RUN_PROJECT:-policyengine-api}" + +require_env() { + local env_name="$1" + if [[ -z "${!env_name:-}" ]]; then + echo "::error::Missing required workflow environment ${env_name}." + exit 1 + fi +} + +sync_secret() { + local env_name="$1" + local secret_name="$2" + local secret_value="${!env_name:-}" + + if [[ -z "${secret_value}" ]]; then + echo "::error::Missing required GitHub secret ${env_name}." + exit 1 + fi + + if ! gcloud secrets describe "${secret_name}" \ + --project "${CLOUD_RUN_PROJECT}" >/dev/null 2>&1; then + gcloud secrets create "${secret_name}" \ + --project "${CLOUD_RUN_PROJECT}" \ + --replication-policy automatic + fi + + printf '%s' "${secret_value}" | gcloud secrets versions add \ + "${secret_name}" \ + --project "${CLOUD_RUN_PROJECT}" \ + --data-file=- >/dev/null + + gcloud secrets add-iam-policy-binding "${secret_name}" \ + --project "${CLOUD_RUN_PROJECT}" \ + --member "serviceAccount:${CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT}" \ + --role roles/secretmanager.secretAccessor >/dev/null + + echo "Synced ${env_name} to Secret Manager secret ${secret_name}." 
+} + +require_env CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT + +sync_secret POLICYENGINE_DB_PASSWORD policyengine-api-prod-db-password +sync_secret POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN policyengine-api-prod-github-microdata-token +sync_secret ANTHROPIC_API_KEY policyengine-api-prod-anthropic-api-key +sync_secret OPENAI_API_KEY policyengine-api-prod-openai-api-key +sync_secret HUGGING_FACE_TOKEN policyengine-api-prod-hugging-face-token diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 265c83363..32800b53e 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -36,14 +36,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Check for changelog fragment - run: | - FRAGMENTS=$(find changelog.d -type f ! -name '.gitkeep' | wc -l) - if [ "$FRAGMENTS" -eq 0 ]; then - echo "::error::No changelog fragment found in changelog.d/" - echo "Add one with: echo 'Description.' > changelog.d/\$(git branch --show-current).<type>.md" - echo "Types: added, changed, fixed, removed, breaking" - exit 1 - fi + run: bash .github/scripts/check_changelog_fragment.sh test_container_builds: name: Docker runs-on: ubuntu-latest diff --git a/.github/workflows/sync-cloud-run-secrets.yml b/.github/workflows/sync-cloud-run-secrets.yml index 2cc691182..9a6be0717 100644 --- a/.github/workflows/sync-cloud-run-secrets.yml +++ b/.github/workflows/sync-cloud-run-secrets.yml @@ -38,52 +38,4 @@ jobs: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} - run: | - set -euo pipefail - set +x - - require_env() { - local env_name="$1" - if [[ -z "${!env_name:-}" ]]; then - echo "::error::Missing required workflow environment ${env_name}." - exit 1 - fi - } - - sync_secret() { - local env_name="$1" - local secret_name="$2" - local secret_value="${!env_name:-}" - - if [[ -z "${secret_value}" ]]; then - echo "::error::Missing required GitHub secret ${env_name}." - exit 1 - fi - - if ! 
gcloud secrets describe "${secret_name}" \ - --project "${CLOUD_RUN_PROJECT}" >/dev/null 2>&1; then - gcloud secrets create "${secret_name}" \ - --project "${CLOUD_RUN_PROJECT}" \ - --replication-policy automatic - fi - - printf '%s' "${secret_value}" | gcloud secrets versions add \ - "${secret_name}" \ - --project "${CLOUD_RUN_PROJECT}" \ - --data-file=- >/dev/null - - gcloud secrets add-iam-policy-binding "${secret_name}" \ - --project "${CLOUD_RUN_PROJECT}" \ - --member "serviceAccount:${CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT}" \ - --role roles/secretmanager.secretAccessor >/dev/null - - echo "Synced ${env_name} to Secret Manager secret ${secret_name}." - } - - require_env CLOUD_RUN_RUNTIME_SERVICE_ACCOUNT - - sync_secret POLICYENGINE_DB_PASSWORD policyengine-api-prod-db-password - sync_secret POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN policyengine-api-prod-github-microdata-token - sync_secret ANTHROPIC_API_KEY policyengine-api-prod-anthropic-api-key - sync_secret OPENAI_API_KEY policyengine-api-prod-openai-api-key - sync_secret HUGGING_FACE_TOKEN policyengine-api-prod-hugging-face-token + run: bash .github/scripts/sync_cloud_run_secrets.sh diff --git a/tests/unit/test_cloud_run_deploy_scripts.py b/tests/unit/test_cloud_run_deploy_scripts.py index c6c1c9e82..fcfeaa075 100644 --- a/tests/unit/test_cloud_run_deploy_scripts.py +++ b/tests/unit/test_cloud_run_deploy_scripts.py @@ -77,6 +77,12 @@ def _sync_secrets_workflow() -> str: ) +def _sync_secrets_script() -> str: + return (REPO / ".github/scripts/sync_cloud_run_secrets.sh").read_text( + encoding="utf-8" + ) + + def _workflow_job_block(workflow: str, job_name: str) -> str: match = re.search( rf"^ {re.escape(job_name)}:\n(?P<body>.*?)(?=^ [a-zA-Z0-9_-]+:|\Z)", @@ -87,6 +93,27 @@ def _workflow_job_block(workflow: str, job_name: str) -> str: return match.group("body") + + +def _multiline_run_block_lengths(workflow_path: Path) -> list[tuple[int, int]]: + lines = workflow_path.read_text(encoding="utf-8").splitlines() + blocks: 
list[tuple[int, int]] = [] + + for line_index, line in enumerate(lines): + match = re.match(r"^(\s*)run: \|", line) + if match is None: + continue + + indent = len(match.group(1)) + body_lines = 0 + for body_line in lines[line_index + 1 :]: + if body_line.strip() and len(body_line) - len(body_line.lstrip()) <= indent: + break + if body_line.strip(): + body_lines += 1 + blocks.append((line_index + 1, body_lines)) + + return blocks + + def test_cloud_run_startup_uses_asgi_entrypoint(): start_script = (REPO / "gcp/cloud_run/start.sh").read_text(encoding="utf-8") @@ -358,15 +385,54 @@ def test_sync_cloud_run_secrets_workflow_is_manual_and_environment_gated(): def test_sync_cloud_run_secrets_workflow_writes_expected_secret_versions(): workflow = _sync_secrets_workflow() + script = _sync_secrets_script() - assert "set +x" in workflow - assert "--data-file=-" in workflow - assert "gcloud secrets add-iam-policy-binding" in workflow - assert "roles/secretmanager.secretAccessor" in workflow + assert "run: bash .github/scripts/sync_cloud_run_secrets.sh" in workflow + assert "set +x" in script + assert "--data-file=-" in script + assert "gcloud secrets add-iam-policy-binding" in script + assert "roles/secretmanager.secretAccessor" in script for env_name, secret_ref in CLOUD_RUN_SECRET_MAPPINGS.items(): secret_name = secret_ref.removesuffix(":latest") assert f"{env_name}: ${{{{ secrets.{env_name} }}}}" in workflow - assert f"sync_secret {env_name} {secret_name}" in workflow + assert f"sync_secret {env_name} {secret_name}" in script + + +def test_sync_cloud_run_secrets_script_is_shell_syntax_valid(): + result = subprocess.run( + ["bash", "-n", ".github/scripts/sync_cloud_run_secrets.sh"], + cwd=REPO, + text=True, + capture_output=True, + check=False, + ) + + assert result.returncode == 0, result.stderr + + +def test_changelog_fragment_script_is_shell_syntax_valid(): + result = subprocess.run( + ["bash", "-n", ".github/scripts/check_changelog_fragment.sh"], + cwd=REPO, + 
text=True, + capture_output=True, + check=False, + ) + + assert result.returncode == 0, result.stderr + + +def test_workflows_do_not_inline_long_run_blocks(): + oversized_blocks = [] + for workflow_path in (REPO / ".github/workflows").glob("*.y*ml"): + for line_number, body_lines in _multiline_run_block_lengths(workflow_path): + if body_lines > 4: + oversized_blocks.append( + f"{workflow_path.relative_to(REPO)}:{line_number} has " + f"{body_lines} inline run lines" + ) + + assert oversized_blocks == [] def test_push_workflow_promotes_production_cloud_run_after_candidate_smoke():