From e917094b7d605fa32d6dc3f955ed539f89f2dde5 Mon Sep 17 00:00:00 2001 From: xnoto Date: Wed, 29 Apr 2026 23:40:32 -0600 Subject: [PATCH 1/4] fix(grafana): wire orphan status-onion-service.yaml into kustomization The file existed in the workloads/grafana directory but wasn't referenced in resources, so the OnionService CR for the public status dashboard was never applied. Adding it brings the onion address listed on onion.makeitwork.cloud (7m3fv4pd5m...onion) live again. --- workloads/grafana/kustomization.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/workloads/grafana/kustomization.yaml b/workloads/grafana/kustomization.yaml index 7ef06be..ab6b138 100644 --- a/workloads/grafana/kustomization.yaml +++ b/workloads/grafana/kustomization.yaml @@ -9,6 +9,7 @@ resources: - status-datasource.yaml - status-dashboard.yaml - onion-service.yaml + - status-onion-service.yaml - tunnel-binding.yaml generators: - ksops-grafana-secrets.yaml From 47d6acd020f40f8e70e673cfdef4f33da4323108 Mon Sep 17 00:00:00 2001 From: xnoto Date: Thu, 30 Apr 2026 08:47:49 -0600 Subject: [PATCH 2/4] docs: refactor README and AGENTS to reflect k3s cluster Also install kubectl in the CD sync job so the bootstrap Applications can actually be patched. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci.yml | 3 + AGENTS.md | 301 +++++++++++---------------------------- README.md | 95 ++++++------ 3 files changed, 132 insertions(+), 267 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b8382a7..a66cc91 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,6 +40,9 @@ jobs: if: github.event_name == 'push' && github.ref == 'refs/heads/main' steps: + - name: Install kubectl + uses: azure/setup-kubectl@v4 + - name: Sync ArgoCD bootstrap Applications run: | for app in bootstrap-secrets gitops-operators gitops-workloads; do diff --git a/AGENTS.md b/AGENTS.md index 29cdf68..c3230d6 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,199 +1,120 @@ # Agent Context for kustomize-cluster -## Repository Overview - -GitOps repository for OpenShift CRC cluster. Uses ArgoCD with KSOPS for secret decryption. +GitOps manifests for the k3s cluster behind makeitwork.cloud. ArgoCD reconciles this repo using KSOPS for inline secret decryption. ## Sync Wave Architecture ``` -Wave 0: Bootstrap and cluster baseline configuration -Wave 1: Operator and CRD provider layer -Wave 2: Workload layer depending on installed operators -PostSync: Follow-up operational automation +Wave 0: ArgoCD config, OIDC RBAC, CI service account +Wave 1: bootstrap-secrets, gitops-operators +Wave 2: gitops-workloads +PostSync: ci-token-sync, wait-for-* jobs ``` -- Sync waves are per-Application, not global across all Applications +Sync waves order resources within a single ArgoCD Application — they are **not** global across Applications. The App-of-Apps structure plus `wait-for-*` post-sync jobs enforces cross-Application ordering. 
## Domain Architecture -| Domain | Access Method | TLS Handling | -|--------|---------------|--------------| -| `*.makeitwork.cloud` | Cloudflare Tunnel | TLS terminated at Cloudflare edge | -| `*.apps.makeitwork.cloud` | WARP only | Let's Encrypt cert in cluster | -| `api.makeitwork.cloud` | WARP only | Let's Encrypt cert in cluster | +| Domain | Access | TLS | +|---|---|---| +| `*.makeitwork.cloud` | Cloudflare Tunnel (`TunnelBinding`) | Cloudflare edge | +| `*.apps.makeitwork.cloud` | WARP-only | Let's Encrypt (DNS-01) | +| `api.makeitwork.cloud` | WARP-only | Let's Encrypt (DNS-01) | + +There is no in-cluster ingress controller. All `*.makeitwork.cloud` apps reach the cluster via a Cloudflare Tunnel managed by cloudflare-operator. ## Key Namespaces -- `openshift-config` - Cluster-level secrets (certs, OAuth configs) -- `openshift-ingress` - Router/IngressController resources -- `openshift-ingress-operator` - IngressController CR -- `cert-manager` - cert-manager controller pods -- `openshift-gitops` - ArgoCD and KSOPS -- `cloudflare-operator-system` - Cloudflare operator, tunnel deployment, DNS API secret +- `argocd` — ArgoCD, KSOPS plugin, `sops-age-keys` Secret +- `cert-manager` — cert-manager controllers + Cloudflare API token +- `cloudflare-operator-system` — cloudflare-operator, tunnel deployment, Cloudflare API secret +- `arc-system` — ARC controller (Actions Runner Controller) ## Certificate Management -Certificates are managed by cert-manager with Let's Encrypt via DNS-01 (Cloudflare). - -**Critical:** cert-manager needs external DNS servers for DNS-01 validation because cluster DNS cannot resolve external domains. This is configured via `CertManager` CR: +cert-manager issues Let's Encrypt certs via the Cloudflare DNS-01 solver. 
The cluster DNS cannot resolve external domains, so the controller is configured to use external recursive nameservers: ```yaml -spec: - controllerConfig: - overrideArgs: - - "--dns01-recursive-nameservers=1.1.1.1:53,8.8.8.8:53" - - "--dns01-recursive-nameservers-only" +extraArgs: + - --dns01-recursive-nameservers=1.1.1.1:53,8.8.8.8:53 + - --dns01-recursive-nameservers-only ``` -**Certificate locations:** -- `openshift-config/wildcard-apps-makeitwork-cloud-tls` - for componentRoutes (console, oauth) -- `openshift-config/api-makeitwork-cloud-tls` - for API server -- Cloudflare API token in `cert-manager/cloudflare-api-token` - -**OpenShift config resources:** -- `ingress.config.openshift.io/cluster` - componentRoutes for console/oauth certs -- `apiserver.config.openshift.io/cluster` - API server cert +The Cloudflare API token lives in `cert-manager/cloudflare-api-token` and is referenced from `ClusterIssuer` resources. -## Cloudflare Tunnel DNS Management +## Cloudflare Tunnel DNS -Public `*.makeitwork.cloud` app DNS records are operator-managed from `TunnelBinding` resources. +Public `*.makeitwork.cloud` DNS records are operator-managed from `TunnelBinding` resources. 
-- Keep `TunnelBinding.tunnelRef.disableDNSUpdates: false` for operator-managed DNS -- `subjects[].name` must match the real Kubernetes `Service` name in the same namespace -- cloudflare-operator stores ownership metadata in `_managed.` TXT records -- Do not delete CNAME records without deleting matching `_managed.` TXT records; stale TXT `DnsId` values cause reconcile failures (`81044`) -- The old `dns-adoption-job` hook is intentionally not used +- Keep `tunnelRef.disableDNSUpdates: false` so the operator owns CNAMEs +- `subjects[].name` must match the real Kubernetes `Service` name in the same namespace; if it doesn't exist, status reports `http_status:404` +- Ownership is tracked in `_managed.` TXT records — deleting a CNAME without removing its matching TXT record causes update-by-stale-id failures (`Record does not exist. (81044)`) -## SOPS/KSOPS Encryption +## SOPS / KSOPS -Secrets are encrypted with age using **selective field encryption**. Only actual secret values are encrypted; metadata, comments, and non-sensitive configuration remain readable. - -### Configuration - -The `.sops.yaml` file defines `encrypted_regex` to target only sensitive fields: +**Selective field encryption only.** `.sops.yaml` defines `encrypted_regex` so only sensitive values are encrypted; manifests stay diffable. 
```yaml encrypted_regex: '^(token|api-token|clientID|clientSecret|password|secret|github_token|CLOUDFLARE_API_TOKEN|credentials\.json|.*_SERVICE_KEY|GF_AUTH_GITHUB_CLIENT_SECRET|GF_SECURITY_ADMIN_PASSWORD|dex\.github\.clientID|dex\.github\.clientSecret)$' ``` -### File Structure Best Practices - -**DO:** -- Create separate Secret files for sensitive values -- Reference secrets from Applications/CRDs by name -- Keep non-secret manifests completely unencrypted -- Use comments in secret files to document purpose +### Conventions -**DON'T:** -- Encrypt entire Kubernetes manifests (configs, Namespaces, RBAC) -- Mix secrets with configuration in the same file -- Encrypt metadata fields (names, namespaces, labels, annotations) +- One Secret per file; reference by name from CRDs/Applications +- Never encrypt non-Secret manifests (Namespaces, RBAC, ConfigMaps) +- Never encrypt metadata (names, namespaces, labels, annotations) -### Example: Proper Secret Structure +### Example ```yaml -# GitHub OAuth for ArgoCD - encrypted with sops apiVersion: v1 kind: Secret metadata: name: argocd-github-oauth - namespace: openshift-gitops - labels: - app.kubernetes.io/part-of: argocd - annotations: - argocd.argoproj.io/sync-wave: "0" + namespace: argocd type: Opaque stringData: - # Only these values are encrypted dex.github.clientID: Ov23liV3VghvjBnQjsWQ - dex.github.clientSecret: ae75f6c64ba9833bf7323c205f7b5ea368390788 + dex.github.clientSecret: ``` ### Commands ```bash -# Encrypt a file (applies encrypted_regex from .sops.yaml) -sops -e -i secret.yaml - -# Decrypt for viewing (stdout only, doesn't modify file) -sops -d secret.yaml - -# Edit an encrypted file (decrypts in editor, re-encrypts on save) -sops secret.yaml - -# Check if encryption worked correctly -sops -d secret.yaml | grep -E "(apiVersion|kind|metadata|name|namespace)" +sops -e -i secret.yaml # encrypt in place +sops -d secret.yaml # decrypt to stdout +sops secret.yaml # edit decrypted, re-encrypt on save ``` -### Adding 
New Secrets - -1. Create a plain YAML Secret file with the sensitive values -2. Run `sops -e -i your-secret.yaml` -3. Verify only the secret values are encrypted (metadata should be readable) -4. Add the file to the appropriate `ksops-*.yaml` generator -5. Never commit unencrypted secret files - -### KSOPS Integration - -Each directory with secrets has a KSOPS generator file that lists encrypted files: +### KSOPS generators -```yaml -# ksops-example-secrets.yaml -apiVersion: viaduct.ai/v1 -kind: ksops -metadata: - name: ksops-example-secrets - annotations: - config.kubernetes.io/function: | - exec: - path: ksops -files: - - github-oauth-secret.yaml - - api-token-secret.yaml -``` - -The kustomization.yaml separates resources (unencrypted) from generators (encrypted): +Each directory with secrets has a generator listing its encrypted files; the kustomization separates plain `resources` from KSOPS `generators`: ```yaml resources: - - deployment.yaml # Unencrypted manifest - - configmap.yaml # Unencrypted config + - deployment.yaml + - configmap.yaml generators: - - ksops-example-secrets.yaml # Decrypts secrets during kustomize build + - ksops-example-secrets.yaml ``` -### Migration from Full-File Encryption - -If you encounter files where everything is encrypted (apiVersion, kind, metadata): - -1. Decrypt the file: `sops -d old-file.yaml > decrypted.yaml` -2. Split into separate files: - - One for Secret resources (re-encrypt with `sops -e -i`) - - One for non-secret resources (keep unencrypted) -3. Update the kustomization.yaml to reference new file names -4. Delete the old over-encrypted files - -**Key:** `age152ek83tm4fj5u70r3fecytn4kg7c5xca24erjchxexx4pfqg6das7q763l` +The age private key is mounted into the ArgoCD repo-server as the `sops-age-keys` Secret in the `argocd` namespace. ## Tor Hidden Services -Managed by tor-controller operator with OnionService CRDs per workload. +Managed by tor-controller with `OnionService` CRDs per workload. 
-**Critical:** Tor keys must use `data` field (not `stringData`) with base64-encoded raw binary. The key file starts with `== ed25519v1-secret: type0 ==`. +- Tor keys must use `data` (not `stringData`) with base64-encoded raw binary; the file starts with `== ed25519v1-secret: type0 ==` +- Public `.onion` addresses are documented in `../www/onion.makeitwork.cloud/index.html` -Expected .onion addresses are documented in `../www/onion.makeitwork.cloud/index.html`. +## Resource Sizing -## Resource Management +Single-node cluster — default to **no container `resources` block**. -**Single-node CRC policy:** avoid container CPU/memory reservations by default. +- Explicit requests trigger `Insufficient cpu/memory` scheduling failures +- CPU limits cause throttling even with spare capacity +- Memory limits can cause avoidable OOM kills -- Prefer `resources: {}` or no `resources` block on app containers -- Avoid both `requests` and `limits` unless a workload has a proven stability need -- High requests on single-node CRC commonly trigger `Insufficient cpu/memory` scheduling failures -- CPU limits cause throttling; memory limits can cause avoidable OOM kills - -When adding new workloads, default to no container requests/limits: ```yaml containers: - name: app @@ -201,9 +122,7 @@ containers: resources: {} ``` -For operators installed via OLM (Subscription), tune through supported CR/Subscription fields where available (for example `spec.config.resources: {}` or operator-specific `*_resource_requirements: {}`). If the operator ignores these fields, accept operator defaults. - -For operators installed via kustomize remote refs, use JSON patches to remove the entire `resources` block: +For operators installed via Helm/Subscription, prefer values that disable resources (`resources: {}` or operator-specific fields). 
For operators installed via kustomize remote refs, strip `resources` with a JSON patch: ```yaml patches: @@ -215,110 +134,50 @@ patches: name: controller-manager ``` -If KubeLinter checks require explicit ignores for this cluster policy: +If kube-linter complains, annotate the Deployment: + ```yaml annotations: - ignore-check.kube-linter.io/unset-cpu-requirements: "No requests on single-node cluster" - ignore-check.kube-linter.io/unset-memory-requirements: "No limits on single-node cluster" + ignore-check.kube-linter.io/unset-cpu-requirements: "single-node policy" + ignore-check.kube-linter.io/unset-memory-requirements: "single-node policy" ``` -## Pre-commit Hooks - -This repository uses [pre-commit](https://pre-commit.com/) to enforce code quality and catch issues before they reach the repository. - -### Setup +## Pre-commit ```bash -# Install pre-commit hooks (run once after cloning) pre-commit install --hook-type commit-msg --hook-type pre-push - -# Verify hooks are installed -ls -la .git/hooks/pre-commit .git/hooks/pre-push -``` - -### Pre-commit Checks - -| Hook | Purpose | -|------|---------| -| `conventional-pre-commit` | Validates conventional commit message format | -| `check-yaml` | Validates YAML syntax | -| `detect-private-key` | Prevents accidental commit of private keys | -| `kube-linter` | Validates Kubernetes manifests | -| `trailing-whitespace` | Removes trailing whitespace | -| `end-of-file-fixer` | Ensures files end with newline | - -### Usage - -**Before committing:** -```bash -# Run all checks on changed files -pre-commit run - -# Run all checks on all files pre-commit run --all-files ``` -**If pre-commit fails:** -1. Fix the reported issues -2. Stage your changes (`git add`) -3. Run `pre-commit run` again to verify -4. Then commit - -**Bypass (emergencies only):** -```bash -git commit --no-verify # Skips pre-commit hooks -``` - -### Pre-push Protection - -The pre-push hook runs all checks before allowing `git push`. 
This prevents broken code from reaching the remote repository. +| Hook | Purpose | +|---|---| +| `conventional-pre-commit` | Conventional commit message format | +| `check-yaml` | YAML syntax | +| `detect-private-key` | Block private key commits | +| `kube-linter` | Kubernetes manifest sanity | +| `trailing-whitespace`, `end-of-file-fixer` | Formatting | ## Common Gotchas -1. **OpenShift operators reconcile routes** - Manual patches to routes get reverted. Use proper config resources (`ingress.config.openshift.io`, etc.) - -2. **componentRoutes vs IngressController default cert** - Different consumers: - - `IngressController.spec.defaultCertificate` - expects secret in `openshift-ingress` - - `Ingress.spec.componentRoutes` - expects secret in `openshift-config` - -3. **CertManager CR vs deployment patch** - The CertManager CR's `controllerConfig.overrideArgs` should apply to deployment, but verify with: - ```bash - kubectl get deploy cert-manager -n cert-manager -o jsonpath='{.spec.template.spec.containers[0].args}' - ``` - -4. **Tor secret format** - Using `stringData` with base64 content causes double-encoding. Use `data` field directly. - -5. **ArgoCD sync waves** - Waves only order resources within a single Application. Cross-Application ordering requires hooks or separate sync operations. - -6. **OAuth Replace=true causes sync failures** - The `argocd.argoproj.io/sync-options: Replace=true` annotation causes ArgoCD to delete+create resources. OpenShift protects singleton resources like `oauths.config.openshift.io/cluster` from deletion. Use `ServerSideApply=true` instead for these resources. - -7. **Cloudflare stale TXT records break DNS reconciliation** - If `_managed.` TXT records point to deleted CNAME IDs, cloudflare-operator attempts update-by-stale-ID and fails with `Record does not exist. (81044)`. Remove stale `_managed.*` TXT records, then reconcile TunnelBindings. - -8. 
**TunnelBinding subject name is service lookup key** - `subjects[].name` is used to read the Kubernetes Service object. If this name does not exist, operator status falls back to `http_status:404`. +1. **ArgoCD waves are per-Application** — Cross-Application ordering needs hooks or separate sync operations. +2. **TunnelBinding `subjects[].name` is a Service lookup key** — A typo here surfaces as `http_status:404` in operator status, not a missing-Service error. +3. **Cloudflare stale TXT records break reconciliation** — Remove orphan `_managed.` TXT records before recreating CNAMEs. +4. **Tor secret format** — Use `data` with raw binary base64; `stringData` double-encodes. +5. **KSOPS needs the age key in the repo-server pod** — Without `sops-age-keys` mounted, manifest generation fails before any sync. +6. **DNS-01 requires external resolvers** — cluster DNS cannot validate Let's Encrypt challenges; the cert-manager controller args above are required. ## Useful Commands ```bash -# Check cert status -kubectl get certificate -A - -# Check challenges (DNS-01 validation) -kubectl get challenges -A - -# Verify cert on endpoint -openssl s_client -connect host:port -servername host 2>/dev/null | openssl x509 -noout -subject -issuer - -# Decrypt SOPS secret -sops -d path/to/secret.yaml - -# Force ArgoCD sync -argocd app sync - -# Check ArgoCD app status -argocd app get +kubectl get certificate -A # cert status +kubectl get challenges -A # DNS-01 validation +sops -d path/to/secret.yaml # inspect a secret +argocd app sync <app-name> # force sync +argocd app get <app-name> # app status ``` ## Related Repositories -- `makeitworkcloud/www` - Static site with .onion address documentation -- `makeitworkcloud/ansible-role-crc` - CRC cluster provisioning +- `makeitworkcloud/ansible-site-cluster` — k3s cluster provisioning +- `makeitworkcloud/www` — static site, source of `.onion` documentation +- `makeitworkcloud/shared-workflows` — reusable GitHub Actions workflows diff --git a/README.md b/README.md 
index e5514ea..941f600 100644 --- a/README.md +++ b/README.md @@ -1,72 +1,75 @@ # kustomize-cluster -Kustomize configurations for OpenShift cluster workloads. Uses ArgoCD sync waves and KSOPS for secret decryption. +GitOps manifests for the k3s cluster behind [makeitwork.cloud](https://makeitwork.cloud/). ArgoCD reconciles this repo using KSOPS for inline secret decryption. + +## Layout + +``` +bootstrap/ ArgoCD configuration, OIDC RBAC, CI service account, App-of-Apps roots +operators/ Cluster operators that install CRDs (cert-manager, cloudflare, tor, ARC, …) +workloads/ Workload Applications that depend on operator CRDs +``` + +The root `kustomization.yaml` is for local `kustomize build` testing only. ArgoCD drives production sync from the per-Application sources defined in `bootstrap/`. ## Sync Wave Flow ``` -Wave 0: Bootstrap and cluster baseline configuration -Wave 1: Operator layer and CRD providers -Wave 2: Workload layer that depends on installed operators -PostSync: Operational follow-up automation +Wave 0: ArgoCD configuration, RBAC, CI service account +Wave 1: bootstrap-secrets and gitops-operators Applications +Wave 2: gitops-workloads Application +PostSync: ci-token-sync, wait-for-* jobs ``` -Waves are evaluated per ArgoCD Application. They provide ordering intent but do not create global ordering across all Applications. +Sync waves order resources within a single Application — they are not global across Applications. Cross-Application ordering is enforced by the App-of-Apps structure and `wait-for-*` post-sync jobs. 
-## Features +## External Traffic -- **GitHub SSO**: OpenShift, ArgoCD, AWX, and Grafana all authenticate via GitHub OAuth -- **Cloudflare Tunnels**: External apps via cloudflare-operator with TunnelBindings per app -- **Tor Hidden Services**: Centralized tor-controller with OnionService CRDs per workload -- **Let's Encrypt Certs**: Wildcard `*.apps.makeitwork.cloud` via cert-manager DNS-01 (Cloudflare) -- **Public Status Page**: `status.makeitwork.cloud` served by dedicated anonymous Grafana instance with blackbox probe metrics -- **Pull-Through Cache**: Docker registry mirror for ARC runners to reduce rate limits -- **App-of-Apps**: Each workload is a separate ArgoCD Application for independent sync +| Domain | Path | TLS | +|---|---|---| +| `*.makeitwork.cloud` | Cloudflare Tunnel via cloudflare-operator `TunnelBinding` | Cloudflare edge | +| `*.apps.makeitwork.cloud` | WARP-only | Let's Encrypt in cluster | +| `api.makeitwork.cloud` | WARP-only | Let's Encrypt in cluster | -## Requirements +There is no in-cluster ingress controller. All public traffic flows through a Cloudflare Tunnel; in-cluster TLS is issued by cert-manager using the Cloudflare DNS-01 solver. -- OpenShift GitOps operator -- OpenShift cert-manager operator -- CRC with monitoring enabled (`crc config set enable-cluster-monitoring true`) -- `sops-age-keys` secret in `openshift-gitops` namespace (for SOPS decryption) +### TunnelBinding DNS -## Cloudflare DNS Ownership +Public DNS under `*.makeitwork.cloud` is owned by cloudflare-operator from `TunnelBinding` resources in this repo. -Public app DNS under `*.makeitwork.cloud` is managed by cloudflare-operator from `TunnelBinding` resources in this repo. 
+- Keep `tunnelRef.disableDNSUpdates: false` so the operator manages CNAMEs +- `subjects[].name` must match the real `Service` name in the same namespace +- The operator stores ownership in `_managed.` TXT records; deleting a CNAME without removing its matching TXT record yields Cloudflare error `81044` -- Keep `TunnelBinding.tunnelRef.disableDNSUpdates: false` for operator-managed DNS -- Set `subjects[].name` to the real Service name in the same namespace -- The operator writes `_managed.` TXT records alongside CNAMEs for ownership tracking -- Do not delete CNAME records without deleting matching `_managed.` TXT records (stale TXT `DnsId` values cause reconcile error `81044`) -- `operators/cloudflare/dns-adoption-job.yaml` is legacy and is intentionally not referenced from `operators/cloudflare/kustomization.yaml` +## Authentication -## CI/CD +GitHub OAuth provides SSO for ArgoCD, Grafana, AWX, and kubectl/Headlamp (via OIDC). Cluster-admin RBAC for the maintainer GitHub team is defined in `bootstrap/oidc-rbac.yaml`. CI uses a dedicated `ci-deployer` ServiceAccount whose token is synced to GitHub Actions secrets by a PostSync job. -On push to `main`, GitHub Actions: -1. Runs pre-commit tests (YAML lint, etc.) -2. Connects to cluster via Cloudflare WARP -3. Triggers ArgoCD sync via OpenShift API +## SOPS / KSOPS -The `ci-deployer` service account provides cluster-admin access for CI/CD workflows. Its token is automatically synced to GitHub Actions secrets (`OPENSHIFT_TOKEN`) via a PostSync job after each ArgoCD sync. +Secrets are age-encrypted with field-level selective encryption. The `.sops.yaml` `encrypted_regex` targets only sensitive values (tokens, passwords, OAuth client secrets) so metadata stays diffable. -## Resource Management +```bash +sops -e -i secret.yaml # encrypt in place +sops -d secret.yaml # decrypt to stdout +sops secret.yaml # decrypt → editor → re-encrypt on save +``` -This is a single-node CRC cluster. 
Prefer **no container requests/limits** unless there is a proven stability need: +The age public key is committed in `.sops.yaml`. The matching private key is loaded into the cluster as the `sops-age-keys` Secret in the `argocd` namespace and consumed by the KSOPS plugin during ArgoCD manifest generation. -- High requests commonly trigger `Insufficient cpu/memory` and block scheduling -- CPU limits cause throttling even with spare capacity -- Memory limits can cause avoidable OOM kills +## CI/CD -See `AGENTS.md` for detailed guidance on resource configuration. +`.github/workflows/ci.yml`: -## SOPS Encryption +1. **test** (`ubuntu-latest`) — runs pre-commit (yamllint, kube-linter, conventional-commit, etc.) +2. **sync** (`arc` runner, `main` only) — `kubectl patch` each App-of-Apps root (`bootstrap-secrets`, `gitops-operators`, `gitops-workloads`) to trigger an ArgoCD sync at the new SHA -Secrets are encrypted with age. Each directory with secrets has a KSOPS generator: +The in-cluster ARC runner uses its ServiceAccount token to talk to the API directly. -```bash -# Encrypt a secret -sops -e --age age152ek83tm4fj5u70r3fecytn4kg7c5xca24erjchxexx4pfqg6das7q763l secret.yaml +## Resource Sizing -# Decrypt for viewing -sops -d secret.yaml -``` +This is a single-node cluster. Default to **no `resources` block** on app containers — explicit requests trigger `Insufficient cpu/memory` scheduling failures, and limits cause CPU throttling or avoidable OOM kills even when the node has spare capacity. See `AGENTS.md` for guidance on operators installed via remote refs. + +## License + +GPLv3 From f78e355c5a37216d288def3e33ce13a4c7e4842e Mon Sep 17 00:00:00 2001 From: xnoto Date: Thu, 30 Apr 2026 08:52:27 -0600 Subject: [PATCH 3/4] ci: rename workflow to test-and-sync, target arc-tf runner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous job installed kubectl explicitly even though the tfroot-runner image already ships it (Containerfile). 
Drop the redundant setup step and target the actual scale set name (arc-tf) so the job dispatches to an in-cluster runner whose auto-mounted SA token authenticates kubectl directly — no Actions secrets needed. Renamed from ci.yml because this workflow lints and deploys; "ci" implies test-only. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/{ci.yml => test-and-sync.yml} | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) rename .github/workflows/{ci.yml => test-and-sync.yml} (66%) diff --git a/.github/workflows/ci.yml b/.github/workflows/test-and-sync.yml similarity index 66% rename from .github/workflows/ci.yml rename to .github/workflows/test-and-sync.yml index a66cc91..f7d97cd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/test-and-sync.yml @@ -1,5 +1,5 @@ --- -name: CI/CD +name: Test & Sync on: push: @@ -15,8 +15,6 @@ permissions: jobs: test: name: Pre-commit Tests - # ubuntu-latest while arc-dind runners are unavailable during the libvirt - # migration. Revert to `arc` once the new k3s cluster has ARC runners up. runs-on: ubuntu-latest steps: - name: Checkout repository @@ -32,17 +30,13 @@ jobs: sync: name: Sync ArgoCD - # In-cluster `arc` runner uses its SA token to talk to the API directly; - # this job stays on `arc` because it needs cluster access. It will not run - # until ARC dind runners are deployed by kustomize-cluster post-bootstrap. - runs-on: arc + # In-cluster runner; kubectl uses its auto-mounted SA token to talk to + # the API directly. The tfroot-runner image has kubectl preinstalled. 
+ runs-on: arc-tf needs: [test] if: github.event_name == 'push' && github.ref == 'refs/heads/main' steps: - - name: Install kubectl - uses: azure/setup-kubectl@v4 - - name: Sync ArgoCD bootstrap Applications run: | for app in bootstrap-secrets gitops-operators gitops-workloads; do From 5e11dc34ad9ec5bc65b2e764f9f4daa824acbcce Mon Sep 17 00:00:00 2001 From: xnoto Date: Thu, 30 Apr 2026 08:55:26 -0600 Subject: [PATCH 4/4] feat(arc): grant arc-tf runner SA permission to patch Applications Creates an arc-tf-runner ServiceAccount in arc-runners and binds a Role in argocd that allows get+patch on applications.argoproj.io. The arc-tf scale set template now mounts this SA, so the in-cluster sync workflow's `kubectl patch application` runs against the API using the auto-mounted token. Co-Authored-By: Claude Opus 4.7 (1M context) --- workloads/arc/arc-tf-application.yaml | 1 + workloads/arc/kustomization.yaml | 1 + workloads/arc/runner-rbac.yaml | 39 +++++++++++++++++++++++++++ 3 files changed, 41 insertions(+) create mode 100644 workloads/arc/runner-rbac.yaml diff --git a/workloads/arc/arc-tf-application.yaml b/workloads/arc/arc-tf-application.yaml index df23a5c..dfcfecd 100644 --- a/workloads/arc/arc-tf-application.yaml +++ b/workloads/arc/arc-tf-application.yaml @@ -31,6 +31,7 @@ spec: minRunners: 0 template: spec: + serviceAccountName: arc-tf-runner containers: - name: runner image: ghcr.io/makeitworkcloud/tfroot-runner:latest diff --git a/workloads/arc/kustomization.yaml b/workloads/arc/kustomization.yaml index 971a970..d92610e 100644 --- a/workloads/arc/kustomization.yaml +++ b/workloads/arc/kustomization.yaml @@ -3,6 +3,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - namespace.yaml + - runner-rbac.yaml - arc-tf-application.yaml generators: - ksops-arc-secrets.yaml diff --git a/workloads/arc/runner-rbac.yaml b/workloads/arc/runner-rbac.yaml new file mode 100644 index 0000000..80280b2 --- /dev/null +++ b/workloads/arc/runner-rbac.yaml 
@@ -0,0 +1,39 @@ +--- +# SA used by arc-tf runner pods. The runner's auto-mounted token authenticates +# kubectl in-cluster, so no GitHub Actions secret is required. +apiVersion: v1 +kind: ServiceAccount +metadata: + name: arc-tf-runner + namespace: arc-runners + annotations: + argocd.argoproj.io/sync-wave: "0" +--- +# Allow the runner to trigger ArgoCD syncs by patching Application CRs. +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: argocd-application-patcher + namespace: argocd + annotations: + argocd.argoproj.io/sync-wave: "0" +rules: + - apiGroups: ["argoproj.io"] + resources: ["applications"] + verbs: ["get", "patch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: arc-tf-runner-application-patcher + namespace: argocd + annotations: + argocd.argoproj.io/sync-wave: "0" +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: argocd-application-patcher +subjects: + - kind: ServiceAccount + name: arc-tf-runner + namespace: arc-runners