From 8a21af834529c8302185832caf7d09744509b25f Mon Sep 17 00:00:00 2001 From: xnoto Date: Fri, 24 Apr 2026 14:00:52 -0600 Subject: [PATCH 01/17] chore: add repo-local opencode config --- opencode.json | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 opencode.json diff --git a/opencode.json b/opencode.json new file mode 100644 index 0000000..01ecf42 --- /dev/null +++ b/opencode.json @@ -0,0 +1,36 @@ +{ + "$schema": "https://opencode.ai/config.json", + "mcp": { + "agent-hub": {"type": "local", "command": ["npx", "-y", "agent-hub-mcp@latest"], "enabled": true}, + "context-mode": {"type": "local", "command": ["context-mode"], "enabled": true}, + "context7": {"type": "remote", "url": "https://mcp.context7.com/mcp", "enabled": true}, + "github": {"type": "remote", "url": "https://api.githubcopilot.com/mcp/", "enabled": true, "headers": {"Authorization": "Bearer {env:GITHUB_TOKEN}"}}, + "opentofu-docs": {"type": "local", "command": ["npx", "-y", "@opentofu/opentofu-mcp-server"], "enabled": true}, + "opencode-docs": {"enabled": false}, + "aws-docs": {"enabled": false}, + "kubernetes": {"enabled": false}, + "tmux": {"enabled": false}, + "linear": {"enabled": false}, + "notion": {"enabled": false}, + "aws-api-staging": {"enabled": false}, + "aws-api-prod": {"enabled": false}, + "grafana": {"enabled": false}, + "terraform-docs": {"enabled": false}, + "argocd-staging-eks": {"enabled": false}, + "argocd-prod-eks": {"enabled": false} + }, + "tools": { + "opencode-docs_*": false, + "aws-docs_*": false, + "kubernetes_*": false, + "tmux_*": false, + "linear_*": false, + "notion_*": false, + "aws-api-staging_*": false, + "aws-api-prod_*": false, + "grafana_*": false, + "terraform-docs_*": false, + "argocd-staging-eks_*": false, + "argocd-prod-eks_*": false + } +} From c95b5fc282266c4359912dd42a511a80e069bd74 Mon Sep 17 00:00:00 2001 From: xnoto Date: Wed, 29 Apr 2026 14:09:45 -0600 Subject: [PATCH 02/17] ci: use ubuntu-latest + ghcr image 
while arc-dind unavailable The arc-dind runner pool and the OpenShift internal registry that hosted the tfroot-runner image both depended on the CRC cluster, which is offline during the libvirt-host migration. Swap to GitHub-hosted ubuntu-latest runners and pull the canonical image from ghcr.io. Revert when the new k3s cluster is up. --- .github/workflows/opentofu.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/opentofu.yml b/.github/workflows/opentofu.yml index 89e2683..bc2ea8b 100644 --- a/.github/workflows/opentofu.yml +++ b/.github/workflows/opentofu.yml @@ -16,8 +16,8 @@ jobs: opentofu: uses: makeitworkcloud/shared-workflows/.github/workflows/opentofu.yml@main with: - runs-on: arc-dind - container: image-registry.openshift-image-registry.svc:5000/public-registry/tfroot-runner:latest + runs-on: ubuntu-latest + container: ghcr.io/makeitworkcloud/tfroot-runner:latest setup-ssh: true secrets: SOPS_AGE_KEY: ${{ secrets.SOPS_AGE_KEY }} From 94db3694b9682eabdb2ace662731cb9ec0f9e2b3 Mon Sep 17 00:00:00 2001 From: xnoto Date: Wed, 29 Apr 2026 14:10:43 -0600 Subject: [PATCH 03/17] feat!: replace CRC/AAP provisioning with k3s + cloud-init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drops the entire Ansible Automation Platform integration (the aap provider, awx_* secrets, and runner module's enable_aap arguments) and ports its sole consumer — the configure_runner playbook — into the runner VM's cloud-init runcmd. The runner now self-installs the GitHub Actions binary and registers via the existing github_token secret (which is shared with tfroot-github). The PAT-bearing installer is written to /run/ so it does not survive reboot. 
Adds a new module "k3s" backed by a Fedora cloud image with cloud- init that: - relaxes SELinux to permissive - installs k3s (Traefik + ServiceLB disabled) - installs upstream Argo CD into ns argocd - applies a root Application pointing at kustomize-cluster's main /, which then self-manages the cluster Adds a dedicated libvirt_pool "cluster" backed by /mnt/nvme/cluster on hero's RAID-1 NVMe, keeping cluster volumes off the root LV. The host directory must be created once: ssh user@hero 'sudo mkdir -p /mnt/nvme/cluster' (hero has SELinux disabled, so no fcontext step). BREAKING CHANGE: tfroot-libvirt no longer requires the aap provider, the awx_controller / awx_username / awx_password / proxyhost sops keys, or the ansible-project-libvirt repo. Operators consuming this TF root must remove those references and provide a github_token sops key (matches the value in tfroot-github/secrets/secrets.yaml). Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 4 +- cloud-init/k3s/cloud_init.cfg | 78 ++++++++++++++++++++++++++++++++ cloud-init/runner/cloud_init.cfg | 55 +++++++++++++++++++++- main.tf | 77 ++++++++++++++++++++++++++----- providers.tf | 10 ---- secrets/secrets.yaml | 10 ++-- 6 files changed, 204 insertions(+), 30 deletions(-) create mode 100644 cloud-init/k3s/cloud_init.cfg diff --git a/README.md b/README.md index cc29c82..407f284 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@ | Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 1.3 | -| [aap](#requirement\_aap) | ~> 1.4.0 | | [libvirt](#requirement\_libvirt) | ~> 0.9.0 | | [sops](#requirement\_sops) | ~> 1.3.0 | @@ -12,18 +11,21 @@ | Name | Version | |------|---------| +| [libvirt](#provider\_libvirt) | ~> 0.9.0 | | [sops](#provider\_sops) | ~> 1.3.0 | ## Modules | Name | Source | Version | |------|--------|---------| +| [k3s](#module\_k3s) | git::https://github.com/makeitworkcloud/terraform-libvirt-domain.git | n/a | | [runner](#module\_runner) | 
git::https://github.com/makeitworkcloud/terraform-libvirt-domain.git | n/a | ## Resources | Name | Type | |------|------| +| [libvirt_pool.cluster](https://registry.terraform.io/providers/dmacvicar/libvirt/latest/docs/resources/pool) | resource | | [sops_file.secret_vars](https://registry.terraform.io/providers/carlpett/sops/latest/docs/data-sources/file) | data source | ## Inputs diff --git a/cloud-init/k3s/cloud_init.cfg b/cloud-init/k3s/cloud_init.cfg new file mode 100644 index 0000000..5b55e73 --- /dev/null +++ b/cloud-init/k3s/cloud_init.cfg @@ -0,0 +1,78 @@ +#cloud-config +# https://cloudinit.readthedocs.io/en/latest/topics/examples.html + +write_files: + # ArgoCD root Application — points at the kustomize-cluster repo root, which + # builds bootstrap + workloads. Once applied, ArgoCD self-manages from here. + - path: /etc/k3s-bootstrap/root-app.yaml + permissions: '0644' + content: | + apiVersion: argoproj.io/v1alpha1 + kind: Application + metadata: + name: cluster-root + namespace: argocd + finalizers: + - resources-finalizer.argocd.argoproj.io + spec: + project: default + source: + repoURL: ${cluster_repo_url} + targetRevision: ${cluster_repo_branch} + path: ${cluster_repo_path} + destination: + server: https://kubernetes.default.svc + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true + - ServerSideApply=true + +groups: + - default + - name: wheel + +users: + - default + - name: user + groups: [wheel] + sudo: ['ALL=(ALL) NOPASSWD:ALL'] + shell: /bin/bash + lock_passwd: true + ssh_authorized_keys: + - ${ssh_authorized_key} + +packages: + - curl + +fs_setup: + - device: /dev/vdb + filesystem: xfs + overwrite: false + +mounts: + - ["/dev/vdb", "/var/lib/rancher", "xfs", "defaults", "0", "0"] + +runcmd: + - sed -i 's/^SELINUX=.*/SELINUX=permissive/' /etc/selinux/config + - setenforce 0 + - mkdir -p /var/lib/rancher + - | + set -e + curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION='${k3s_version}' \ + sh -s - 
server --disable=traefik --disable=servicelb --write-kubeconfig-mode=0644 + - | + set -e + export KUBECONFIG=/etc/rancher/k3s/k3s.yaml + until kubectl get nodes 2>/dev/null | grep -q ' Ready '; do sleep 3; done + - | + set -e + export KUBECONFIG=/etc/rancher/k3s/k3s.yaml + kubectl create namespace argocd + kubectl apply -n argocd --server-side -f \ + "https://raw.githubusercontent.com/argoproj/argo-cd/${argocd_version}/manifests/install.yaml" + until kubectl -n argocd get crd applications.argoproj.io 2>/dev/null; do sleep 3; done + until kubectl -n argocd rollout status deploy/argocd-server --timeout=10s 2>/dev/null; do sleep 5; done + kubectl apply -f /etc/k3s-bootstrap/root-app.yaml diff --git a/cloud-init/runner/cloud_init.cfg b/cloud-init/runner/cloud_init.cfg index e6b835a..b5ddaca 100644 --- a/cloud-init/runner/cloud_init.cfg +++ b/cloud-init/runner/cloud_init.cfg @@ -34,12 +34,59 @@ write_files: # Prune build cache older than 7 days docker builder prune -f --filter "until=168h" + - path: /usr/local/bin/runner-work-cleanup.sh + permissions: '0755' + content: | + #!/bin/bash + find /opt/actions-runner/_work -mindepth 2 -maxdepth 2 -type d -mtime +1 \ + -exec rm -rf {} \; 2>/dev/null || true + + # PAT-bearing installer lives on tmpfs so it does not survive first reboot. 
+ - path: /run/install-gha-runner.sh + permissions: '0700' + content: | + #!/bin/bash + set -euo pipefail + + GITHUB_ORG='${github_org}' + GITHUB_TOKEN='${github_token}' + + RUNNER_VERSION=$(curl -sSL https://api.github.com/repos/actions/runner/releases/latest | jq -r .tag_name) + RUNNER_VER_NUM="$${RUNNER_VERSION#v}" + + cd /opt/actions-runner + curl -sSL -o runner.tar.gz \ + "https://github.com/actions/runner/releases/download/$RUNNER_VERSION/actions-runner-linux-x64-$RUNNER_VER_NUM.tar.gz" + tar xzf runner.tar.gz + chown -R user:user /opt/actions-runner + rm -f runner.tar.gz + + REG_TOKEN=$(curl -sSL -X POST \ + -H "Authorization: Bearer $GITHUB_TOKEN" \ + -H "Accept: application/vnd.github+json" \ + "https://api.github.com/orgs/$GITHUB_ORG/actions/runners/registration-token" \ + | jq -r .token) + RANDOM_ID=$(tr -dc 'a-z' > /var/log/docker-cleanup.log 2>&1" > /etc/cron.d/docker-cleanup + - echo "30 */6 * * * user /usr/local/bin/runner-work-cleanup.sh" > /etc/cron.d/runner-work-cleanup diff --git a/main.tf b/main.tf index 5dc194e..9f6ab24 100644 --- a/main.tf +++ b/main.tf @@ -3,8 +3,34 @@ data "sops_file" "secret_vars" { } locals { - # Use direct mirror that provides Content-Length header (required by libvirt provider) - boot_image_url = "https://dl.fedoraproject.org/pub/fedora/linux/releases/43/Cloud/x86_64/images/Fedora-Cloud-Base-Generic-43-1.6.x86_64.qcow2?v=3" + # Boot images + # Direct mirror that provides Content-Length header (required by libvirt provider) + fedora_image_url = "https://dl.fedoraproject.org/pub/fedora/linux/releases/43/Cloud/x86_64/images/Fedora-Cloud-Base-Generic-43-1.6.x86_64.qcow2?v=3" + + # GitHub + github_org = "makeitworkcloud" + + # ArgoCD bootstrap target (root of repo manages bootstrap + workloads) + cluster_repo_url = "https://github.com/makeitworkcloud/kustomize-cluster" + cluster_repo_branch = "main" + cluster_repo_path = "." 
+ + # k3s + k3s_ip = "192.168.102.2" + k3s_version = "v1.31.4+k3s1" # bump as needed; see https://github.com/k3s-io/k3s/releases + argocd_version = "v2.13.2" # bump as needed; see https://github.com/argoproj/argo-cd/releases +} + +# Dedicated libvirt pool on /mnt/nvme RAID-1 for cluster volumes (keeps cluster IO off the root LV). +# One-time host setup required before first apply (hero has SELinux disabled, so no fcontext step): +# ssh user@hero 'sudo mkdir -p /mnt/nvme/cluster' +resource "libvirt_pool" "cluster" { + name = "cluster" + type = "dir" + + target = { + path = "/mnt/nvme/cluster" + } } module "runner" { @@ -12,7 +38,7 @@ module "runner" { name = "runner" description = "GitHub Actions self-hosted runner" memory = 8192 - boot_image_url = local.boot_image_url + boot_image_url = local.fedora_image_url extra_volumes = [ { name = "runner-var-lib-docker.qcow2" @@ -23,14 +49,43 @@ module "runner" { size = 32212254720 # 30 GiB } ] - cloudinit_meta_data_template = "${path.module}/cloud-init/meta_data.cfg" - cloudinit_meta_data_vars = { hostname = "runner" } - cloudinit_user_data_template = "${path.module}/cloud-init/runner/cloud_init.cfg" - cloudinit_user_data_vars = { ssh_authorized_key = data.sops_file.secret_vars.data["ssh_admin_pubkey"] } + cloudinit_meta_data_template = "${path.module}/cloud-init/meta_data.cfg" + cloudinit_meta_data_vars = { hostname = "runner" } + cloudinit_user_data_template = "${path.module}/cloud-init/runner/cloud_init.cfg" + cloudinit_user_data_vars = { + ssh_authorized_key = data.sops_file.secret_vars.data["ssh_admin_pubkey"] + github_org = local.github_org + github_token = data.sops_file.secret_vars.data["github_token"] + } cloudinit_network_config_template = "${path.module}/cloud-init/network_config.cfg" cloudinit_network_config_vars = { private_ip_addr = data.sops_file.secret_vars.data["runner_ip_addr"] } - private_ip_addr = data.sops_file.secret_vars.data["runner_ip_addr"] - proxyhost = data.sops_file.secret_vars.data["proxyhost"] 
- enable_aap = true - aap_inventory_name = "libvirt" +} + +module "k3s" { + source = "git::https://github.com/makeitworkcloud/terraform-libvirt-domain.git" + name = "k3s" + description = "k3s single-node cluster" + vcpu = 6 + memory = 16384 + storage_pool = libvirt_pool.cluster.name + boot_image_url = local.fedora_image_url + extra_volumes = [ + { + name = "k3s-var-lib-rancher.qcow2" + size = 107374182400 # 100 GiB + } + ] + cloudinit_meta_data_template = "${path.module}/cloud-init/meta_data.cfg" + cloudinit_meta_data_vars = { hostname = "k3s" } + cloudinit_user_data_template = "${path.module}/cloud-init/k3s/cloud_init.cfg" + cloudinit_user_data_vars = { + ssh_authorized_key = data.sops_file.secret_vars.data["ssh_admin_pubkey"] + k3s_version = local.k3s_version + argocd_version = local.argocd_version + cluster_repo_url = local.cluster_repo_url + cluster_repo_branch = local.cluster_repo_branch + cluster_repo_path = local.cluster_repo_path + } + cloudinit_network_config_template = "${path.module}/cloud-init/network_config.cfg" + cloudinit_network_config_vars = { private_ip_addr = local.k3s_ip } } diff --git a/providers.tf b/providers.tf index 4180b0a..8b523fc 100644 --- a/providers.tf +++ b/providers.tf @@ -8,10 +8,6 @@ terraform { source = "dmacvicar/libvirt" version = "~> 0.9.0" } - aap = { - source = "registry.terraform.io/ansible/aap" - version = "~> 1.4.0" - } sops = { source = "carlpett/sops" version = "~> 1.3.0" @@ -23,10 +19,4 @@ provider "libvirt" { uri = data.sops_file.secret_vars.data["libvirt_uri"] } -provider "aap" { - host = data.sops_file.secret_vars.data["awx_controller"] - username = data.sops_file.secret_vars.data["awx_username"] - password = data.sops_file.secret_vars.data["awx_password"] -} - provider "sops" {} diff --git a/secrets/secrets.yaml b/secrets/secrets.yaml index 5aca038..3c0602d 100644 --- a/secrets/secrets.yaml +++ b/secrets/secrets.yaml @@ -1,3 +1,4 @@ +github_token: 
ENC[AES256_GCM,data:c5PI4Pedry9iSR3CE2dfKdWsJKA83meI87QCt9CIgdxFKkycNi1Uvg==,iv:ZqFsa9Yt2aIvXaTxGdtTeD6u5FO0zU9hsKt/vWJCF3s=,tag:AFQB/U0j+++q5lCdcMGhyQ==,type:str] proxyhost: ENC[AES256_GCM,data:2JMu3PbepHozJfoe4hs7chT24Uvxke2YDVU=,iv:kIJJXLZhVz8Kpz0/h64CWLi8Xx2YKpiPe4xwYQAS7r8=,tag:8d7mO7O5483Crh8QZv/oNg==,type:str] torwww_ip_addr: ENC[AES256_GCM,data:ZYEWHdA9A+MJ1CS+zqI=,iv:LrSWaVzjA23vGoG0CDCaL1yw7JQTc3Nd9CWH8zMrwGM=,tag:gldEwwz2FwTdArgfi28b4A==,type:str] runner_ip_addr: ENC[AES256_GCM,data:1X765pvHZcPXd4VC/Mg=,iv:U1LsCbzDb88nIYxbeQ1mxbXDZeSKqjHy8+MgRsOdLA0=,tag:uMtczv3JM9X6uwLsw+pGnw==,type:str] @@ -8,9 +9,6 @@ s3_key: ENC[AES256_GCM,data:HnbBP1/Yl+7x4QWytcaVMck=,iv:/kPXIKVHp6/0IJAPyz6LX73d s3_region: ENC[AES256_GCM,data:7Y+mGUUXo9Jd,iv:GohshS4fKqvziEcdPqGFsyWDQjLHf9O7PVs1Z0vXORw=,tag:PSTQ5DuQILDglOAj+OJnBQ==,type:str] s3_access_key: ENC[AES256_GCM,data:dvhYpsqgxm2hpNjuNA5SDPR4leU=,iv:ejPmt4Kqk9pRuAFccxvH9XMc1yzvA9Xs8RQ/S465bRM=,tag:oS660L4tVyqjIgHUt+B4tw==,type:str] s3_secret_key: ENC[AES256_GCM,data:9kgVRgWrIusg4F7NqIfAX5GxR1ya7sY/mCdOaoV6+v+SYJxDxpR2zg==,iv:/qUB565Eh31SOtF1YvyLdxxFftHqRDsFdRTUoyq2w08=,tag:IDS3RQlgVsg1LU8OxGRXSg==,type:str] -awx_controller: ENC[AES256_GCM,data:zyrk6j48ssU16V1jxUUHrQrJzd/lBp6ZsLW06a2AvqI=,iv:DJ9SG7CaX39yIwHvtL3l7j1xP+jDz9RQfR8t0KE9nO8=,tag:aYetqf1tdh7WlE4Q3Wm30A==,type:str] -awx_username: ENC[AES256_GCM,data:6sfFzVI=,iv:HG6k+IENkHdxD/vRKoip5tvUhfjCHvDwZcN3s9H0yFg=,tag:hNfP5Er4F230wcfqVwHtuw==,type:str] -awx_password: ENC[AES256_GCM,data:4Vc5Up89WY9HSfmW6Q4VRBbs6YPyqLjt1rO1wlW+ANg=,iv:Ub9PGKrrTDlqDmVuBBZfGUXwHHF2rjE3A4ISxLfJPb4=,tag:WXyxjUKUy4neENSp4Qte1Q==,type:str] sops: age: - recipient: age152ek83tm4fj5u70r3fecytn4kg7c5xca24erjchxexx4pfqg6das7q763l @@ -22,7 +20,7 @@ sops: R05MZlJDY1JnVjBlb01Hdm10d3k3VXMKpYhy+H82z9yBAREn2O0cUQp+m9laXyAx 5Hn86bDGLP4LxsVKbQS/77Weg0HI26WsKkTwOR8DB72TFia1SzQNqQ== -----END AGE ENCRYPTED FILE----- - lastmodified: "2025-12-22T02:47:11Z" - mac: 
ENC[AES256_GCM,data:I6pnR7D7FRyakhRbewO1/pIFr/E4wtwekW6EXHIYhsmSf/+PuZzPYOrEWz8b4NvJNRA5llzv/mwiAzFxJMpvZ4OmA7ZW/0l+jJyLRX2WyotJ/vDRbdgRtR3uDE3YivsxtYLUpdsIab6bGzadwGqusIN+vIJlZwtrMwJh/HP4wiI=,iv:dhepV5lMHfyN4eZRL9jcJVkR4zNNfd2iCghq2i0dXyg=,tag:3DlqgxossCGXUonVVz6vHQ==,type:str] + lastmodified: "2026-04-29T17:56:19Z" + mac: ENC[AES256_GCM,data:7rymnZ45gTOu3PeR3FgZpcG9/8z6DyJKhZQ9NY7o+Sj0PLYeykV34am0llGRzMfc4HQgK6+N/suoHdUKSJiS1FLuCJWrEuK/JMfSlpyATkXDaxMzqfAdKDK4UjUcyd2VKFhks2KsOUSXl+pnzB9Y5U1pIypmqwzYY7fGvUF6muE=,iv:hBaQQ2ZlWVvIuwru4KtbjacD60hxMVotJLLUvN467AA=,tag:cak6Ia/F7BMViXbkuldYFA==,type:str] unencrypted_suffix: _unencrypted - version: 3.10.2 + version: 3.12.2 From 09e275d84fad2350fea66ad29f348f66b591427a Mon Sep 17 00:00:00 2001 From: xnoto Date: Wed, 29 Apr 2026 15:05:43 -0600 Subject: [PATCH 04/17] fix(k3s-cloud-init): handle multiline sops_age_key safely The age private key in sops is stored as a YAML | (literal block scalar), so data.sops_file...sops_age_key is a multiline string. Threading it through --from-literal=key.txt='${sops_age_key}' inside a cloud-init runcmd would inject literal newlines mid-YAML, breaking cloud-init parsing. Switch to: 1. write_files entry that materialises /run/age-key on tmpfs, with indent(6, sops_age_key) so YAML block-scalar indentation is preserved across all lines of the secret. 2. kubectl --from-file=key.txt=/run/age-key in runcmd. The key file lives only on tmpfs and is reaped on first reboot. --- cloud-init/k3s/cloud_init.cfg | 59 +++++++++++++++++------------------ 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/cloud-init/k3s/cloud_init.cfg b/cloud-init/k3s/cloud_init.cfg index 5b55e73..abfb066 100644 --- a/cloud-init/k3s/cloud_init.cfg +++ b/cloud-init/k3s/cloud_init.cfg @@ -1,34 +1,13 @@ #cloud-config # https://cloudinit.readthedocs.io/en/latest/topics/examples.html +# AGE private key for KSOPS, written to tmpfs so it does not survive reboot. 
+# Loaded into the argocd/sops-age-keys Secret in runcmd via --from-file. write_files: - # ArgoCD root Application — points at the kustomize-cluster repo root, which - # builds bootstrap + workloads. Once applied, ArgoCD self-manages from here. - - path: /etc/k3s-bootstrap/root-app.yaml - permissions: '0644' + - path: /run/age-key + permissions: '0600' content: | - apiVersion: argoproj.io/v1alpha1 - kind: Application - metadata: - name: cluster-root - namespace: argocd - finalizers: - - resources-finalizer.argocd.argoproj.io - spec: - project: default - source: - repoURL: ${cluster_repo_url} - targetRevision: ${cluster_repo_branch} - path: ${cluster_repo_path} - destination: - server: https://kubernetes.default.svc - syncPolicy: - automated: - prune: true - selfHeal: true - syncOptions: - - CreateNamespace=true - - ServerSideApply=true + ${indent(6, sops_age_key)} groups: - default @@ -67,12 +46,30 @@ runcmd: set -e export KUBECONFIG=/etc/rancher/k3s/k3s.yaml until kubectl get nodes 2>/dev/null | grep -q ' Ready '; do sleep 3; done + # KSOPS in argocd's repo-server expects /sops-age-keys/key.txt; create the + # namespace + Secret BEFORE the ArgoCD CR is reconciled or the repo-server + # CrashLoops on missing volume mount. - | set -e export KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl create namespace argocd - kubectl apply -n argocd --server-side -f \ - "https://raw.githubusercontent.com/argoproj/argo-cd/${argocd_version}/manifests/install.yaml" - until kubectl -n argocd get crd applications.argoproj.io 2>/dev/null; do sleep 3; done - until kubectl -n argocd rollout status deploy/argocd-server --timeout=10s 2>/dev/null; do sleep 5; done - kubectl apply -f /etc/k3s-bootstrap/root-app.yaml + kubectl -n argocd create secret generic sops-age-keys \ + --from-file=key.txt=/run/age-key + # Install argocd-operator (community) which provides the + # argoproj.io/v1beta1 ArgoCD CRD consumed by kustomize-cluster's + # bootstrap/argocd-config.yaml. 
+ - | + set -e + export KUBECONFIG=/etc/rancher/k3s/k3s.yaml + kubectl apply --server-side -k \ + 'https://github.com/argoproj-labs/argocd-operator//config/default?ref=${argocd_operator_version}' + until kubectl wait --for=condition=Established --timeout=10s crd/argocds.argoproj.io 2>/dev/null; do sleep 3; done + # Apply kustomize-cluster bootstrap path. This contains the ArgoCD CR + # (which the operator reconciles into a running argocd-server) plus the + # operators-app and workloads-app Applications. Once argocd-server starts, + # it picks up the Applications and self-manages from there. + - | + set -e + export KUBECONFIG=/etc/rancher/k3s/k3s.yaml + kubectl apply --server-side -k \ + '${cluster_repo_url}//${cluster_repo_path}?ref=${cluster_repo_branch}' From 1c44e85fa55fe34e455ae8c927cc3800c150edaa Mon Sep 17 00:00:00 2001 From: xnoto Date: Wed, 29 Apr 2026 15:51:40 -0600 Subject: [PATCH 05/17] =?UTF-8?q?feat:=20wire=20sops=5Fage=5Fkey=20+=20ren?= =?UTF-8?q?ame=20argocd=5Fversion=20=E2=86=92=20argocd=5Foperator=5Fversio?= =?UTF-8?q?n?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Threads sops_age_key from secrets into the k3s cloud-init template (paired with the multiline-safe write_files handling already in place), and renames the ArgoCD version local to argocd_operator_version to match the operator- based install (v0.14.0 of argoproj-labs/argocd-operator). Regenerates README.md with terraform-docs v0.22.0 (now matching the republished tfroot-runner image). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 8 ++++---- main.tf | 28 +++++++++++++++++----------- secrets/secrets.yaml | 5 +++-- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 407f284..79ecffb 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ## Requirements | Name | Version | -|------|---------| +| ---- | ------- | | [terraform](#requirement\_terraform) | >= 1.3 | | [libvirt](#requirement\_libvirt) | ~> 0.9.0 | | [sops](#requirement\_sops) | ~> 1.3.0 | @@ -10,21 +10,21 @@ ## Providers | Name | Version | -|------|---------| +| ---- | ------- | | [libvirt](#provider\_libvirt) | ~> 0.9.0 | | [sops](#provider\_sops) | ~> 1.3.0 | ## Modules | Name | Source | Version | -|------|--------|---------| +| ---- | ------ | ------- | | [k3s](#module\_k3s) | git::https://github.com/makeitworkcloud/terraform-libvirt-domain.git | n/a | | [runner](#module\_runner) | git::https://github.com/makeitworkcloud/terraform-libvirt-domain.git | n/a | ## Resources | Name | Type | -|------|------| +| ---- | ---- | | [libvirt_pool.cluster](https://registry.terraform.io/providers/dmacvicar/libvirt/latest/docs/resources/pool) | resource | | [sops_file.secret_vars](https://registry.terraform.io/providers/carlpett/sops/latest/docs/data-sources/file) | data source | diff --git a/main.tf b/main.tf index 9f6ab24..2cf16d9 100644 --- a/main.tf +++ b/main.tf @@ -10,15 +10,20 @@ locals { # GitHub github_org = "makeitworkcloud" - # ArgoCD bootstrap target (root of repo manages bootstrap + workloads) + # ArgoCD bootstrap target — kustomize-cluster's bootstrap/ kustomization + # contains the ArgoCD CR (consumed by argocd-operator) plus the operators-app + # and workloads-app Applications that drive the rest of the sync. cluster_repo_url = "https://github.com/makeitworkcloud/kustomize-cluster" cluster_repo_branch = "main" - cluster_repo_path = "." 
+ cluster_repo_path = "bootstrap" # k3s - k3s_ip = "192.168.102.2" - k3s_version = "v1.31.4+k3s1" # bump as needed; see https://github.com/k3s-io/k3s/releases - argocd_version = "v2.13.2" # bump as needed; see https://github.com/argoproj/argo-cd/releases + k3s_ip = "192.168.102.2" + k3s_version = "v1.31.4+k3s1" # bump as needed; see https://github.com/k3s-io/k3s/releases + + # argocd-operator (community) — provides the argoproj.io/v1beta1 ArgoCD CRD + # consumed by kustomize-cluster/bootstrap/argocd-config.yaml + argocd_operator_version = "v0.14.0" # bump as needed; see https://github.com/argoproj-labs/argocd-operator/releases } # Dedicated libvirt pool on /mnt/nvme RAID-1 for cluster volumes (keeps cluster IO off the root LV). @@ -79,12 +84,13 @@ module "k3s" { cloudinit_meta_data_vars = { hostname = "k3s" } cloudinit_user_data_template = "${path.module}/cloud-init/k3s/cloud_init.cfg" cloudinit_user_data_vars = { - ssh_authorized_key = data.sops_file.secret_vars.data["ssh_admin_pubkey"] - k3s_version = local.k3s_version - argocd_version = local.argocd_version - cluster_repo_url = local.cluster_repo_url - cluster_repo_branch = local.cluster_repo_branch - cluster_repo_path = local.cluster_repo_path + ssh_authorized_key = data.sops_file.secret_vars.data["ssh_admin_pubkey"] + sops_age_key = data.sops_file.secret_vars.data["sops_age_key"] + k3s_version = local.k3s_version + argocd_operator_version = local.argocd_operator_version + cluster_repo_url = local.cluster_repo_url + cluster_repo_branch = local.cluster_repo_branch + cluster_repo_path = local.cluster_repo_path } cloudinit_network_config_template = "${path.module}/cloud-init/network_config.cfg" cloudinit_network_config_vars = { private_ip_addr = local.k3s_ip } diff --git a/secrets/secrets.yaml b/secrets/secrets.yaml index 3c0602d..7780d93 100644 --- a/secrets/secrets.yaml +++ b/secrets/secrets.yaml @@ -9,6 +9,7 @@ s3_key: ENC[AES256_GCM,data:HnbBP1/Yl+7x4QWytcaVMck=,iv:/kPXIKVHp6/0IJAPyz6LX73d s3_region: 
ENC[AES256_GCM,data:7Y+mGUUXo9Jd,iv:GohshS4fKqvziEcdPqGFsyWDQjLHf9O7PVs1Z0vXORw=,tag:PSTQ5DuQILDglOAj+OJnBQ==,type:str] s3_access_key: ENC[AES256_GCM,data:dvhYpsqgxm2hpNjuNA5SDPR4leU=,iv:ejPmt4Kqk9pRuAFccxvH9XMc1yzvA9Xs8RQ/S465bRM=,tag:oS660L4tVyqjIgHUt+B4tw==,type:str] s3_secret_key: ENC[AES256_GCM,data:9kgVRgWrIusg4F7NqIfAX5GxR1ya7sY/mCdOaoV6+v+SYJxDxpR2zg==,iv:/qUB565Eh31SOtF1YvyLdxxFftHqRDsFdRTUoyq2w08=,tag:IDS3RQlgVsg1LU8OxGRXSg==,type:str] +sops_age_key: ENC[AES256_GCM,data:xwyvLD5uu4Umd1rF8dEoBi1DPZ5ts2xROd4MYVxiGbHxPsPEHcs8CiE6qebKyvbufLK74Mi2fzgKO9CQE0qIMSgRSvNzWv+zvw6i5Lp5AImn/St+pn3KIEOJHPN2t46Ure6Iy+ZE32PT/YoAPQki8tMY86q4ewvFdDH+pLTbh2tDLl2HmAR7Wxtgz58/3srw+7lteyvfHQmTYu/LBKJL5fN9ps/r0Jv4/UhDCeegjEQmX9iGFdd1kVdI3nME,iv:CJzikgkgA+dMoMso6wfzqK5gWV+c8U7Y2tb7OkMtBuM=,tag:T0bp4oImmKDQ791MZWlpwg==,type:str] sops: age: - recipient: age152ek83tm4fj5u70r3fecytn4kg7c5xca24erjchxexx4pfqg6das7q763l @@ -20,7 +21,7 @@ sops: R05MZlJDY1JnVjBlb01Hdm10d3k3VXMKpYhy+H82z9yBAREn2O0cUQp+m9laXyAx 5Hn86bDGLP4LxsVKbQS/77Weg0HI26WsKkTwOR8DB72TFia1SzQNqQ== -----END AGE ENCRYPTED FILE----- - lastmodified: "2026-04-29T17:56:19Z" - mac: ENC[AES256_GCM,data:7rymnZ45gTOu3PeR3FgZpcG9/8z6DyJKhZQ9NY7o+Sj0PLYeykV34am0llGRzMfc4HQgK6+N/suoHdUKSJiS1FLuCJWrEuK/JMfSlpyATkXDaxMzqfAdKDK4UjUcyd2VKFhks2KsOUSXl+pnzB9Y5U1pIypmqwzYY7fGvUF6muE=,iv:hBaQQ2ZlWVvIuwru4KtbjacD60hxMVotJLLUvN467AA=,tag:cak6Ia/F7BMViXbkuldYFA==,type:str] + lastmodified: "2026-04-29T21:03:21Z" + mac: ENC[AES256_GCM,data:+E0MzZUtODdA0rtFdXBEq2HHDYcpAe0FOuDJGJobHjz3yKFpG3Mh0h8W0BMKn2ddmQjDvCdQLujqUai2ETtkMhGx/LH5PzXjM+14ACd3UEBUAPT5qzwMXbaRlJCaUDsinVn6UJ9WV2578FNxxkhj0XuxStveX8IG0royFnHtezk=,iv:W2vuZQZUNHhEqlkANPbzSApB2gdgX71kxuXrGVY+w8g=,tag:wey29L7sDtWlbzPfy38++g==,type:str] unencrypted_suffix: _unencrypted version: 3.12.2 From 7c469f1767e26d3c94b5c3b984a2b804beacba87 Mon Sep 17 00:00:00 2001 From: xnoto Date: Wed, 29 Apr 2026 16:11:26 -0600 Subject: [PATCH 06/17] feat(libvirt): manage qemu+ssh credentials via sops, drop 
~/.ssh/id_rsa dep Generates a dedicated ed25519 keypair for the libvirt provider's qemu+ssh transport, encrypts the private half + hero's host pubkeys into secrets/secrets.yaml, and has the Makefile materialize both under .terraform/libvirt-ssh/ before tofu init. providers.tf builds the URI from the sops libvirt_uri base + the materialized keyfile/knownhosts paths. Local users no longer need ~/.ssh/id_rsa (incompatible with bitwarden-agent setups), and CI gets the same flow with no extra GHA secret. Host-key rotations on hero become a sops re-encrypt instead of a per-machine ssh-keygen -R + accept-new dance. Co-Authored-By: Claude Opus 4.7 (1M context) --- Makefile | 17 +++++++++++++++-- providers.tf | 11 ++++++++++- secrets/secrets.yaml | 8 +++++--- 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 2081c6d..882eed7 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ S3_KEY := $(shell sops decrypt secrets/secrets.yaml | grep ^s3_key S3_ACCESS_KEY := $(shell sops decrypt secrets/secrets.yaml | grep ^s3_access_key | cut -d ' ' -f 2) S3_SECRET_KEY := $(shell sops decrypt secrets/secrets.yaml | grep ^s3_secret_key | cut -d ' ' -f 2) -.PHONY: help init plan apply migrate test pre-commit-check-deps pre-commit-install-hooks clean +.PHONY: help init plan apply migrate test libvirt-ssh pre-commit-check-deps pre-commit-install-hooks clean help: @echo "General targets" @@ -36,7 +36,20 @@ clean: init: clean .terraform/terraform.tfstate -.terraform/terraform.tfstate: +# SSH key + known_hosts for the libvirt provider's qemu+ssh transport. Decrypted +# from sops at make-time so neither local users nor CI need a private key on disk. 
+libvirt-ssh: .terraform/libvirt-ssh/id_ed25519 .terraform/libvirt-ssh/known_hosts + +.terraform/libvirt-ssh/id_ed25519: secrets/secrets.yaml + @mkdir -p $(@D) + @umask 077; sops --decrypt --extract '["ops_ssh_privkey"]' secrets/secrets.yaml > $@ || { rm -f $@; exit 1; } + @chmod 0600 $@ + +.terraform/libvirt-ssh/known_hosts: secrets/secrets.yaml + @mkdir -p $(@D) + @sops --decrypt --extract '["hero_known_hosts"]' secrets/secrets.yaml > $@ || { rm -f $@; exit 1; } + +.terraform/terraform.tfstate: libvirt-ssh @${TERRAFORM} init -reconfigure -upgrade -input=false -backend-config="key=${S3_KEY}" -backend-config="bucket=${S3_BUCKET}" -backend-config="region=${S3_REGION}" -backend-config="access_key=${S3_ACCESS_KEY}" -backend-config="secret_key=${S3_SECRET_KEY}" plan: init .terraform/plan diff --git a/providers.tf b/providers.tf index 8b523fc..c8ee67a 100644 --- a/providers.tf +++ b/providers.tf @@ -15,8 +15,17 @@ terraform { } } +locals { + # SSH key + known_hosts are decrypted from sops by `make` and written under + # .terraform/libvirt-ssh/ before tofu init. Keeping them out of the user's + # ~/.ssh means local and CI both work without dev-key dependencies. 
+ libvirt_ssh_dir = "${path.module}/.terraform/libvirt-ssh" + libvirt_keyfile = "${local.libvirt_ssh_dir}/id_ed25519" + libvirt_known_hosts = "${local.libvirt_ssh_dir}/known_hosts" +} + provider "libvirt" { - uri = data.sops_file.secret_vars.data["libvirt_uri"] + uri = "${data.sops_file.secret_vars.data["libvirt_uri"]}?sshauth=privkey&keyfile=${local.libvirt_keyfile}&knownhosts=${local.libvirt_known_hosts}" } provider "sops" {} diff --git a/secrets/secrets.yaml b/secrets/secrets.yaml index 7780d93..027356f 100644 --- a/secrets/secrets.yaml +++ b/secrets/secrets.yaml @@ -3,13 +3,15 @@ proxyhost: ENC[AES256_GCM,data:2JMu3PbepHozJfoe4hs7chT24Uvxke2YDVU=,iv:kIJJXLZhV torwww_ip_addr: ENC[AES256_GCM,data:ZYEWHdA9A+MJ1CS+zqI=,iv:LrSWaVzjA23vGoG0CDCaL1yw7JQTc3Nd9CWH8zMrwGM=,tag:gldEwwz2FwTdArgfi28b4A==,type:str] runner_ip_addr: ENC[AES256_GCM,data:1X765pvHZcPXd4VC/Mg=,iv:U1LsCbzDb88nIYxbeQ1mxbXDZeSKqjHy8+MgRsOdLA0=,tag:uMtczv3JM9X6uwLsw+pGnw==,type:str] ssh_admin_pubkey: ENC[AES256_GCM,data:3YuHvteiPTTxVS6wVfAgjTIpx8saFar6rXCki/LhD9aC2zL5ATMzywoLRJWV/2f2RL5Vpj3/RFQwoTzV0T0ilmfyVB7lcp4L/bh7O81ryC2PTLnNcpII26V0BpxCmER5Tiz+cwS5KYZ9p6PGfnTe2b8pYIpoqrE0nkQo3wSkZ/N5KJ7MIp9xm5/JQ3jd864lU0fkjvlE8V/6W4g4hPq1ZKNmW27ALsPHdD7g9IIJS9We83ge6EA9/BG2WlWLSPa9uTBB7MeZbXT7qGwj/ovzQ9VUbB47RIPddCkx0xLKQoHDnGEx55U2/iGKJiWRxA+amVE2DA9MMj745nuC2yQIZx0QVFO3Eu+fNCEsLDUNPI0GaJCkm08CUu8vNG18rgHzc6vQsWASVnUT9Ct22pMaVdew5vc49mp0GgAmzxOPzOEvE647SRx4IA7iaKqMPqDHYPO29BofkD9ZK4z0s3UaYCBz+QzroO5SEYGfgmA23cRCmKF2nbtCsrF3ApU+KeyjCTBGeYL1B1ghzvbt3//i4yNkveHMeKvLrVtrG9RJzMMDQI9ksze/hP5kNVRJUZHlVlRBDujyULkOK6vxXHj0hn3ZwV3JTeI1WWL3hqbBX0j+utYF68G61qsjnfJdUz1FaysjUtVt0uUNym/wmDX4fpONXyuu7kaPyCuJ3e1/Q0DGYq9Q24/awkjhrsdHhsL1MGGyWe9HYDxS+5OfIf2U+3zwKjpnMPazX8heuABuHJ8P/c+gAqseJWohe/hOCGFS+tKd3kvQBiiCyCgjRBTCDh/TrVOJMKYNcILFxCBuyJQ1HM9UGQTp4QBEA7Sexqb5DoS3XZk209nG6tpIUPthiycIIFAhPnGngFqVmLdamRgJgqsXLZL5c8iItysjDtztrBoj3tA3nI22sv3kh1iUECvf8A4GrjAJqi82MBT3pJdtce7uqkJS0/SdgX1ZcfoXo6UahA==,iv:RSqrTaR2lwROxPQ0qmRNyYkgxCh/D2
DWWKJgza51cnw=,tag:q/+CgANHWE3cbrzbiGUdqw==,type:str] -libvirt_uri: ENC[AES256_GCM,data:Oqo5gZw9XDcWHVJFBOeibde8AUgjdPsgg2I9N972FuGsnWS1PQDwxzLoYNALIZU07009aAMj2u4cCgWagtnh/di7Sz8krMlw9UFCxC2vGxq65Q==,iv:WohxbZMwsHHwk+J9SHFYtdtAOvGpzhanrVLYBnFol24=,tag:vjoiK3qagoFNyhMeIxZWFg==,type:str] +libvirt_uri: ENC[AES256_GCM,data:DxvUPVIgyqI80A6EmTqHbnprRc8imsds82rL+km2FjXcvBkIYwiEJ58/+Xc=,iv:SVoAIgSGkFsQpMtANfERReaAwg0TzV2zOVoU7ojUHfY=,tag:2c95yYAZyORCQJlmcouapQ==,type:str] s3_bucket: ENC[AES256_GCM,data:URTul/yP5yDzPUgZ/Nfqnj+m5qsz,iv:MWb0FA2rAoHlqCt9bfFeVCMt/cVRt/d01a1gWr98TPQ=,tag:ICrjp8sPlJluS0+BZ2DUxA==,type:str] s3_key: ENC[AES256_GCM,data:HnbBP1/Yl+7x4QWytcaVMck=,iv:/kPXIKVHp6/0IJAPyz6LX73dX8ajf94dSyKaIkP84Yw=,tag:Yvfsk7lUYa8x90RCg+b0lw==,type:str] s3_region: ENC[AES256_GCM,data:7Y+mGUUXo9Jd,iv:GohshS4fKqvziEcdPqGFsyWDQjLHf9O7PVs1Z0vXORw=,tag:PSTQ5DuQILDglOAj+OJnBQ==,type:str] s3_access_key: ENC[AES256_GCM,data:dvhYpsqgxm2hpNjuNA5SDPR4leU=,iv:ejPmt4Kqk9pRuAFccxvH9XMc1yzvA9Xs8RQ/S465bRM=,tag:oS660L4tVyqjIgHUt+B4tw==,type:str] s3_secret_key: ENC[AES256_GCM,data:9kgVRgWrIusg4F7NqIfAX5GxR1ya7sY/mCdOaoV6+v+SYJxDxpR2zg==,iv:/qUB565Eh31SOtF1YvyLdxxFftHqRDsFdRTUoyq2w08=,tag:IDS3RQlgVsg1LU8OxGRXSg==,type:str] sops_age_key: ENC[AES256_GCM,data:xwyvLD5uu4Umd1rF8dEoBi1DPZ5ts2xROd4MYVxiGbHxPsPEHcs8CiE6qebKyvbufLK74Mi2fzgKO9CQE0qIMSgRSvNzWv+zvw6i5Lp5AImn/St+pn3KIEOJHPN2t46Ure6Iy+ZE32PT/YoAPQki8tMY86q4ewvFdDH+pLTbh2tDLl2HmAR7Wxtgz58/3srw+7lteyvfHQmTYu/LBKJL5fN9ps/r0Jv4/UhDCeegjEQmX9iGFdd1kVdI3nME,iv:CJzikgkgA+dMoMso6wfzqK5gWV+c8U7Y2tb7OkMtBuM=,tag:T0bp4oImmKDQ791MZWlpwg==,type:str] +ops_ssh_privkey: 
ENC[AES256_GCM,data:2Ig2T4oNJoqHfRjoFGDiTWWz6hiU4ZAuK71SGO5yz9BZwZ5XODiH0GRm2gRzasUc0IB9UtnEv1keG27dKKx8J0XPF3jCI9HxMaW91C04IQ0VZcgnyLJiixx9kyx4DWz3lI63fTQMrz9SnTGNQnYXWb75MRw3lFW33nlFb2xa1h0+8lrOFPduFS1+iodSrhETL0L4uA8SgkB3xz/Of85rXG8ZRnBzrscMKQz56MNqVEXJ1kVEZBgu7TlWovFCEWxrGEkIC5KSGIs57ZhGLzGYn4nopVzuNmaywgbKpWCPjQULBgQvuU6zUG3Ul9/p9nc5V7xCMUFeC0/TeeiYPVv2OfYSSLUNtz24J5u4oQ/D47AKWfRxN42eUA8zbMgJIYFY3czZ+PUseDXsBbF0r2J8C9AIrg8aV3uxLRZoCfPiFqKvZDJMDJaDx5t2cJtMdvtBSX9Vd9zObzR5P5kcb1cEhQfXWTgNsPuzhbRyLNCLdf59sp0C/pUiLdFpQDm72sLwgunNzlvi1yoidbIvX9QCdCqGQp8KOmxT1/djlpR17nX0DGEf4a6NAy+CMlcG8p9U,iv:fDQ8XQRUJDvoHvJzs+wZwMH6ePGx2Q1Wh6qmqgTTC80=,tag:9F2eO49JX0hgRnGxVNo7jg==,type:str] +hero_known_hosts: ENC[AES256_GCM,data:7/TRHASfMqpQ7JvigV42DFjBz+XJpyKlhyi8T9Ex3+G5uYL0DkWSPwbkhoHRZPKxugHbtG6jGd3PF86lfsXV0AdKCj3W4xXsUYaP3l9SluuMCIIPAlNbvrsB6u/4v38yqWnILxSS5Hj87Ju8PnENje5ArDmP8oXKZqosVdAuFCDFycmjwqjIgzq9yLQGP9koGmlWorAZ94/PBN8Y5oF9eBc8g1kij+aFN38kLFB6vpN858xzzbUG0N7iheCLyOwyXYauegqmPHu6uaEEDLJ0B15CyF7RcKqZUiIXZWKMYAJabHBNGPL6pyH4ybB8kdXIG9xN48MNe+E2lWJKYu6t3x7TTpnaZ1ulA5jYgWyYplUmSoOs9YOZoOJ9/Xgq91wAFrgywKmog8ph1Sjg5hZaovQsl4difKElFPHnkVC+LWK1nX0CwHfpo1FIzxJeiTcm1VQ9r9jet2jnMoQep0yqaCKxZYo5sXW2m+jG7ppXW/qxyaeWclPnOho18QKkJSpU3ffAH6mEHgGYFVjdrRV5/24TY+yU0s4JfHhJ2hdY95ZH+xbzUaiE8RF/GaS8VtrEgZZrIlQJoMZk+fGv2Y7k0em9OZX8k9jjmEqVpjwlH0WMjectLcvYGeZosuv1VXk+wEv2/Q4YYmt/ylxjO8xsDSTV4B/SsP2guzUcCbXV1TnEbumvqxOLJ2yOXgwvH574zC3Qh0ztiELj/7yEp18u8jJFqXt9BSVkgv3CRwlZkXD9Gyzz+dhegrSm/dVGNXm0rKBiqzjvtcvjafqJcl7NOa2t3AwfTEIubtb21Rb29MZW48zmRf5RRtmapEFcf3xM1pCYG2NpJquCLe3iidCZ6mVFcOSV2gc1olMiuu1j7Z2zp92A0pxHvFuiftlugXQEmwLksdWHIBnrFVZId4Jaw0XlMTutWwU98YDmEw4ynyR4aocYgFRP8auu3u29fnVb0XgsRVwGuTeOpwcLPXZH2tczfZhxFyyiWuqE6IbpCLymHc5yiiw8REJOK2WQDg7r/fkrVMnQEhbyUsL9Ne9/uayjrdiI7iPOHRh+WIDqAP3X+1Dj7k7qffdPl2ADJmv1qBLLi++AefUkGi6BnLEWFWrYJSdekBTRct6eZuDMlpUZMnG58pxUcr0V+Ihg9LNpl00ZZbU7RdOK1UkjOhCj8xWKje7Rrm9RKcBKr7iEktfIU7V/d22TR5Lk+DiloKHxQMTrfnkM88CuiaJfl1ZOLF4oofeNDxw/auHgrf6acC1qmJUG8B/4ix5xYn0P5
UxyPJTmdm4PDJ/3vFPlE66Bj7w2Oi4RIM8QTjllk9dy2RAR7ZNXVW4m8Kg5,iv:Ukxelc0oU9HY73FMP4twk9ZH8eVjaYybB7fMt6hOcC8=,tag:dfH5LhKoGcZ88n3A3nnJDw==,type:str] sops: age: - recipient: age152ek83tm4fj5u70r3fecytn4kg7c5xca24erjchxexx4pfqg6das7q763l @@ -21,7 +23,7 @@ sops: R05MZlJDY1JnVjBlb01Hdm10d3k3VXMKpYhy+H82z9yBAREn2O0cUQp+m9laXyAx 5Hn86bDGLP4LxsVKbQS/77Weg0HI26WsKkTwOR8DB72TFia1SzQNqQ== -----END AGE ENCRYPTED FILE----- - lastmodified: "2026-04-29T21:03:21Z" - mac: ENC[AES256_GCM,data:+E0MzZUtODdA0rtFdXBEq2HHDYcpAe0FOuDJGJobHjz3yKFpG3Mh0h8W0BMKn2ddmQjDvCdQLujqUai2ETtkMhGx/LH5PzXjM+14ACd3UEBUAPT5qzwMXbaRlJCaUDsinVn6UJ9WV2578FNxxkhj0XuxStveX8IG0royFnHtezk=,iv:W2vuZQZUNHhEqlkANPbzSApB2gdgX71kxuXrGVY+w8g=,tag:wey29L7sDtWlbzPfy38++g==,type:str] + lastmodified: "2026-04-29T22:07:57Z" + mac: ENC[AES256_GCM,data:AoZzHy71zcuXC2mO/KzuUCEBBN8epCMsRjaDYmVskFIFylrUt2CPFes6bOqk8PrHF2g6lB6lPANE+6dA8udYwKrdu9tHvGhghiUH367XZipeW+A6+LIi3xKhkVFX3SnwJ0zTPuhj2+wSxO6tkke0DkAENx0JdlWdyXTYVEeV9NU=,iv:XHUXgsIyZcJy/G6NZR/7eVWpojYbP53R7oOjFii96DY=,tag:GwOYNKcMaP+/2OoOjymJww==,type:str] unencrypted_suffix: _unencrypted version: 3.12.2 From de6ea12bd5030db6c1238ac701de67836295a95d Mon Sep 17 00:00:00 2001 From: xnoto Date: Wed, 29 Apr 2026 16:14:47 -0600 Subject: [PATCH 07/17] chore(pre-commit): point terraform_validate at opentofu via PCT_TFPATH MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pre-commit-terraform hooks call `terraform` from PATH. In CI the tfroot-runner image symlinks tofu→terraform, so it resolves correctly; locally Homebrew's HashiCorp terraform binary rejects tofu-only backend attributes (e.g. assume_role_duration_seconds) and aborts validation. Sets PCT_TFPATH=$(command -v tofu) in three complementary spots: - Makefile `test` target — covers `make test`. - `.envrc` — direnv users get it auto-sourced via `direnv allow`. - AGENTS.md — documents the manual export for non-direnv shells. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .envrc | 8 ++++++++ AGENTS.md | 13 +++++++++++++ Makefile | 2 +- 3 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 .envrc diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..d243e16 --- /dev/null +++ b/.envrc @@ -0,0 +1,8 @@ +# Tells antonbabenko/pre-commit-terraform's terraform_validate, terraform_fmt, +# and terraform_docs hooks to use OpenTofu rather than HashiCorp Terraform — +# matches the tfroot-runner CI image (which symlinks tofu→terraform) and is +# required because the s3 backend config uses tofu-only attributes +# (assume_role_duration_seconds) that the HashiCorp terraform binary rejects. +# +# Auto-sourced by direnv on cd. Non-direnv users: see AGENTS.md. +export PCT_TFPATH="$(command -v tofu)" diff --git a/AGENTS.md b/AGENTS.md index 437b6f2..87ca6a4 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -21,6 +21,19 @@ make test This automatically fetches the canonical config if not present. +### OpenTofu vs HashiCorp Terraform + +The pre-commit-terraform hooks call `terraform` from PATH. In CI the +`tfroot-runner` image symlinks `tofu → terraform` so the call resolves to +OpenTofu. Locally most developers have HashiCorp `terraform` from Homebrew, +which rejects tofu-only backend attributes (e.g. `assume_role_duration_seconds`). + +`make test` already exports `PCT_TFPATH=$(command -v tofu)` so the hooks +invoke OpenTofu. For `git commit`-triggered pre-commit runs, either: + +- use direnv: `direnv allow` will source the repo's `.envrc`; or +- export it manually: `export PCT_TFPATH=$(command -v tofu)` in your shell. 
+ ## CI/CD This repo uses the shared `opentofu.yml` workflow from `shared-workflows`, but with **custom configuration**: diff --git a/Makefile b/Makefile index 882eed7..1b4190d 100644 --- a/Makefile +++ b/Makefile @@ -69,7 +69,7 @@ migrate: @${TERRAFORM} init -migrate-state -backend-config="key=${S3_KEY}" -backend-config="bucket=${S3_BUCKET}" -backend-config="region=${S3_REGION}" -backend-config="access_key=${S3_ACCESS_KEY}" -backend-config="secret_key=${S3_SECRET_KEY}" test: .pre-commit-config.yaml .git/hooks/pre-commit - @pre-commit run -a + @PCT_TFPATH=$$(command -v tofu) pre-commit run -a .pre-commit-config.yaml: @curl -sSL -o .pre-commit-config.yaml \ From 23986044dad19a55047e1b7cd5ad3a67defba7be Mon Sep 17 00:00:00 2001 From: xnoto Date: Wed, 29 Apr 2026 16:22:10 -0600 Subject: [PATCH 08/17] ci: target arc-dind self-hosted runner Reverts the temporary ubuntu-latest fallback that was needed while the CRC cluster was decommissioned. Once kustomize-cluster's ARC stack is running on k3s, the dind RunnerDeployment registers org-scoped runners with label `arc-dind`, which this workflow now targets. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/opentofu.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/opentofu.yml b/.github/workflows/opentofu.yml index bc2ea8b..aa3c160 100644 --- a/.github/workflows/opentofu.yml +++ b/.github/workflows/opentofu.yml @@ -16,7 +16,10 @@ jobs: opentofu: uses: makeitworkcloud/shared-workflows/.github/workflows/opentofu.yml@main with: - runs-on: ubuntu-latest + # Self-hosted dind runner from kustomize-cluster/workloads/arc; matches the + # `arc-dind` label on the RunnerDeployment. Until the k3s cluster is up + # and the runner pods are registered, this job will queue. 
+ runs-on: arc-dind container: ghcr.io/makeitworkcloud/tfroot-runner:latest setup-ssh: true secrets: From 1e772f93ff0e367f6cd7925621226444d0130e97 Mon Sep 17 00:00:00 2001 From: xnoto Date: Wed, 29 Apr 2026 16:22:15 -0600 Subject: [PATCH 09/17] feat: bump Fedora cloud image from 43-1.6 to 44-1.7 Co-Authored-By: Claude Opus 4.7 (1M context) --- main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.tf b/main.tf index 2cf16d9..9ed24f1 100644 --- a/main.tf +++ b/main.tf @@ -5,7 +5,7 @@ data "sops_file" "secret_vars" { locals { # Boot images # Direct mirror that provides Content-Length header (required by libvirt provider) - fedora_image_url = "https://dl.fedoraproject.org/pub/fedora/linux/releases/43/Cloud/x86_64/images/Fedora-Cloud-Base-Generic-43-1.6.x86_64.qcow2?v=3" + fedora_image_url = "https://dl.fedoraproject.org/pub/fedora/linux/releases/44/Cloud/x86_64/images/Fedora-Cloud-Base-Generic-44-1.7.x86_64.qcow2" # GitHub github_org = "makeitworkcloud" From 7109ff109a72ce8012deb86244d14ce6c25c154f Mon Sep 17 00:00:00 2001 From: xnoto Date: Wed, 29 Apr 2026 18:51:55 -0600 Subject: [PATCH 10/17] chore(secrets): rotate ssh_admin_pubkey from ssh-rsa to ssh-ed25519 --- secrets/secrets.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/secrets/secrets.yaml b/secrets/secrets.yaml index 027356f..032c68f 100644 --- a/secrets/secrets.yaml +++ b/secrets/secrets.yaml @@ -2,7 +2,7 @@ github_token: ENC[AES256_GCM,data:c5PI4Pedry9iSR3CE2dfKdWsJKA83meI87QCt9CIgdxFKk proxyhost: ENC[AES256_GCM,data:2JMu3PbepHozJfoe4hs7chT24Uvxke2YDVU=,iv:kIJJXLZhVz8Kpz0/h64CWLi8Xx2YKpiPe4xwYQAS7r8=,tag:8d7mO7O5483Crh8QZv/oNg==,type:str] torwww_ip_addr: ENC[AES256_GCM,data:ZYEWHdA9A+MJ1CS+zqI=,iv:LrSWaVzjA23vGoG0CDCaL1yw7JQTc3Nd9CWH8zMrwGM=,tag:gldEwwz2FwTdArgfi28b4A==,type:str] runner_ip_addr: ENC[AES256_GCM,data:1X765pvHZcPXd4VC/Mg=,iv:U1LsCbzDb88nIYxbeQ1mxbXDZeSKqjHy8+MgRsOdLA0=,tag:uMtczv3JM9X6uwLsw+pGnw==,type:str] -ssh_admin_pubkey: 
ENC[AES256_GCM,data:3YuHvteiPTTxVS6wVfAgjTIpx8saFar6rXCki/LhD9aC2zL5ATMzywoLRJWV/2f2RL5Vpj3/RFQwoTzV0T0ilmfyVB7lcp4L/bh7O81ryC2PTLnNcpII26V0BpxCmER5Tiz+cwS5KYZ9p6PGfnTe2b8pYIpoqrE0nkQo3wSkZ/N5KJ7MIp9xm5/JQ3jd864lU0fkjvlE8V/6W4g4hPq1ZKNmW27ALsPHdD7g9IIJS9We83ge6EA9/BG2WlWLSPa9uTBB7MeZbXT7qGwj/ovzQ9VUbB47RIPddCkx0xLKQoHDnGEx55U2/iGKJiWRxA+amVE2DA9MMj745nuC2yQIZx0QVFO3Eu+fNCEsLDUNPI0GaJCkm08CUu8vNG18rgHzc6vQsWASVnUT9Ct22pMaVdew5vc49mp0GgAmzxOPzOEvE647SRx4IA7iaKqMPqDHYPO29BofkD9ZK4z0s3UaYCBz+QzroO5SEYGfgmA23cRCmKF2nbtCsrF3ApU+KeyjCTBGeYL1B1ghzvbt3//i4yNkveHMeKvLrVtrG9RJzMMDQI9ksze/hP5kNVRJUZHlVlRBDujyULkOK6vxXHj0hn3ZwV3JTeI1WWL3hqbBX0j+utYF68G61qsjnfJdUz1FaysjUtVt0uUNym/wmDX4fpONXyuu7kaPyCuJ3e1/Q0DGYq9Q24/awkjhrsdHhsL1MGGyWe9HYDxS+5OfIf2U+3zwKjpnMPazX8heuABuHJ8P/c+gAqseJWohe/hOCGFS+tKd3kvQBiiCyCgjRBTCDh/TrVOJMKYNcILFxCBuyJQ1HM9UGQTp4QBEA7Sexqb5DoS3XZk209nG6tpIUPthiycIIFAhPnGngFqVmLdamRgJgqsXLZL5c8iItysjDtztrBoj3tA3nI22sv3kh1iUECvf8A4GrjAJqi82MBT3pJdtce7uqkJS0/SdgX1ZcfoXo6UahA==,iv:RSqrTaR2lwROxPQ0qmRNyYkgxCh/D2DWWKJgza51cnw=,tag:q/+CgANHWE3cbrzbiGUdqw==,type:str] +ssh_admin_pubkey: ENC[AES256_GCM,data:Wg0gR203YArHZ4m1H4meT1kQmH8oXGBT/YX+hYWBCuYCGmAP0HJOpLFxiIFCEzxw2R1vmhSRl6NPlaM+Yusrea1XvV0i5TCphqfIidEDgxo=,iv:O+TVlZnfHfK/uqteubgzp+c7/1t8jsM/ZEfY5Q+GXw0=,tag:/YSTeD9XxyrouWjHcj5Hfg==,type:str] libvirt_uri: ENC[AES256_GCM,data:DxvUPVIgyqI80A6EmTqHbnprRc8imsds82rL+km2FjXcvBkIYwiEJ58/+Xc=,iv:SVoAIgSGkFsQpMtANfERReaAwg0TzV2zOVoU7ojUHfY=,tag:2c95yYAZyORCQJlmcouapQ==,type:str] s3_bucket: ENC[AES256_GCM,data:URTul/yP5yDzPUgZ/Nfqnj+m5qsz,iv:MWb0FA2rAoHlqCt9bfFeVCMt/cVRt/d01a1gWr98TPQ=,tag:ICrjp8sPlJluS0+BZ2DUxA==,type:str] s3_key: ENC[AES256_GCM,data:HnbBP1/Yl+7x4QWytcaVMck=,iv:/kPXIKVHp6/0IJAPyz6LX73dX8ajf94dSyKaIkP84Yw=,tag:Yvfsk7lUYa8x90RCg+b0lw==,type:str] @@ -23,7 +23,7 @@ sops: R05MZlJDY1JnVjBlb01Hdm10d3k3VXMKpYhy+H82z9yBAREn2O0cUQp+m9laXyAx 5Hn86bDGLP4LxsVKbQS/77Weg0HI26WsKkTwOR8DB72TFia1SzQNqQ== -----END AGE ENCRYPTED FILE----- - lastmodified: "2026-04-29T22:07:57Z" - 
mac: ENC[AES256_GCM,data:AoZzHy71zcuXC2mO/KzuUCEBBN8epCMsRjaDYmVskFIFylrUt2CPFes6bOqk8PrHF2g6lB6lPANE+6dA8udYwKrdu9tHvGhghiUH367XZipeW+A6+LIi3xKhkVFX3SnwJ0zTPuhj2+wSxO6tkke0DkAENx0JdlWdyXTYVEeV9NU=,iv:XHUXgsIyZcJy/G6NZR/7eVWpojYbP53R7oOjFii96DY=,tag:GwOYNKcMaP+/2OoOjymJww==,type:str] + lastmodified: "2026-04-29T22:35:54Z" + mac: ENC[AES256_GCM,data:/psr3jetNh7hC0qcXJB+PMlUEHgpLBHa8rmYlzV2NBB5IsbeiWYNWCYp62oownV8QBfRMl72Pp1HdF/4eo9Kjhy2CQ2HsMREpx9OVjlfk/oreFqquqBQLC+5lQV30QIKjc9uwMZAukZdNzOLRsuIQjHyDQHLTaT4Nkx5wpIo4Cc=,iv:A+vVP8eyj/sKb+AZvAfYguLe6QMidOLYRZd9D0Sw1Ew=,tag:piKeKZW9D/h4leszVkupkA==,type:str] unencrypted_suffix: _unencrypted version: 3.12.2 From 2b1c58e8f00fa6779c990a59f29dd987b0399e5f Mon Sep 17 00:00:00 2001 From: xnoto Date: Wed, 29 Apr 2026 18:52:06 -0600 Subject: [PATCH 11/17] fix(cloud-init): handle Fedora 44 package gaps + persistent-volume idempotency Fedora 44 cloud-base images don't ship git or libicu by default, which broke both bootstrap flows on first apply against the new image: - k3s: argocd-operator install via kubectl apply -k 'git+https://...' needs git - runner: actions runner ships dotnet 6 binaries that need libicu/lttng-ust; config.sh --unattended fails with "Libicu's dependencies is missing" Also make both runcmd flows idempotent against persistent extra volumes (/var/lib/rancher and /opt/actions-runner have overwrite:false). Boot-disk replacement now reuses cluster + runner state instead of erroring on "namespace already exists" or "runner already configured". 
--- cloud-init/k3s/cloud_init.cfg | 6 ++++-- cloud-init/runner/cloud_init.cfg | 36 +++++++++++++++++++------------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/cloud-init/k3s/cloud_init.cfg b/cloud-init/k3s/cloud_init.cfg index abfb066..361099b 100644 --- a/cloud-init/k3s/cloud_init.cfg +++ b/cloud-init/k3s/cloud_init.cfg @@ -25,6 +25,7 @@ users: packages: - curl + - git fs_setup: - device: /dev/vdb @@ -52,9 +53,10 @@ runcmd: - | set -e export KUBECONFIG=/etc/rancher/k3s/k3s.yaml - kubectl create namespace argocd + kubectl get ns argocd >/dev/null 2>&1 || kubectl create namespace argocd kubectl -n argocd create secret generic sops-age-keys \ - --from-file=key.txt=/run/age-key + --from-file=key.txt=/run/age-key \ + --dry-run=client -o yaml | kubectl apply -f - # Install argocd-operator (community) which provides the # argoproj.io/v1beta1 ArgoCD CRD consumed by kustomize-cluster's # bootstrap/argocd-config.yaml. diff --git a/cloud-init/runner/cloud_init.cfg b/cloud-init/runner/cloud_init.cfg index b5ddaca..eb7f70c 100644 --- a/cloud-init/runner/cloud_init.cfg +++ b/cloud-init/runner/cloud_init.cfg @@ -61,20 +61,28 @@ write_files: chown -R user:user /opt/actions-runner rm -f runner.tar.gz - REG_TOKEN=$(curl -sSL -X POST \ - -H "Authorization: Bearer $GITHUB_TOKEN" \ - -H "Accept: application/vnd.github+json" \ - "https://api.github.com/orgs/$GITHUB_ORG/actions/runners/registration-token" \ - | jq -r .token) - RANDOM_ID=$(tr -dc 'a-z' Date: Wed, 29 Apr 2026 19:04:34 -0600 Subject: [PATCH 12/17] fix(cloud-init): bootstrap cert-manager before argocd-operator on k3s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit argocd-operator's config/default mounts a webhook-server-cert Secret in its manager Deployment, but cert-manager bits are commented out upstream — so the secret never materializes outside OLM. Pod hung in ContainerCreating with FailedMount errors, blocking the entire ArgoCD bootstrap chain. 
Bootstrap upstream cert-manager (pinned via local var) and provision a self-signed Issuer + Certificate targeting webhook-server-cert in the operator namespace. Wait for the operator deployment to roll out before continuing the bootstrap so the ArgoCD CR has something to reconcile against. cert-manager being a bootstrap dependency means kustomize-cluster's operators/cert-manager/operator.yaml (an OpenShift OLM Subscription) is now redundant for the operator install itself; that file becomes a Phase B cleanup item — Issuer/ClusterIssuer resources can stay since they depend on cert-manager being there, but the Subscription needs to go. --- cloud-init/k3s/cloud_init.cfg | 39 +++++++++++++++++++++++++++++++++++ main.tf | 8 +++++++ 2 files changed, 47 insertions(+) diff --git a/cloud-init/k3s/cloud_init.cfg b/cloud-init/k3s/cloud_init.cfg index 361099b..286ef02 100644 --- a/cloud-init/k3s/cloud_init.cfg +++ b/cloud-init/k3s/cloud_init.cfg @@ -57,6 +57,16 @@ runcmd: kubectl -n argocd create secret generic sops-age-keys \ --from-file=key.txt=/run/age-key \ --dry-run=client -o yaml | kubectl apply -f - + # cert-manager — argocd-operator's deployment mounts a webhook-server-cert + # Secret that nothing in config/default actually creates (cert-manager bits + # are commented out in upstream's kustomization). Install cert-manager and + # provision the cert ourselves before the operator install. + - | + set -e + export KUBECONFIG=/etc/rancher/k3s/k3s.yaml + kubectl apply -f \ + "https://github.com/cert-manager/cert-manager/releases/download/${cert_manager_version}/cert-manager.yaml" + kubectl -n cert-manager rollout status deployment/cert-manager-webhook --timeout=180s # Install argocd-operator (community) which provides the # argoproj.io/v1beta1 ArgoCD CRD consumed by kustomize-cluster's # bootstrap/argocd-config.yaml. 
@@ -66,6 +76,35 @@ runcmd: kubectl apply --server-side -k \ 'https://github.com/argoproj-labs/argocd-operator//config/default?ref=${argocd_operator_version}' until kubectl get crd argocds.argoproj.io 2>/dev/null; do sleep 3; done + # Self-signed Issuer + Certificate for the operator's admission webhook. + # Service name comes from config/default's namePrefix + ../webhook/service.yaml. + - | + set -e + export KUBECONFIG=/etc/rancher/k3s/k3s.yaml + kubectl apply -f - <<'EOF' + apiVersion: cert-manager.io/v1 + kind: Issuer + metadata: + name: argocd-operator-selfsigned + namespace: argocd-operator-system + spec: + selfSigned: {} + --- + apiVersion: cert-manager.io/v1 + kind: Certificate + metadata: + name: argocd-operator-serving-cert + namespace: argocd-operator-system + spec: + secretName: webhook-server-cert + dnsNames: + - argocd-operator-webhook-service.argocd-operator-system.svc + - argocd-operator-webhook-service.argocd-operator-system.svc.cluster.local + issuerRef: + kind: Issuer + name: argocd-operator-selfsigned + EOF + kubectl -n argocd-operator-system rollout status deployment/argocd-operator-controller-manager --timeout=180s # Apply kustomize-cluster bootstrap path. This contains the ArgoCD CR # (which the operator reconciles into a running argocd-server) plus the # operators-app and workloads-app Applications. Once argocd-server starts, diff --git a/main.tf b/main.tf index 9ed24f1..e05a747 100644 --- a/main.tf +++ b/main.tf @@ -24,6 +24,13 @@ locals { # argocd-operator (community) — provides the argoproj.io/v1beta1 ArgoCD CRD # consumed by kustomize-cluster/bootstrap/argocd-config.yaml argocd_operator_version = "v0.14.0" # bump as needed; see https://github.com/argoproj-labs/argocd-operator/releases + + # cert-manager — required by argocd-operator's config/default to provision the + # webhook-server-cert Secret. 
Installed during k3s bootstrap (before argocd- + # operator) since argocd-operator can't come up without it, and the operator + # is what brings ArgoCD up. The operators-app Application no longer manages + # cert-manager itself, only the Issuer/ClusterIssuer resources downstream. + cert_manager_version = "v1.20.2" # bump as needed; see https://github.com/cert-manager/cert-manager/releases } # Dedicated libvirt pool on /mnt/nvme RAID-1 for cluster volumes (keeps cluster IO off the root LV). @@ -87,6 +94,7 @@ module "k3s" { ssh_authorized_key = data.sops_file.secret_vars.data["ssh_admin_pubkey"] sops_age_key = data.sops_file.secret_vars.data["sops_age_key"] k3s_version = local.k3s_version + cert_manager_version = local.cert_manager_version argocd_operator_version = local.argocd_operator_version cluster_repo_url = local.cluster_repo_url cluster_repo_branch = local.cluster_repo_branch From 8b5e8667f8bb09fc4d872808ede4e1ee18ff8510 Mon Sep 17 00:00:00 2001 From: xnoto Date: Wed, 29 Apr 2026 19:07:05 -0600 Subject: [PATCH 13/17] docs(agents): refresh CI/local-apply guidance for k3s era - Container ref points at GHCR (was the OpenShift internal registry) - Drop the OpenShift-only failure-mode section (oc import-image, etc.) - Add a Local apply section: sops/age requirement, libvirt-ssh target, PCT_TFPATH via direnv, SSH-into-VM one-liners (user is `user`, not the local login) - Replace failure modes with the ones we actually hit on k3s: flaky boot-image uploads, stale volumes needing virsh vol-delete, state-rm for orphaned volumes, deterministic boot-disk hash names that need taint to rebuild on cloud-init changes, persistent extra volumes that require idempotent cloud-init scripts --- AGENTS.md | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 87ca6a4..360195d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -39,27 +39,35 @@ invoke OpenTofu. 
For `git commit`-triggered pre-commit runs, either: This repo uses the shared `opentofu.yml` workflow from `shared-workflows`, but with **custom configuration**: - **Runner:** `arc-dind` (self-hosted, not `ubuntu-latest`) -- **Container:** `image-registry.openshift-image-registry.svc:5000/public-registry/tfroot-runner:latest` (internal OpenShift registry, not GHCR) +- **Container:** `ghcr.io/makeitworkcloud/tfroot-runner:latest` -This is required because the workflow needs SSH access to libvirt hosts, which is only available from the self-hosted runner network. +The self-hosted runner is required because the workflow needs SSH access to the libvirt host, which is only reachable from the runner network. -### Failure Modes +## Local apply -**"name unknown" or image pull failures:** The `tfroot-runner` image doesn't exist in the OpenShift internal registry. This happens when: +`make init` / `make plan` / `make apply` need: -1. The `images` repo Build workflow failed (check for transient network errors, re-run if needed) -2. The `images` repo Pull workflow failed to import (the `|| true` masks failures - check logs for "Unable to connect" errors) +- `sops` available locally with the team's age key (so `data.sops_file.secret_vars` decrypts) +- The Makefile's `libvirt-ssh` target (auto-run by `init`) materializes the qemu+ssh keypair from sops into `.terraform/libvirt-ssh/` — no `~/.ssh/id_rsa` needed +- `tofu` on PATH, plus `direnv` (recommended) so `.envrc` exports `PCT_TFPATH` for pre-commit + +### SSH-ing into the VMs + +Both VMs are behind the libvirt host.
The cloud-init user is `user`, not your local username: -**To fix:** Re-run the Pull workflow in the `images` repo, or manually import: ```bash -oc import-image tfroot-runner:latest \ - --from=ghcr.io/makeitworkcloud/tfroot-runner:latest \ - -n public-registry \ - --confirm \ - --reference-policy=local +ssh -J user@hero.makeitwork.cloud user@192.168.102.2 # k3s +ssh -J user@hero.makeitwork.cloud user@192.168.102.12 # runner ``` -**Pre-commit failures:** If hooks fail unexpectedly, the canonical config may have changed. Delete `.pre-commit-config.yaml` locally and re-run `make test` to fetch the latest. +### Common apply hiccups + +- **`Volume Upload Failed: unexpected EOF`** while creating boot disks — flaky upload of the ~700 MB Fedora qcow2. Just re-run `make apply`; partial volumes get cleaned up automatically on retry. Boot-disk creation legitimately takes 5–7 minutes per VM. +- **`Storage volume X exists already`** on a fresh apply — host has stale volumes (e.g. from a previous failed apply). Delete via `ssh user@hero "sudo virsh -c qemu:///system vol-delete --pool <pool> <volume>"`. `sudo` is required. Run `pool-refresh <pool>` after. +- **`Storage volume not found: no storage vol with matching path …`** during refresh — state references a volume that was deleted out-of-band. `tofu state rm <address>` and re-apply to recreate. +- **Boot-disk filenames are a deterministic URL hash** (e.g. `k3s-94d57345.qcow2`). Tofu won't recreate them when the boot image content changes server-side or when cloud-init templates change. Force a rebuild with `tofu taint module.<name>.libvirt_volume.boot module.<name>.libvirt_volume.cloudinit module.<name>.libvirt_cloudinit_disk.commoninit`. +- **Cluster + runner state survives boot-disk replacement.** `/var/lib/rancher` (k3s) and `/opt/actions-runner` are on persistent xfs `extra` volumes (`overwrite: false`). Cloud-init scripts are idempotent against this — see the `[ ! -f .runner ]` check in the runner template and the `kubectl get … || create` in the k3s template.
+- **Pre-commit failures** — the canonical config may have changed. `rm .pre-commit-config.yaml && make test` fetches the latest. ## Related Repositories From afd29083a54a6f1943c28a83c5e0a5ea0ffef4a4 Mon Sep 17 00:00:00 2001 From: xnoto Date: Wed, 29 Apr 2026 19:15:25 -0600 Subject: [PATCH 14/17] fix(cloud-init): patch cert-manager controller for DNS-01 external resolvers Cluster CoreDNS doesn't recursively resolve external domains, which breaks ACME DNS-01 challenge validation. Pass --dns01-recursive-nameservers and --dns01-recursive-nameservers-only to the cert-manager controller so it queries 1.1.1.1 / 8.8.8.8 directly. Tighten the surrounding comment too. --- cloud-init/k3s/cloud_init.cfg | 7 +++++++ main.tf | 10 +++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/cloud-init/k3s/cloud_init.cfg b/cloud-init/k3s/cloud_init.cfg index 286ef02..e636aac 100644 --- a/cloud-init/k3s/cloud_init.cfg +++ b/cloud-init/k3s/cloud_init.cfg @@ -67,6 +67,13 @@ runcmd: kubectl apply -f \ "https://github.com/cert-manager/cert-manager/releases/download/${cert_manager_version}/cert-manager.yaml" kubectl -n cert-manager rollout status deployment/cert-manager-webhook --timeout=180s + # Cluster CoreDNS can't recursively resolve external domains for ACME + # DNS-01 challenges; force cert-manager to use public resolvers directly. + kubectl -n cert-manager patch deployment cert-manager --type=json -p='[ + {"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--dns01-recursive-nameservers=1.1.1.1:53,8.8.8.8:53"}, + {"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--dns01-recursive-nameservers-only"} + ]' + kubectl -n cert-manager rollout status deployment/cert-manager --timeout=120s # Install argocd-operator (community) which provides the # argoproj.io/v1beta1 ArgoCD CRD consumed by kustomize-cluster's # bootstrap/argocd-config.yaml. 
diff --git a/main.tf b/main.tf index e05a747..c38dfb6 100644 --- a/main.tf +++ b/main.tf @@ -25,11 +25,11 @@ locals { # consumed by kustomize-cluster/bootstrap/argocd-config.yaml argocd_operator_version = "v0.14.0" # bump as needed; see https://github.com/argoproj-labs/argocd-operator/releases - # cert-manager — required by argocd-operator's config/default to provision the - # webhook-server-cert Secret. Installed during k3s bootstrap (before argocd- - # operator) since argocd-operator can't come up without it, and the operator - # is what brings ArgoCD up. The operators-app Application no longer manages - # cert-manager itself, only the Issuer/ClusterIssuer resources downstream. + # cert-manager — argocd-operator's config/default mounts a webhook-server-cert + # Secret that only materializes via cert-manager. Installed during k3s + # bootstrap (before argocd-operator) so argocd-operator can come up and + # reconcile the ArgoCD CR; ClusterIssuer/Issuer resources are managed by + # ArgoCD downstream. cert_manager_version = "v1.20.2" # bump as needed; see https://github.com/cert-manager/cert-manager/releases } From 2b68774e1ec5edb71e387a6d6575cc049e447dbe Mon Sep 17 00:00:00 2001 From: xnoto Date: Wed, 29 Apr 2026 19:28:31 -0600 Subject: [PATCH 15/17] fix(cloud-init): grant ArgoCD cluster-config scope via operator env MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without ARGOCD_CLUSTER_CONFIG_NAMESPACES on the argocd-operator deployment, the spawned ArgoCD application-controller runs in namespaced mode and can't manage cluster-scoped resources (ClusterRole/ClusterRoleBinding/etc.). Any operator that ships those — tor-controller, cloudflare-operator, etc. — fails to sync with `cannot be managed when in namespaced mode`. Set the env var to `argocd` so the ArgoCD CR in that namespace gets cluster-scope permissions on reconcile. 
--- cloud-init/k3s/cloud_init.cfg | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cloud-init/k3s/cloud_init.cfg b/cloud-init/k3s/cloud_init.cfg index e636aac..52c0f9b 100644 --- a/cloud-init/k3s/cloud_init.cfg +++ b/cloud-init/k3s/cloud_init.cfg @@ -77,12 +77,19 @@ runcmd: # Install argocd-operator (community) which provides the # argoproj.io/v1beta1 ArgoCD CRD consumed by kustomize-cluster's # bootstrap/argocd-config.yaml. + # + # ARGOCD_CLUSTER_CONFIG_NAMESPACES grants cluster-config scope to ArgoCD CRs + # in the named namespace; without it the application-controller can only + # manage namespaced resources, blocking sync of any ClusterRole/CRB. - | set -e export KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl apply --server-side -k \ 'https://github.com/argoproj-labs/argocd-operator//config/default?ref=${argocd_operator_version}' until kubectl get crd argocds.argoproj.io 2>/dev/null; do sleep 3; done + kubectl -n argocd-operator-system set env \ + deployment/argocd-operator-controller-manager \ + ARGOCD_CLUSTER_CONFIG_NAMESPACES=argocd # Self-signed Issuer + Certificate for the operator's admission webhook. # Service name comes from config/default's namePrefix + ../webhook/service.yaml. - | From 42d4cb45afbc838013485962fe1e62797d1c6078 Mon Sep 17 00:00:00 2001 From: xnoto Date: Wed, 29 Apr 2026 22:22:24 -0600 Subject: [PATCH 16/17] feat(cloud-init): enable OIDC token validation on the k3s apiserver Drop a /etc/rancher/k3s/config.yaml.d/oidc.yaml that points the kube-apiserver at ArgoCD's embedded Dex issuer. Headlamp (and any OIDC-aware kubectl) forwards the user's Dex-issued ID token to the apiserver; without these flags the apiserver treats the token as unknown and 401s every request. Username comes from the email claim, groups from Dex's GitHub team mapping. RBAC binding for makeitworkcloud:admins -> cluster-admin lives in kustomize-cluster/bootstrap/oidc-rbac.yaml. 
--- cloud-init/k3s/cloud_init.cfg | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cloud-init/k3s/cloud_init.cfg b/cloud-init/k3s/cloud_init.cfg index 52c0f9b..c30206c 100644 --- a/cloud-init/k3s/cloud_init.cfg +++ b/cloud-init/k3s/cloud_init.cfg @@ -8,6 +8,19 @@ write_files: permissions: '0600' content: | ${indent(6, sops_age_key)} + # k3s reads /etc/rancher/k3s/config.yaml.d/*.yaml on startup; this enables + # OIDC token validation by kube-apiserver. Headlamp/kubectl forward the + # user's Dex-issued ID token here, the apiserver validates it against the + # Dex issuer, and RBAC bindings in kustomize-cluster/bootstrap/oidc-rbac.yaml + # grant access by GitHub team membership (groups claim). + - path: /etc/rancher/k3s/config.yaml.d/oidc.yaml + permissions: '0600' + content: | + kube-apiserver-arg: + - oidc-issuer-url=https://argocd.makeitwork.cloud/api/dex + - oidc-client-id=headlamp + - oidc-username-claim=email + - oidc-groups-claim=groups groups: - default From 615a4ac4832d48e0c511622415fdd1e7e969624d Mon Sep 17 00:00:00 2001 From: xnoto Date: Wed, 29 Apr 2026 22:52:19 -0600 Subject: [PATCH 17/17] ci: switch to arc-tf native runner Drop the nested container override now that the arc-tf runner-set in kustomize-cluster runs the tfroot-runner image directly. --- .github/workflows/opentofu.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/opentofu.yml b/.github/workflows/opentofu.yml index aa3c160..5d8adde 100644 --- a/.github/workflows/opentofu.yml +++ b/.github/workflows/opentofu.yml @@ -16,11 +16,9 @@ jobs: opentofu: uses: makeitworkcloud/shared-workflows/.github/workflows/opentofu.yml@main with: - # Self-hosted dind runner from kustomize-cluster/workloads/arc; matches the - # `arc-dind` label on the RunnerDeployment. Until the k3s cluster is up - # and the runner pods are registered, this job will queue. 
- runs-on: arc-dind - container: ghcr.io/makeitworkcloud/tfroot-runner:latest + # Native tfroot-runner scale set in kustomize-cluster/workloads/arc. + # The runner pod IS the tfroot-runner image — no nested container. + runs-on: arc-tf setup-ssh: true secrets: SOPS_AGE_KEY: ${{ secrets.SOPS_AGE_KEY }}