diff --git a/.cursorrules b/.cursorrules new file mode 100644 index 0000000..47065f9 --- /dev/null +++ b/.cursorrules @@ -0,0 +1,59 @@ +# Cursor Rules for Drop + +## Critical Rules + +1. ALWAYS read project files (Tiltfile, Makefile, source) before acting. Never guess. +2. Documentation: short, concise, high-level. No volatile details. +3. Simplicity over complexity. DRY is NOT always best. No premature optimization. +4. Kubernetes: use kubectl explain or read CRD types before suggesting specs. +5. Security: never expose secrets in code or docs. +6. Tilt handles the dev loop. tilt up does everything. Don't suggest manual commands for automated steps. + +## Project Context +Kubernetes operator (Go 1.23.0, Kubebuilder, controller-runtime). +Module: github.com/Breee/drop +API group: drop.corewire.io/v1alpha1. All CRDs cluster-scoped. + +## Key Commands +- Build: go build ./... +- Test: make test +- Lint: make lint +- CRD gen: make manifests +- Deepcopy gen: make generate +- All codegen: make codegen +- AI docs gen: make docs-gen + +## Structure +- api/v1alpha1 — Package v1alpha1 contains API Schema definitions for the drop v1alpha1 API group. 
+- internal/controller — Reconciler implementations (one per CRD) +- internal/discovery — Discovery source interface + implementations +- internal/metrics — Prometheus metrics registration +- internal/pacing — Shared pacing engine for rate-limited pulls +- internal/podbuilder — Pure Pod construction function (no k8s client) +- charts/drop/ — Helm chart +- test/e2e/ — Chainsaw E2E tests +- hack/gen-ai-docs/ — generates all docs from source + +## CRDs → Controllers +- CachedImage → internal/controller/cachedimage_controller.go +- CachedImageSet → internal/controller/cachedimageset_controller.go +- PullPolicy (config-only, no controller) +- DiscoveryPolicy → internal/controller/discoverypolicy_controller.go + +## Conventions +- All CRDs are cluster-scoped +- Status uses metav1.Condition with type "Ready" +- No privileged containers — kubelet-based image pulls only +- Single responsibility reconcilers — one controller per CRD +- Pod builder is a pure function in internal/podbuilder/ (no k8s client) +- Pacing logic lives exclusively in internal/pacing/ +- ownerReferences: CachedImageSet→CachedImage, controller→Pod +- Table-driven tests preferred; envtest for controllers +- Pods use nodeName placement + command: ["true"] +- Don't manually edit generated files — run make docs-gen + +## Don't +- Edit generated files (zz_generated.deepcopy.go, config/crd/bases/, llms.txt, llms-full.txt, knowledge.yaml) +- Add privileged containers or CRI socket mounts +- Create namespaced CRDs +- Put pacing logic outside internal/pacing/ diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..39c51d0 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,24 @@ +{ + "name": "Kubebuilder DevContainer", + "image": "docker.io/golang:1.26", + "features": { + "ghcr.io/devcontainers/features/docker-in-docker:2": {}, + "ghcr.io/devcontainers/features/git:1": {} + }, + + "runArgs": ["--network=host"], + + "customizations": { 
"vscode": { + "settings": { + "terminal.integrated.shell.linux": "/bin/bash" + }, + "extensions": [ + "ms-kubernetes-tools.vscode-kubernetes-tools", + "ms-azuretools.vscode-docker" + ] + } + }, + + "onCreateCommand": "bash .devcontainer/post-install.sh" +} diff --git a/.devcontainer/post-install.sh b/.devcontainer/post-install.sh new file mode 100644 index 0000000..265c43e --- /dev/null +++ b/.devcontainer/post-install.sh @@ -0,0 +1,23 @@ +#!/bin/bash +set -x + +curl -Lo ./kind https://kind.sigs.k8s.io/dl/latest/kind-linux-amd64 +chmod +x ./kind +mv ./kind /usr/local/bin/kind + +curl -L -o kubebuilder https://go.kubebuilder.io/dl/latest/linux/amd64 +chmod +x kubebuilder +mv kubebuilder /usr/local/bin/ + +KUBECTL_VERSION=$(curl -L -s https://dl.k8s.io/release/stable.txt) +curl -LO "https://dl.k8s.io/release/$KUBECTL_VERSION/bin/linux/amd64/kubectl" +chmod +x kubectl +mv kubectl /usr/local/bin/kubectl + +docker network create -d=bridge --subnet=172.19.0.0/24 kind + +kind version +kubebuilder version +docker --version +go version +kubectl version --client diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..2943268 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,11 @@ +# More info: https://docs.docker.com/engine/reference/builder/#dockerignore-file +# Ignore build and test binaries. 
+bin/ + +# Docs and dev artifacts +docs/ +ai-docs/ +hack/ +test/ +.github/ +*.md diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..50fd0ae --- /dev/null +++ b/.editorconfig @@ -0,0 +1,19 @@ +root = true + +[*] +indent_style = tab +indent_size = 4 +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true + +[*.{yaml,yml}] +indent_style = space +indent_size = 2 + +[*.md] +trim_trailing_whitespace = false + +[Makefile] +indent_style = tab diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000..794ad2e --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,79 @@ +# Copilot Instructions for Drop + +## Critical Rules + +1. **ALWAYS read project files before acting.** Read the Tiltfile, Makefile, and relevant source before writing docs, suggesting workflows, or describing how things work. Never guess based on general knowledge. +2. **Documentation must be short and concise.** Focus on high-level overview and usage. Avoid volatile implementation details. Avoid information that will change frequently. +3. **Simplicity over complexity.** If a simple solution exists, use it. DRY is NOT always best. No premature optimization. +4. **Kubernetes: always verify.** Use `kubectl explain` or read the CRD types before suggesting field values or resource specs. +5. **Security-conscious.** Never expose secrets in code or docs. Follow secure coding practices. +6. **Tilt handles the dev loop.** `tilt up` does everything: cluster creation, build, deploy, port-forwards, Hugo docs, e2e infra, dev samples. Don't suggest manual commands for things Tilt automates. + +## Project + +Kubernetes operator (Go 1.23.0, Kubebuilder, controller-runtime) that pre-caches container images on cluster nodes. +API group: `drop.corewire.io/v1alpha1`. All CRDs are cluster-scoped. 
+ +## Build Commands + +```bash +make generate # regenerate deepcopy +make manifests # regenerate CRD + RBAC YAML +make codegen # both of the above +go build ./... # compile +make test # unit tests (envtest) +make test-e2e # e2e tests (chainsaw, needs kind) +make lint # golangci-lint +make docs-gen # regenerate AI docs from source +``` + +## Code Conventions + +- All CRDs are cluster-scoped +- Status uses metav1.Condition with type "Ready" +- No privileged containers — kubelet-based image pulls only +- Single responsibility reconcilers — one controller per CRD +- Pod builder is a pure function in internal/podbuilder/ (no k8s client) +- Pacing logic lives exclusively in internal/pacing/ +- ownerReferences: CachedImageSet→CachedImage, controller→Pod +- Table-driven tests preferred; envtest for controllers +- Pods use nodeName placement + command: ["true"] +- Don't manually edit generated files — run make docs-gen + +## Testing Patterns + +- Controller tests use envtest (`internal/controller/*_test.go`) +- Table-driven tests preferred +- E2E uses Kyverno Chainsaw in `test/e2e/` +- Test fixtures in `config/samples/` and `hack/dev-samples.yaml` + +## CRD Quick Reference + +| Kind | Controller | Purpose | +|------|-----------|---------| +| CachedImage | internal/controller/cachedimage_controller.go | CachedImage is the Schema for the cachedimages API. | +| CachedImageSet | internal/controller/cachedimageset_controller.go | CachedImageSet is the Schema for the cachedimagesets API. | +| PullPolicy | | PullPolicy is the Schema for the pullpolicies API. It is a configuration-only resource with no status. | +| DiscoveryPolicy | internal/controller/discoverypolicy_controller.go | DiscoveryPolicy is the Schema for the discoverypolicies API. | + +## Package Dependency Graph + +``` +api/v1alpha1 — Package v1alpha1 contains API Schema definitions for the drop v1alpha1 API group. 
+internal/controller — Reconciler implementations (one per CRD) + imports: api/v1alpha1, internal/discovery, internal/metrics, internal/pacing, internal/podbuilder +internal/discovery — Discovery source interface + implementations +internal/metrics — Prometheus metrics registration +internal/pacing — Shared pacing engine for rate-limited pulls + imports: api/v1alpha1, internal/podbuilder +internal/podbuilder — Pure Pod construction function (no k8s client) + imports: api/v1alpha1 +``` + +## Don'ts + +- Don't add CRI socket access or privileged containers — we use kubelet image pulls only +- Don't put pacing logic outside `internal/pacing/` +- Don't create namespaced CRDs — all resources are cluster-scoped +- Don't manually edit generated files (`zz_generated.deepcopy.go`, `config/crd/bases/`) +- Don't manually edit `llms.txt`, `llms-full.txt`, `.cursorrules`, `AGENTS.md` — run `make docs-gen` diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..0597037 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,121 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +permissions: + contents: read + +# Each job maps to a local make target for easy debugging: +# lint → make lint +# test → make test +# build → make build +# helm-lint → make helm-lint && make helm-template +# docs-build → make docs-build +# e2e → make e2e-local + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-go@v5 + with: + go-version-file: go.mod + - uses: golangci/golangci-lint-action@v9 + with: + version: v2.12.2 + + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-go@v5 + with: + go-version-file: go.mod + - name: Run tests + run: make test + + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-go@v5 + with: + go-version-file: go.mod + - name: Build + run: make build + + helm-lint: 
+ runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: azure/setup-helm@v4 + - name: Lint Helm chart + run: helm lint charts/drop + - name: Template Helm chart + run: helm template drop charts/drop + + docs-build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: '1.26' + cache: false + - name: Setup Hugo + uses: peaceiris/actions-hugo@v2 + with: + hugo-version: 'latest' + extended: true + - name: Build docs + working-directory: docs + run: | + hugo mod get + hugo --minify + + e2e: + runs-on: ubuntu-latest + needs: [build] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-go@v5 + with: + go-version-file: go.mod + - name: Install kind + run: | + curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.24.0/kind-linux-amd64 + chmod +x ./kind + sudo mv ./kind /usr/local/bin/kind + - name: Create kind cluster + run: make kind-create KIND=kind + - name: Build and load image + run: | + make docker-build IMG=controller:ci + make kind-load KIND=kind IMG=controller:ci + - name: Install CRDs + run: | + make controller-gen + make manifests + kubectl apply -f config/crd/bases/ + - name: Deploy E2E infrastructure (Prometheus + Registry) + run: make e2e-infra + - name: Deploy operator + run: | + helm install drop charts/drop \ + --namespace drop-system \ + --create-namespace \ + --set image.repository=controller \ + --set image.tag=ci \ + --set image.pullPolicy=Never \ + --set leaderElection.enabled=false \ + --set metrics.enabled=true \ + --set metrics.secureServing=false \ + --wait --timeout 120s + - name: Run E2E tests + run: make test-e2e diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000..0f61840 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,61 @@ +name: Docs + +on: + push: + branches: [main] + paths: + - 'docs/**' + pull_request: + paths: + - 'docs/**' + +permissions: + contents: read + pages: write + 
id-token: write + +concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: '1.26' + cache: false + + - name: Setup Hugo + uses: peaceiris/actions-hugo@v2 + with: + hugo-version: 'latest' + extended: true + + - name: Build docs + working-directory: docs + run: | + hugo mod get + hugo --minify --baseURL "https://breee.github.io/drop/" + + - name: Upload artifact + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + uses: actions/upload-pages-artifact@v3 + with: + path: docs/public + + deploy: + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + needs: build + runs-on: ubuntu-latest + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..80f694b --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,66 @@ +name: Release + +on: + push: + tags: + - "v*" + +permissions: + contents: write + packages: write + +jobs: + ci: + uses: ./.github/workflows/ci.yml + + release: + needs: ci + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Login to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: ghcr.io/${{ github.repository }} + tags: | + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + + - name: Build and push multi-arch image + uses: docker/build-push-action@v6 + with: + context: 
. + platforms: linux/amd64,linux/arm64 + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Package and push Helm chart + run: | + helm package charts/drop --version ${GITHUB_REF_NAME#v} --app-version ${GITHUB_REF_NAME#v} + helm push drop-*.tgz oci://ghcr.io/${{ github.repository_owner }}/charts + + - name: Create GitHub Release + uses: softprops/action-gh-release@v2 + with: + generate_release_notes: true diff --git a/.github/workflows/weekly-release.yml b/.github/workflows/weekly-release.yml new file mode 100644 index 0000000..488a0c1 --- /dev/null +++ b/.github/workflows/weekly-release.yml @@ -0,0 +1,92 @@ +name: Weekly Release + +on: + schedule: + # Every Monday at 06:00 UTC + - cron: "0 6 * * 1" + workflow_dispatch: {} + +permissions: + contents: write + packages: write + +jobs: + ci: + uses: ./.github/workflows/ci.yml + + weekly-release: + needs: ci + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Determine next version + id: version + run: | + # Get latest tag or default to v0.0.0 + LATEST_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0") + echo "latest=$LATEST_TAG" >> "$GITHUB_OUTPUT" + + # Bump patch version + VERSION=${LATEST_TAG#v} + IFS='.' 
read -r MAJOR MINOR PATCH <<< "$VERSION" + PATCH=$((PATCH + 1)) + NEXT="v${MAJOR}.${MINOR}.${PATCH}" + echo "next=$NEXT" >> "$GITHUB_OUTPUT" + + - name: Create and push tag + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git tag -a "${{ steps.version.outputs.next }}" -m "Weekly release ${{ steps.version.outputs.next }}" + git push origin "${{ steps.version.outputs.next }}" + + - uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Login to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: ghcr.io/${{ github.repository }} + tags: | + type=semver,pattern={{version}},value=${{ steps.version.outputs.next }} + type=semver,pattern={{major}}.{{minor}},value=${{ steps.version.outputs.next }} + + - name: Build and push multi-arch image + uses: docker/build-push-action@v6 + with: + context: . + platforms: linux/amd64,linux/arm64 + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Package and push Helm chart + run: | + VERSION=${{ steps.version.outputs.next }} + helm package charts/drop --version ${VERSION#v} --app-version ${VERSION#v} + helm push drop-*.tgz oci://ghcr.io/${{ github.repository_owner }}/charts + + - name: Create GitHub Release + uses: softprops/action-gh-release@v2 + with: + tag_name: ${{ steps.version.outputs.next }} + generate_release_notes: true + body: | + Automated weekly release to keep images up to date with latest base images and dependency patches. 
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..06989d0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,41 @@ +# Binaries for programs and plugins +*.exe +*.exe~ +*.dll +*.so +*.dylib +bin/ +testbin/ +Dockerfile.cross + +# Test binary, built with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out + +# Go workspace file +go.work + +# Kubernetes Generated files - skip generated files, except for vendored files +!vendor/**/zz_generated.* + +# editor and IDE paraphernalia +.idea +.vscode +*.swp +*.swo +*~ + +# Coverage +cover.out +coverage.html + +# Hugo docs build output +docs/public/ +docs/resources/ +docs/.hugo_build.lock + +# Generated docs-gen binary +/gen-ai-docs +.kubeconfig diff --git a/.golangci.yml b/.golangci.yml new file mode 100644 index 0000000..f2fd24f --- /dev/null +++ b/.golangci.yml @@ -0,0 +1,61 @@ +version: "2" + +run: + allow-parallel-runners: true + +linters: + default: none + enable: + - copyloopvar + - dupl + - errcheck + - ginkgolinter + - goconst + - gocyclo + - govet + - ineffassign + - lll + - misspell + - nakedret + - prealloc + - revive + - staticcheck + - unconvert + - unparam + - unused + settings: + revive: + rules: + - name: comment-spacings + goconst: + min-occurrences: 5 + exclusions: + presets: [] + rules: + - path: "api/*" + linters: + - lll + - path: "internal/*" + linters: + - dupl + - lll + - path: "hack/*" + linters: + - lll + - goconst + - staticcheck + - path: "_test\\.go" + linters: + - goconst + - path: "test/*" + linters: + - goconst + - staticcheck + - path: "internal/metrics/*" + linters: + - goconst + +formatters: + enable: + - gofmt + - goimports diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..cee7c01 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,21 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer 
+ - id: check-yaml + args: ['--allow-multiple-documents'] + - id: check-added-large-files + + - repo: https://github.com/golangci/golangci-lint + rev: v2.12.2 + hooks: + - id: golangci-lint + + - repo: https://github.com/norwoodj/helm-docs + rev: v1.14.2 + hooks: + - id: helm-docs + args: + - --chart-search-root=charts diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..33d65e7 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,65 @@ +# Agent Instructions + +## Critical Rules + +1. ALWAYS read project files (Tiltfile, Makefile, source) before acting. Never guess. +2. Documentation: short, concise, high-level. No volatile details. +3. Simplicity over complexity. DRY is NOT always best. No premature optimization. +4. Kubernetes: use kubectl explain or read CRD types before suggesting specs. +5. Security: never expose secrets in code or docs. +6. Tilt handles the dev loop. `tilt up` does everything. Don't suggest manual commands for automated steps. + +## Project: Drop + +Kubernetes operator (Go 1.23.0) that pre-caches container images on cluster nodes. + +## Quick Start + +```bash +make codegen # generate deepcopy + CRD manifests +go build ./... # compile +make test # unit tests +make docs-gen # regenerate AI docs +``` + +## Architecture + +- API group: `drop.corewire.io/v1alpha1` (cluster-scoped) +- Framework: Kubebuilder + controller-runtime +- Pull mechanism: short-lived Pods with `nodeName` + `command: ["true"]` + +## CRDs + +| Kind | Purpose | +|------|---------| +| CachedImage | CachedImage is the Schema for the cachedimages API. | +| CachedImageSet | CachedImageSet is the Schema for the cachedimagesets API. | +| PullPolicy | PullPolicy is the Schema for the pullpolicies API. It is a configuration-only resource with no status. | +| DiscoveryPolicy | DiscoveryPolicy is the Schema for the discoverypolicies API. 
| + +## Key Directories + +| Path | Contents | +|------|----------| +| api/v1alpha1 | Package v1alpha1 contains API Schema definitions for the drop v1alpha1 API group. | +| internal/controller | Reconciler implementations (one per CRD) | +| internal/discovery | Discovery source interface + implementations | +| internal/metrics | Prometheus metrics registration | +| internal/pacing | Shared pacing engine for rate-limited pulls | +| internal/podbuilder | Pure Pod construction function (no k8s client) | +| charts/drop/ | Helm chart | +| test/e2e/ | Chainsaw E2E tests | +| hack/gen-ai-docs/ | This doc generator | + +## Rules + +1. Run `make codegen` after changing api/v1alpha1/ types +2. Run `make docs-gen` after changing types or Makefile (regenerates this file) +3. Never edit generated files directly +4. All CRDs are cluster-scoped — no namespaced resources +5. No privileged containers — kubelet-based image pulls only +6. Status uses `metav1.Condition` with type "Ready" + +## Full Reference + +See [llms-full.txt](llms-full.txt) for complete CRD field documentation. diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..dd9ed99 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,33 @@ +# Build the manager binary +FROM docker.io/golang:1.26 AS builder +ARG TARGETOS +ARG TARGETARCH + +WORKDIR /workspace +# Copy the Go Modules manifests +COPY go.mod go.mod +COPY go.sum go.sum +# cache deps before building and copying source so that we don't need to re-download as much +# and so that source changes don't invalidate our downloaded layer +RUN go mod download + +# Copy the go source +COPY cmd/main.go cmd/main.go +COPY api/ api/ +COPY internal/ internal/ + +# Build +# GOARCH has no default value so that the binary is built for the platform of the host where the command +# was called. For example, if we call make docker-build in a local env running on Apple Silicon (M1), +# the docker BUILDPLATFORM arg will be linux/arm64, whereas on Apple x86 it will be linux/amd64. 
Therefore, +# by leaving it empty we can ensure that the container and binary shipped on it will have the same platform. +RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o manager cmd/main.go + +# Use distroless as minimal base image to package the manager binary +# Refer to https://github.com/GoogleContainerTools/distroless for more details +FROM gcr.io/distroless/static:nonroot +WORKDIR / +COPY --from=builder /workspace/manager . +USER 65532:65532 + +ENTRYPOINT ["/manager"] diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..0005348 --- /dev/null +++ b/Makefile @@ -0,0 +1,194 @@ +# Image URL to use all building/pushing image targets +IMG ?= controller:latest + +# Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set) +ifeq (,$(shell go env GOBIN)) +GOBIN=$(shell go env GOPATH)/bin +else +GOBIN=$(shell go env GOBIN) +endif + +CONTAINER_TOOL ?= docker +SHELL = /usr/bin/env bash -o pipefail +.SHELLFLAGS = -ec + +.PHONY: all +all: build + +##@ General + +.PHONY: help +help: ## Display this help. + @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-20s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) + +##@ Development + +.PHONY: build +build: ## Build manager binary. + go build -o bin/manager cmd/main.go + +.PHONY: run +run: ## Run controller from your host. + go run ./cmd/main.go + +.PHONY: fmt +fmt: ## Run go fmt. + go fmt ./... + +.PHONY: vet +vet: ## Run go vet. + go vet ./... + +.PHONY: lint +lint: golangci-lint ## Run golangci-lint. + $(GOLANGCI_LINT) run + +.PHONY: lint-fix +lint-fix: golangci-lint ## Run golangci-lint with auto-fix. + $(GOLANGCI_LINT) run --fix + +##@ Code Generation + +.PHONY: generate +generate: controller-gen ## Generate DeepCopy methods. + $(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..." 
+ +.PHONY: manifests +manifests: controller-gen ## Generate CRD and RBAC manifests. + $(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases + +.PHONY: codegen +codegen: generate manifests docs-gen ## Run all code generation (deepcopy + CRDs + docs). + +##@ Testing + +.PHONY: test +test: setup-envtest ## Run unit tests. + KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -coverprofile cover.out + +.PHONY: test-e2e +test-e2e: chainsaw ## Run Chainsaw E2E tests (requires kind cluster). + $(CHAINSAW) test test/e2e/ + +##@ Cluster + +.PHONY: kind-create +kind-create: ## Create kind cluster for development. + $(KIND) create cluster --name drop-dev --config hack/kind-config.yaml --wait 5m + +.PHONY: kind-delete +kind-delete: ## Delete the kind cluster. + $(KIND) delete cluster --name drop-dev + +.PHONY: install +install: manifests kustomize ## Install CRDs into cluster. + $(KUSTOMIZE) build config/crd | $(KUBECTL) apply -f - + +.PHONY: uninstall +uninstall: manifests kustomize ## Uninstall CRDs from cluster. + $(KUSTOMIZE) build config/crd | $(KUBECTL) delete --ignore-not-found -f - + +.PHONY: e2e-infra +e2e-infra: ## Deploy Prometheus + Registry for E2E/dev. + @chmod +x hack/e2e-infra/setup.sh && hack/e2e-infra/setup.sh + +##@ Docker + +.PHONY: docker-build +docker-build: ## Build docker image. + $(CONTAINER_TOOL) build -t ${IMG} . + +.PHONY: docker-push +docker-push: ## Push docker image. + $(CONTAINER_TOOL) push ${IMG} + +.PHONY: kind-load +kind-load: docker-build ## Build and load image into kind. + $(KIND) load docker-image ${IMG} --name drop-dev + +##@ Helm & Docs + +.PHONY: helm-lint +helm-lint: ## Lint the Helm chart. + helm lint charts/drop + +.PHONY: helm-template +helm-template: ## Render Helm templates locally. + helm template drop charts/drop + +.PHONY: docs-serve +docs-serve: ## Serve Hugo docs locally. 
+ cd docs && hugo server --buildDrafts --port 1313 + +.PHONY: docs-gen +docs-gen: ## Regenerate AI agent docs (llms.txt, instructions, etc.) from source. + go run ./hack/gen-ai-docs/ + +.PHONY: docs-gen-check +docs-gen-check: docs-gen ## Verify generated AI docs are up to date. + @git diff --exit-code knowledge.yaml llms.txt llms-full.txt .github/copilot-instructions.md .cursorrules AGENTS.md docs/doc-generation.md docs/content/docs/reference/_generated_*.md || \ + (echo "ERROR: generated docs are out of date — run 'make docs-gen'" && exit 1) + +.PHONY: tools +tools: ## Install local tooling and check optional docs/chart binaries. + @$(MAKE) kustomize controller-gen setup-envtest golangci-lint chainsaw + @command -v hugo >/dev/null 2>&1 || echo "WARNING: hugo not found — needed for docs" + @command -v helm >/dev/null 2>&1 || echo "WARNING: helm not found — needed for chart dev" + +##@ Tool Dependencies + +LOCALBIN ?= $(shell pwd)/bin +$(LOCALBIN): + mkdir -p $(LOCALBIN) + +KUBECTL ?= kubectl +KIND ?= kind +KUSTOMIZE ?= $(LOCALBIN)/kustomize +CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen +ENVTEST ?= $(LOCALBIN)/setup-envtest +GOLANGCI_LINT = $(LOCALBIN)/golangci-lint +CHAINSAW ?= $(LOCALBIN)/chainsaw + +KUSTOMIZE_VERSION ?= v5.6.0 +CONTROLLER_TOOLS_VERSION ?= v0.17.2 +ENVTEST_VERSION ?= $(shell go list -m -f "{{ .Version }}" sigs.k8s.io/controller-runtime | awk -F'[v.]' '{printf "release-%d.%d", $$2, $$3}') +ENVTEST_K8S_VERSION ?= $(shell go list -m -f "{{ .Version }}" k8s.io/api | awk -F'[v.]' '{printf "1.%d", $$3}') +GOLANGCI_LINT_VERSION ?= v2.12.2 +CHAINSAW_VERSION ?= v0.2.15 + +.PHONY: kustomize +kustomize: $(KUSTOMIZE) +$(KUSTOMIZE): $(LOCALBIN) + $(call go-install-tool,$(KUSTOMIZE),sigs.k8s.io/kustomize/kustomize/v5,$(KUSTOMIZE_VERSION)) + +.PHONY: controller-gen +controller-gen: $(CONTROLLER_GEN) +$(CONTROLLER_GEN): $(LOCALBIN) + $(call go-install-tool,$(CONTROLLER_GEN),sigs.k8s.io/controller-tools/cmd/controller-gen,$(CONTROLLER_TOOLS_VERSION)) + +.PHONY: 
setup-envtest +setup-envtest: $(ENVTEST) +$(ENVTEST): $(LOCALBIN) + $(call go-install-tool,$(ENVTEST),sigs.k8s.io/controller-runtime/tools/setup-envtest,$(ENVTEST_VERSION)) + +.PHONY: golangci-lint +golangci-lint: $(GOLANGCI_LINT) +$(GOLANGCI_LINT): $(LOCALBIN) + $(call go-install-tool,$(GOLANGCI_LINT),github.com/golangci/golangci-lint/v2/cmd/golangci-lint,$(GOLANGCI_LINT_VERSION)) + +.PHONY: chainsaw +chainsaw: $(CHAINSAW) +$(CHAINSAW): $(LOCALBIN) + $(call go-install-tool,$(CHAINSAW),github.com/kyverno/chainsaw,$(CHAINSAW_VERSION)) + +define go-install-tool +@[ -f "$(1)-$(3)" ] || { \ +set -e; \ +package=$(2)@$(3) ;\ +echo "Downloading $${package}" ;\ +rm -f $(1) || true ;\ +GOBIN=$(LOCALBIN) GOTOOLCHAIN=local go install $${package} ;\ +mv $(1) $(1)-$(3) ;\ +} ;\ +ln -sf $(1)-$(3) $(1) +endef diff --git a/PROJECT b/PROJECT new file mode 100644 index 0000000..b6ddea0 --- /dev/null +++ b/PROJECT @@ -0,0 +1,46 @@ +# Code generated by tool. DO NOT EDIT. +# This file is used to track the info used to scaffold your project +# and allow the plugins properly work. 
+# More info: https://book.kubebuilder.io/reference/project-config.html +domain: corewire.io +layout: +- go.kubebuilder.io/v4 +projectName: drop +repo: github.com/Breee/drop +resources: +- api: + crdVersion: v1 + namespaced: true + controller: true + domain: corewire.io + group: drop + kind: CachedImage + path: github.com/Breee/drop/api/v1alpha1 + version: v1alpha1 +- api: + crdVersion: v1 + namespaced: true + controller: true + domain: corewire.io + group: drop + kind: CachedImageSet + path: github.com/Breee/drop/api/v1alpha1 + version: v1alpha1 +- api: + crdVersion: v1 + namespaced: true + domain: corewire.io + group: drop + kind: PullPolicy + path: github.com/Breee/drop/api/v1alpha1 + version: v1alpha1 +- api: + crdVersion: v1 + namespaced: true + controller: true + domain: corewire.io + group: drop + kind: DiscoveryPolicy + path: github.com/Breee/drop/api/v1alpha1 + version: v1alpha1 +version: "3" diff --git a/README.md b/README.md index 82f5565..afc32e8 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,91 @@ -# puller -K8s Operator that pre-pulls images onto Kubernetes nodes without destroying Containerd +# drop + +A Kubernetes operator that pre-pulls container images onto nodes — safely, with pacing, and with automatic discovery. + +## Why + +When many CI jobs or workloads start simultaneously, Kubernetes nodes face a thundering herd of image pulls. We hit this running large-scale GitLab CI — concurrent pods on the same node all pulling the same large image would saturate bandwidth, stall containerd, and cascade into failures. + +**The problems:** + +- **Thundering herd** — a spike of pods on one node triggers parallel pulls of the same image, saturating node bandwidth and destabilizing containerd. +- **Registry overload** — sudden pull surges hit registry rate limits or cause outages. +- **Cold-start latency** — large images take minutes to pull, delaying workloads that need them immediately. 
+ +**Drop's approach:** pre-cache images on nodes *before* workloads need them, pace pulls to stay within safe limits, and automatically discover which images matter most. + +## What it does + +- **Pre-caches images** on selected nodes before workloads need them +- **Discovers images** automatically from Prometheus metrics or OCI registries based on your criteria (e.g. top-pulled images) +- **Paces pulls** to avoid saturating node bandwidth or registry rate limits +- **Reports errors** using standard Kubernetes status patterns (`ErrImagePull`, `ConnectionRefused`, etc.) + +## Quick Start + +```bash +# Install CRDs and operator via Helm +helm install drop charts/drop -n drop-system --create-namespace + +# Cache a single image +kubectl apply -f - <` in HTML head + +```html + +``` + +Agents parsing HTML discover the markdown variant without guessing URL patterns. + +### 4. `llms-full.txt` on the site + +Serve the complete project reference as a static file. One GET = entire context. Agents that support URL ingestion (ChatGPT, Claude) can consume the whole project. + +### 5. `llmsDescription` frontmatter + +Every page gets a machine-readable summary in frontmatter: + +```yaml +llmsDescription: | + Installation guide for drop. Prerequisites: K8s 1.28+, Helm 3.12+. + Install via: helm install drop oci://ghcr.io/breee/charts/drop +``` + +This feeds Hextra's llms.txt generation and gives agents per-page context without reading the full body. + +### 6. Context menu integration + +Hextra v0.12+ has built-in "Copy as Markdown" + custom links: + +```yaml +params: + page: + contextMenu: + enable: true + links: + - name: Open in ChatGPT + icon: chatgpt + url: "https://chatgpt.com/?q=Read+{markdown_url}+and+help+me+with+{title}" + - name: Open in Claude + icon: claude + url: "https://claude.ai/new?q=Read+{markdown_url}+and+help+me+with+{title}" +``` + +Zero custom code required. 
+ +--- + +## What Doesn't Work (Anti-Patterns) + +| Anti-pattern | Why it fails | +|-------------|--------------| +| Feature list on landing page | Says nothing an agent can act on. "Declarative CRDs" is not information. | +| Custom JavaScript for copy buttons | Theme updates break it. Use built-in features. | +| Hand-written CRD reference | Drifts within one sprint. Generate or die. | +| Deep navigation hierarchy | Agents (and humans) can't orient. Max 2 levels. | +| `_generated_` in URLs | Ugly, hard to remember. Use Hugo `aliases` for clean paths. | +| Separate "AI docs" section | The whole site should be AI-friendly. Not a ghetto. | +| Decorative gradients on cards | Noise. Adds nothing to information density. | +| Mixing install + usage in one page | Different questions at different times. Split. | +| Future/speculative pages in user docs | Noise. Keep them in `ai-docs/` planning folder. | +| LR Mermaid with many nodes | Renders tiny. Use TD (top-down) with fewer nodes for hero diagrams. | +| CSS in `head-end.html` partial | Use `assets/css/custom.css` — Hextra auto-loads it. | + +--- + +## Landing Page Formula + +``` +1. Title (one word) +2. Subtitle (one sentence — what it does) +3. Diagram (shows the mechanism — Mermaid or SVG) +4. One-line explanation below diagram +5. "I want to..." — 3 persona cards routing to: + - USE it (install → usage → monitoring) + - DEVELOP it (architecture → reference → contributing) + - FEED to AI (llms-full.txt) +``` + +No feature lists. No badges. No hero buttons that say "Documentation" (that's what the whole site is). 
+ +--- + +## Site Structure Formula + +``` +/ Landing: diagram + persona routing +/docs/ Nav hub: table of sections + one-line descriptions +/docs/install/ Prerequisites + helm command +/docs/usage/ YAML examples for each CRD +/docs/[topic]/ One page per distinct concern +/docs/monitoring/ Metrics, events, health checks +/docs/reference/ Section: generated field tables +/docs/reference/crds/ Generated: every field +/docs/reference/errors/ Generated: condition reasons +/docs/reference/metrics/ Generated: Prometheus metrics +/docs/reference/arch/ Generated: package graph + sequence diagrams +/docs/developing/ Build, test, lint, conventions +/llms.txt Auto-generated page index (Hextra built-in) +/llms-full.txt Static: complete reference in one file +``` + +Key principles: +- **Flat**: max 2 levels deep +- **Task-oriented**: pages named for what you DO, not what the system HAS +- **Examples first**: every CRD page starts with working YAML before the field table +- **Generated reference**: never hand-write what can be extracted from types + +--- + +## Hugo + Hextra Configuration (Complete) + +```yaml +# hugo.yaml — the essential config for AI-friendly docs +baseURL: https://your-site.io/project/ +enableGitInfo: true + +outputs: + home: [html, llms] + page: [html, markdown] + section: [html, rss, markdown] + +params: + page: + width: wide + contextMenu: + enable: true + links: + - name: Open in ChatGPT + icon: chatgpt + url: "https://chatgpt.com/?q=Read+{markdown_url}+and+help+me+with+{title}" + - name: Open in Claude + icon: claude + url: "https://claude.ai/new?q=Read+{markdown_url}+and+help+me+with+{title}" + displayUpdatedDate: true + search: + enable: true + type: flexsearch + flexsearch: + index: content +``` + +Only custom partial needed: `layouts/partials/custom/head-end.html` for ``. 
+ +--- + +## AI-Friendliness Scoring (0–50) + +Use this to evaluate any doc site: + +| # | Dimension | What to check | +|---|-----------|---------------| +| 1 | Discoverability | `/llms.txt` exists? `<link rel="alternate" type="text/markdown">` in head? | +| 2 | Machine-Readable Output | Pages available as clean markdown? No HTML leakage? | +| 3 | Structured Data | Consistent tables? Predictable field schemas? | +| 4 | Context Density | Information-to-noise ratio. Zero decorative text in markdown output. | +| 5 | Navigation Clarity | Flat hierarchy? Descriptive names? 2 clicks to anything? | +| 6 | Completeness | All APIs, fields, errors documented? | +| 7 | Actionability | Copy-pasteable YAML examples? Working commands? | +| 8 | Self-Description | `llmsDescription` frontmatter? Site explains its own structure? | +| 9 | Freshness Signals | Last-updated dates? Generation timestamps? | +| 10 | Integration Surface | "Open in ChatGPT/Claude" links? `llms-full.txt` endpoint? | + +Score each 0–5. Grade: A (45-50), B (38-44), C (30-37), D (<30). + +Full audit prompt in `hack/ai-friendliness-audit.md`. 
+ +--- + +## Generation Architecture + +``` +hack/gen-ai-docs/ +├── main.go — parse types, extract data, render all templates +└── templates.go — Go text/templates for every output file +``` + +### Extraction sources + +| Data | Source | Method | +|------|--------|--------| +| CRD fields, types, docs | `api/v1alpha1/*_types.go` | `go/parser` + `go/ast` | +| Defaults, enums | Kubebuilder markers | Regex on `+kubebuilder:` | +| Metrics | `internal/metrics/metrics.go` | Regex on Name/Help | +| Error reasons | Controller constants | AST grep | +| Make targets | `Makefile` | Regex on `target: ## description` | +| Package graph | Import statements | AST parse | +| Samples | `hack/dev-samples.yaml` | File read | + +### Output + +| File | Audience | +|------|----------| +| `llms.txt` | USE agents — project overview + page index | +| `llms-full.txt` | USE agents — complete field reference | +| `docs/static/llms-full.txt` | Same, served on Hugo site | +| `.github/copilot-instructions.md` | CODE agents (Copilot) | +| `.cursorrules` | CODE agents (Cursor) | +| `AGENTS.md` | CODE agents (generic) | +| `docs/content/docs/reference/_generated_*.md` | Humans (Hugo) | + +--- + +## Staleness Prevention + +| Mechanism | Purpose | +|-----------|---------| +| `make docs-gen` in CI | Fails if generated != committed | +| `# DO NOT EDIT` header | Humans don't accidentally modify | +| knowledge.yaml intermediate | New output = new template, no extractor change | +| Hugo `enableGitInfo` | Every page shows "Last updated" date | + +--- + +## Key Insight + +**Don't document features. Show the mechanism.** + +A Mermaid diagram that shows `CR → Operator → Pod → kubelet pulls → image cached` communicates more in 2 seconds than 6 feature cards ever will. The feature list is what the project already does — the diagram is how to think about it. + +Documentation is navigation. Route people by intent (use / develop / integrate), not by topic (CRDs / metrics / architecture). 
Topics are for the sidebar after you've already found the right section. diff --git a/ai-docs/13-discovery-architecture.md b/ai-docs/13-discovery-architecture.md new file mode 100644 index 0000000..b826314 --- /dev/null +++ b/ai-docs/13-discovery-architecture.md @@ -0,0 +1,332 @@ +# Feature: Discovery Architecture + +## Goal + +Replace legacy bash-script-based image discovery (Prometheus queries + registry tag fetching + DaemonSet YAML generation) with a declarative, operator-managed flow. The operator handles querying, filtering, ranking, and materializing `CachedImage` resources — no scripts, no manual `jq`/`yq`/`curl` pipelines. + +--- + +## How it replaces legacy scripts + +| Legacy step | Operator equivalent | +|-------------|-------------------| +| `curl` Prometheus with basic auth | `DiscoveryPolicy` source `type: prometheus` with `secretRef` | +| `jq` to parse response, rank by count | Operator parses Prometheus response, ranks internally | +| `curl` GitLab/registry API for tags | `DiscoveryPolicy` source `type: registry` with `secretRef` | +| Build image refs from tag+commit | Operator uses `imageTemplate` to construct full image refs | +| `jq -s sort_by | reverse | [:30]` | `topX` field on source + `maxImages` on policy | +| Generate DaemonSet YAML with `yq` | Operator creates/updates `CachedImage` resources (owned by `CachedImageSet`) | +| Manual re-run / cron | `syncInterval` triggers automatic periodic reconciliation | + +--- + +## Reconciliation flow + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ DiscoveryPolicy Reconciler │ +│ │ +│ 1. For each source in spec.sources: │ +│ a. Build HTTP client (endpoint + secretRef → auth/TLS) │ +│ b. Execute query/request │ +│ c. Parse response into unified ImageResult list │ +│ │ +│ 2. Merge results from all sources │ +│ 3. Apply imageFilter (regex) │ +│ 4. Rank by score (descending), truncate to maxImages │ +│ 5. Write discovered images to status.discoveredImages[] │ +│ 6. 
Requeue after syncInterval │ +└──────────────────────────────────────┬──────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ CachedImageSet Reconciler │ +│ │ +│ 1. If discoveryPolicyRef set: │ +│ a. Read DiscoveryPolicy.status.discoveredImages[] │ +│ b. Diff against existing child CachedImage resources │ +│ c. Create new CachedImage for newly discovered images │ +│ d. Delete CachedImage for images no longer discovered │ +│ e. All children have ownerReference → set for GC │ +│ │ +│ 2. If static images[] set: │ +│ a. Reconcile child CachedImage list to match spec │ +└─────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Query result contract + +Every source type must produce a **unified internal result**: a list of `ImageResult` items. The operator normalizes all backend responses into this shape. + +### `ImageResult` (internal, not a CRD) + +```go +type ImageResult struct { + Image string // fully qualified image reference (registry/repo:tag or @sha256:...) + Score float64 // ranking score (higher = more important, e.g. usage count) +} +``` + +### What the Prometheus query must return + +The operator expects Prometheus to return results where **each result has a label called `image`** containing the full image reference. The associated value is used as the score for ranking. + +**Required label:** `image` — the fully qualified image reference. + +**Score source:** +- For `query` (instant query): the current value of each result series. +- For `query_range`: the operator sums all values in the range (total usage). + +**Example query — top 30 images by container count over 7 days:** + +```promql +topk(30, + count by (image) ( + container_memory_working_set_bytes{ + container!="", + container!="POD", + namespace="build-stuff", + cluster="mycluster", + pod=~"runner-.*", + image!~".+\\.ecr\\.eu-central-1\\.amazonaws\\.com.+" + } + ) +) +``` + +The operator will: +1. 
Execute this query against the configured endpoint (with auth from `secretRef`). +2. Parse the response: extract `image` label → `ImageResult.Image`, metric value → `ImageResult.Score`. +3. Results are already ranked by Prometheus (`topk`), but operator re-sorts by score anyway for consistency. + +**Prometheus response format (standard `/api/v1/query` JSON):** + +```json +{ + "status": "success", + "data": { + "resultType": "vector", + "result": [ + { "metric": { "image": "registry.example.com/team/runner:v1.2.3" }, "value": [1716368400, "42"] }, + { "metric": { "image": "registry.example.com/team/helper:latest" }, "value": [1716368400, "38"] } + ] + } +} +``` + +The operator reads `result[].metric.image` and `result[].value[1]` (as float64 score). + +--- + +### What the registry source returns + +The operator queries the OCI Distribution API (`GET /v2/<name>/tags/list`) for each configured repository, then: +1. Filters tags by `tagFilter` regex. +2. Sorts by semver (if parseable) or lexicographic/date order. +3. Takes top X per repository. +4. Constructs full image refs: `<registry>/<repository>:<tag>`. +5. Optionally applies `imageTemplate` for complex ref construction (e.g. GitLab helper images with commit-based tags). + +**Registry response format (OCI standard):** + +```json +{ + "name": "gitlab-org/gitlab-runner/gitlab-runner-helper", + "tags": ["v17.0.0", "v16.11.0", "v16.10.0", "x86_64-abc1234", "x86_64-v17.0.0"] +} +``` + +--- + +## Image template (for complex image ref construction) + +Some registries use non-standard tag formats (e.g. GitLab runner helper uses `x86_64-<commit>` and `x86_64-<version>`). The `imageTemplate` field supports Go template syntax to construct the final image reference from tag metadata. 
+ +```yaml +sources: + - type: registry + registry: + url: https://registry.gitlab.com + repositories: + - gitlab-org/gitlab-runner/gitlab-runner-helper + tagFilter: "^v[0-9]+\\.[0-9]+\\.[0-9]+$" + topX: 5 + imageTemplate: "registry.gitlab.com/gitlab-org/gitlab-runner/gitlab-runner-helper:x86_64-{{ .Tag }}" + secretRef: + name: gitlab-registry-creds +``` + +Template variables available: +- `{{ .Tag }}` — the matched tag string +- `{{ .Repository }}` — the repository path +- `{{ .Registry }}` — the registry URL (without scheme) + +If `imageTemplate` is not set, the default is `/:`. + +--- + +## Concrete example: Replacing the legacy GitLab helper script + +**Legacy:** bash script curls GitLab API, extracts top 5 tags + commits, builds image refs with `x86_64-` and `x86_64-` suffixes, writes JSON. + +**Operator equivalent:** + +```yaml +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: gitlab-runner-helpers +spec: + sources: + - type: registry + registry: + url: https://registry.gitlab.com + repositories: + - gitlab-org/gitlab-runner/gitlab-runner-helper + tagFilter: "^v[0-9]+\\.[0-9]+\\.[0-9]+$" # only semver release tags + topX: 5 # top 5 most recent + imageTemplate: "registry.gitlab.com/gitlab-org/gitlab-runner/gitlab-runner-helper:x86_64-{{ .Tag }}" + secretRef: + name: gitlab-registry-token # optional: token for private registry + syncInterval: 1h + maxImages: 5 +--- +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImageSet +metadata: + name: gitlab-runner-helpers +spec: + discoveryPolicyRef: + name: gitlab-runner-helpers + policyRef: + name: build-pool-safe + nodeSelector: + node-role.kubernetes.io/build: "true" + tolerations: + - key: "node-role.kubernetes.io/build" + operator: "Exists" + effect: "NoSchedule" + pullPolicy: Always # helpers use moving tags + repullPolicy: OnSchedule +``` + +**Result:** operator discovers the 5 latest release tags, constructs `x86_64-v17.0.0` style refs, creates 5 `CachedImage` children, pulls 
them onto build nodes with safe pacing. No bash, no cron, no manual YAML generation. + +--- + +## Concrete example: Replacing the legacy Prometheus top-images script + +**Legacy:** bash script curls Prometheus with basic auth, queries `container_memory_working_set_bytes`, parses with `jq`, sorts, takes top 30, generates DaemonSet YAML with `yq`. + +**Operator equivalent:** + +```yaml +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: popular-build-images +spec: + sources: + - type: prometheus + prometheus: + endpoint: https://mimir.example.com/prometheus + query: | + topk(30, + count by (image) ( + container_memory_working_set_bytes{ + container!="", + container!="POD", + namespace="build-stuff", + cluster="mycluster", + pod=~"runner-.*", + image!~".+\\.ecr\\.eu-central-1\\.amazonaws\\.com.+" + } + ) + ) + interval: 6h # re-query every 6 hours + secretRef: + name: prometheus-creds # Secret: username=admin, password= + syncInterval: 6h + maxImages: 30 +--- +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImageSet +metadata: + name: popular-build-images +spec: + discoveryPolicyRef: + name: popular-build-images + policyRef: + name: build-pool-safe + nodeSelector: + node-role.kubernetes.io/build: "true" + tolerations: + - key: "node-role.kubernetes.io/build" + operator: "Exists" + effect: "NoSchedule" + pullPolicy: IfNotPresent + repullPolicy: OnSchedule +``` + +**Result:** operator queries Prometheus every 6h, discovers top 30 images by usage, creates/updates 30 `CachedImage` children (GC'd when they drop out of top 30), pulls them onto build nodes. No bash, no jq, no yq, no DaemonSet templating. + +--- + +## Design principles + +1. **Declarative over imperative** — user declares _what_ to discover, operator handles _how_. +2. **Simple query contract** — Prometheus queries must return an `image` label. That's the only requirement. +3. **Score-based ranking** — all sources produce scored results; operator merges and ranks uniformly. +4. 
**Template-based ref construction** — handles complex tag-to-image-ref mappings (GitLab helper pattern) without custom code. +5. **Secret-based auth** — any auth scheme works via standard k8s Secrets. No operator changes needed for new auth patterns. +6. **Automatic lifecycle** — discovered images that drop out of results get their `CachedImage` garbage-collected via owner references. +7. **Multi-source merge** — a single `DiscoveryPolicy` can combine Prometheus + registry results, deduplicating by image ref. + +--- + +## Status reporting + +```yaml +status: + lastSyncTime: "2026-05-22T09:00:00Z" + discoveredImages: + - image: "registry.example.com/team/runner:v17.0.0" + score: 42 + source: prometheus + - image: "registry.gitlab.com/gitlab-org/gitlab-runner/gitlab-runner-helper:x86_64-v17.0.0" + score: 0 # registry sources don't have usage scores, sorted by recency + source: registry + conditions: + - type: Ready + status: "True" + lastTransitionTime: "2026-05-22T09:00:00Z" + - type: SourceHealthy + status: "True" + message: "All 2 sources responding" +``` + +--- + +## Error handling + +| Failure | Behavior | +|---------|----------| +| Source endpoint unreachable | Retry with backoff, report condition `SourceHealthy=False` | +| Auth failure (401/403) | Report condition, don't clear previous results (stale-but-valid) | +| Query returns no results | Report condition `NoResults`, keep previous discovered set | +| Query returns invalid format (no `image` label) | Report condition `InvalidResponse`, keep previous set | +| Source timeout | Configurable via Secret or source config, default 30s | + +**Key principle:** on transient failures, keep the last known good discovery set. Only update when a source returns valid results. This prevents cache thrashing during outages. + +--- + +## Implementation phases + +1. **Phase 1:** Prometheus source only (covers the main use case). +2. **Phase 2:** Registry source with tag listing + `imageTemplate`. +3. 
**Phase 3:** Additional source types as needed (webhook, etc.). + +Each phase is independently useful and shippable. diff --git a/ai-docs/14-architecture.md b/ai-docs/14-architecture.md new file mode 100644 index 0000000..c3f28f0 --- /dev/null +++ b/ai-docs/14-architecture.md @@ -0,0 +1,546 @@ +# Architecture Plan + +## Overview + +The **drop** operator caches container images onto Kubernetes nodes declaratively. It replaces manual DaemonSet/script-based pre-pulling with a controller-driven reconciliation loop that is safe, paced, and observable. + +**Design principles:** +- Simple over clever — no over-abstraction, no premature optimization. +- Follow Go and Kubernetes operator best practices (Kubebuilder conventions, idempotent reconciliation, status subresource, owner references). +- Single-concern resources — each CRD does one thing well. +- Declarative intent — users declare *what* to cache; operator handles *how*. + +--- + +## System Architecture + +``` +┌──────────────────────────────────────────────────────────────────────────────┐ +│ Kubernetes API Server │ +│ │ +│ CRDs (drop.corewire.io/v1alpha1, all cluster-scoped): │ +│ ┌──────────────┐ ┌────────────────┐ ┌────────────┐ ┌─────────────────┐ │ +│ │ CachedImage │ │ CachedImageSet │ │ PullPolicy │ │ DiscoveryPolicy │ │ +│ └──────────────┘ └────────────────┘ └────────────┘ └─────────────────┘ │ +└──────────────────────────────────────────────────────────────────────────────┘ + ▲ ▲ │ + │ owns │ reads status │ + │ (ownerRef) │ ▼ +┌───────┴────────────────────┴─────────────────────────────────────────────────┐ +│ drop-controller-manager (single Deployment, leader-elected) │ +│ │ +│ ┌─────────────────────┐ ┌─────────────────────────┐ ┌──────────────────┐ │ +│ │ CachedImage │ │ CachedImageSet │ │ DiscoveryPolicy │ │ +│ │ Reconciler │ │ Reconciler │ │ Reconciler │ │ +│ │ │ │ │ │ │ │ +│ │ • create drop Pod │ │ • diff spec vs children │ │ • query sources │ │ +│ │ • track completion │ │ • create/delete children│ │ • 
write status │ │ +│ │ • update status │ │ • propagate defaults │ │ • requeue │ │ +│ └─────────────────────┘ └─────────────────────────┘ └──────────────────┘ │ +│ │ +│ Shared components: │ +│ • PullPolicy cache (in-memory read of PullPolicy resources) │ +│ • Rate limiter / pacing engine (enforces maxConcurrentNodes + delays) │ +│ • Metrics exporter (Prometheus /metrics endpoint) │ +└──────────────────────────────────────────────────────────────────────────────┘ + │ + │ creates Pods (drop jobs) + ▼ +┌──────────────────────────────────────────────────────────────────────────────┐ +│ Kubernetes Nodes │ +│ │ +│ ┌──────────────────────────────────────────────────────────────────┐ │ +│ │ Drop Pod (short-lived, one per image×node) │ │ +│ │ spec: │ │ +│ │ nodeName: │ │ +│ │ containers: │ │ +│ │ - name: pull │ │ +│ │ image: │ │ +│ │ command: ["true"] # exits immediately after pull │ │ +│ │ restartPolicy: Never │ │ +│ └──────────────────────────────────────────────────────────────────┘ │ +│ │ +│ containerd/CRI pulls the image layers (parallel layer downloads built-in) │ +└──────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Pull Mechanism + +### Approach: Short-lived Pods with `nodeName` + +The operator creates a short-lived Pod per (image, node) pair. The Pod's container uses the target image with `command: ["true"]` and `restartPolicy: Never`. The kubelet pulls the image onto the node as part of normal Pod scheduling, then the container exits immediately. 
+ +**Why this approach (not DaemonSet, not crictl):** + +| Approach | Pros | Cons | +|----------|------|------| +| DaemonSet with initContainers | Simple, native k8s | Hard to manage lifecycle, can't target individual nodes easily, restarts on change | +| Job per node with `crictl` | Direct CRI control | Requires privileged access, mounts runtime socket, security concern | +| **Pod with `nodeName` + `command: ["true"]`** | No privilege needed, uses standard kubelet image pull, easy cleanup, per-node targeting | Slightly more Pods to manage | + +The chosen approach: +- **No elevated privileges** — works with standard RBAC. +- **Uses native kubelet image pull** — respects node-level pull secrets, mirrors, and runtime configuration. +- **Simple lifecycle** — Pod completes → operator observes `.status.phase == Succeeded` → marks node as ready in `CachedImage` status. +- **Easy cleanup** — completed Pods are deleted by the operator after status is recorded. +- **Per-node control** — `nodeName` field pins the Pod to a specific node; operator controls which nodes get Pods and when. + +### Pod Spec (template) + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: drop-- + labels: + app.kubernetes.io/managed-by: drop + drop.corewire.io/cachedimage: + drop.corewire.io/node: + ownerReferences: + - apiVersion: drop.corewire.io/v1alpha1 + kind: CachedImage + name: + uid: + controller: true +spec: + nodeName: + containers: + - name: pull + image: : + command: ["true"] + resources: + requests: + cpu: "0" + memory: "0" + restartPolicy: Never + terminationGracePeriodSeconds: 0 + automountServiceAccountToken: false + enableServiceLinks: false + tolerations: +``` + +### imagePullPolicy on the Pod + +- When `CachedImage.spec.pullPolicy: IfNotPresent` → Pod container `imagePullPolicy: IfNotPresent` (skip if already on node). +- When `CachedImage.spec.pullPolicy: Always` → Pod container `imagePullPolicy: Always` (always check registry). 
+ +--- + +## Reconcilers + +### CachedImage Reconciler + +**Watches:** `CachedImage`, owned `Pod` resources. + +**Reconcile loop (idempotent):** + +``` +1. Fetch CachedImage CR +2. If being deleted → clean up any active drop Pods → remove finalizer → done +3. Resolve target nodes: + a. List nodes matching CachedImage.spec.nodeSelector + b. Filter by tolerations (skip nodes that carry a taint the CachedImage does not tolerate; untainted nodes always qualify) + c. Result: set of target node names +4. Resolve PullPolicy (from spec.policyRef, or use built-in defaults) +5. For each target node: + a. Check if drop Pod already exists (label selector) + b. If Pod exists and Succeeded → record node as ready in status + c. If Pod exists and Failed → record failure, apply backoff + d. If Pod does not exist and node not yet ready: + - Check pacing constraints (maxConcurrentNodes, minDelayBetweenPulls) + - If within budget → create drop Pod + - If over budget → skip, requeue +6. Update CachedImage.status: + - nodesTargeted, nodesReady, phase, conditions, lastPulledAt +7. Clean up completed/failed Pods (after recording status) +8. If all nodes ready → set phase=Ready, done +9. If work remaining → requeue (with delay based on pacing) +``` + +**Key design points:** +- Idempotent: calling Reconcile multiple times produces the same result. +- Rate limiting is per-CachedImage and global (via PullPolicy pacing check). +- The reconciler does NOT watch all Pods in the cluster — only Pods it owns (via `.Owns(&corev1.Pod{})`). +- Uses `GenerationChangedPredicate` on the `CachedImage` watch to avoid reconciling on its own status-only updates. The predicate must not filter owned Pod events: Pod completion is a status change and has to trigger a reconcile. + +### CachedImageSet Reconciler + +**Watches:** `CachedImageSet`, owned `CachedImage` resources, referenced `DiscoveryPolicy` (via watch with handler). + +**Reconcile loop:** + +``` +1. Fetch CachedImageSet CR +2. Determine desired image list: + a. If spec.images set → use static list + b. If spec.discoveryPolicyRef set → read DiscoveryPolicy.status.discoveredImages + c. Merge (static takes precedence for same image ref) +3. 
List existing child CachedImage resources (ownerReference filter) +4. Diff desired vs existing: + a. New images → create CachedImage with ownerRef pointing to this set + b. Removed images → delete child CachedImage explicitly (ownerRef GC only fires when the set itself is deleted) + c. Changed images → update child CachedImage spec +5. Propagate shared config to children: + - policyRef, nodeSelector, tolerations, pullPolicy, repullPolicy +6. Update CachedImageSet.status: + - imagesManaged, imagesReady (aggregate from children), phase, conditions +``` + +**Key design points:** +- Child `CachedImage` resources have `ownerReferences` → Kubernetes GC handles cleanup if the set is deleted. +- The reconciler watches `DiscoveryPolicy` changes via an explicit watch with `handler.EnqueueRequestsFromMapFunc` to trigger reconciliation when discovery results change. + +### DiscoveryPolicy Reconciler + +**Watches:** `DiscoveryPolicy`, referenced `Secret` resources (for auth credential rotation). + +**Reconcile loop:** + +``` +1. Fetch DiscoveryPolicy CR +2. For each source in spec.sources: + a. Build HTTP client: + - Read secretRef → populate auth headers/TLS config + - Set timeout (default 30s) + b. Execute source-specific query: + - Prometheus: GET /api/v1/query with query string + - Registry: GET /v2/<name>/tags/list + c. Parse response into []ImageResult{Image, Score} + d. On failure: log error, set condition, keep previous results, continue +3. Merge results from all sources (deduplicate by image ref, keep highest score) +4. Apply imageFilter regex (exclude non-matching) +5. Sort by score descending, truncate to maxImages +6. Write to status.discoveredImages +7. Update conditions (Ready, SourceHealthy) +8. Requeue after syncInterval +``` + +**Key design points:** +- On transient failures, preserve last known good results (no cache thrashing). +- Each source is independent — one failing source doesn't block others. +- The reconciler is purely a data producer; it does NOT create CachedImage resources directly. 
That responsibility belongs to `CachedImageSet`. + +--- + +## Pacing Engine + +The pacing engine is NOT a separate controller. It is shared logic called by the `CachedImage` reconciler before creating a drop Pod. + +```go +// PacingDecision determines if a new pull can be started right now. +type PacingDecision struct { + Allowed bool + RequeueIn time.Duration // if not allowed, when to retry +} + +func (p *PacingEngine) CanPull(ctx context.Context, policy *v1alpha1.PullPolicy) PacingDecision { + // 1. Count currently active drop Pods matching this policy's scope + // 2. If active >= policy.Spec.MaxConcurrentNodes → deny, requeue + // 3. Check time since last pull start for this policy + // 4. If elapsed < policy.Spec.MinDelayBetweenPulls → deny, requeue with remaining delay + // 5. Allow +} +``` + +**Implementation:** Query active Pods via label selectors (cached by informer). No external state store needed — all state is derived from the cluster. + +**Defaults (when no PullPolicy is referenced):** +- `maxConcurrentNodes: 1` — sequential, safest default. +- `minDelayBetweenPulls: 10s` — gentle pacing. +- `failureBackoff: initial=30s, max=5m` — exponential with cap. + +--- + +## Resource Relationships + +``` +PullPolicy ◄──── policyRef ─────── CachedImage + ▲ + │ ownerRef +PullPolicy ◄──── policyRef ─────── CachedImageSet ──── discoveryPolicyRef ───► DiscoveryPolicy + │ + │ creates (ownerRef) + ▼ + CachedImage (child) +``` + +- `PullPolicy` is referenced but never owns or is owned. +- `DiscoveryPolicy` is referenced by `CachedImageSet`; never owns or is owned. +- `CachedImageSet` owns child `CachedImage` resources. +- `CachedImage` owns drop `Pod` resources. 
+ +--- + +## Project Structure (Go) + +Following standard Kubebuilder layout: + +``` +drop/ +├── api/ +│ └── v1alpha1/ +│ ├── cachedimage_types.go +│ ├── cachedimageset_types.go +│ ├── pullpolicy_types.go +│ ├── discoverypolicy_types.go +│ ├── groupversion_info.go +│ └── zz_generated.deepcopy.go +├── cmd/ +│ └── main.go # manager entrypoint +├── internal/ +│ ├── controller/ +│ │ ├── cachedimage_controller.go +│ │ ├── cachedimageset_controller.go +│ │ └── discoverypolicy_controller.go +│ ├── pacing/ +│ │ └── engine.go # pacing logic (shared) +│ ├── discovery/ +│ │ ├── source.go # Source interface +│ │ ├── prometheus.go # Prometheus source implementation +│ │ └── registry.go # Registry source implementation +│ └── podbuilder/ +│ └── builder.go # constructs drop Pod specs +├── config/ +│ ├── crd/ # generated CRD manifests +│ ├── rbac/ # generated RBAC +│ ├── manager/ # manager Deployment +│ └── samples/ # example CRs +├── charts/ +│ └── drop/ # Helm chart +├── test/ +│ └── e2e/ # Kyverno Chainsaw test scenarios +├── docs/ # Hugo Hextra source +├── Dockerfile +├── Makefile +├── go.mod +└── go.sum +``` + +--- + +## Key Interfaces + +### Source Interface (Discovery) + +```go +// Source is the interface every discovery backend implements. +type Source interface { + // Fetch queries the backend and returns discovered images. + Fetch(ctx context.Context) ([]ImageResult, error) +} + +type ImageResult struct { + Image string + Score float64 +} +``` + +Each source type (`prometheus`, `registry`) implements this interface. Adding a new source = one new file implementing `Source`. No other changes needed. + +### Pod Builder + +```go +// BuildDropPod creates a Pod spec for pulling an image onto a specific node. +func BuildDropPod(ci *v1alpha1.CachedImage, nodeName string) *corev1.Pod +``` + +Single function, tested in isolation. No abstraction layers. 
+ +--- + +## Controller Registration + +```go +func main() { + mgr, _ := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ + LeaderElection: true, + LeaderElectionID: "drop.corewire.io", + // ... + }) + + // CachedImage controller - owns Pods + ctrl.NewControllerManagedBy(mgr). + For(&v1alpha1.CachedImage{}). + Owns(&corev1.Pod{}). + WithEventFilter(predicate.GenerationChangedPredicate{}). + Complete(&controller.CachedImageReconciler{}) + + // CachedImageSet controller - owns CachedImages, watches DiscoveryPolicy + ctrl.NewControllerManagedBy(mgr). + For(&v1alpha1.CachedImageSet{}). + Owns(&v1alpha1.CachedImage{}). + Watches(&v1alpha1.DiscoveryPolicy{}, handler.EnqueueRequestsFromMapFunc(mapDiscoveryToSets)). + WithEventFilter(predicate.GenerationChangedPredicate{}). + Complete(&controller.CachedImageSetReconciler{}) + + // DiscoveryPolicy controller + ctrl.NewControllerManagedBy(mgr). + For(&v1alpha1.DiscoveryPolicy{}). + Complete(&controller.DiscoveryPolicyReconciler{}) + + mgr.Start(ctrl.SetupSignalHandler()) +} +``` + +--- + +## RBAC (Least Privilege) + +```yaml +# Core operations +- apiGroups: ["drop.corewire.io"] + resources: ["cachedimages", "cachedimagesets", "pullpolicies", "discoverypolicies"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["drop.corewire.io"] + resources: ["cachedimages/status", "cachedimagesets/status", "discoverypolicies/status"] + verbs: ["get", "update", "patch"] + +# Drop Pods +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch", "create", "delete"] + +# Node listing (read-only) +- apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] + +# Secrets for discovery auth (read-only) +- apiGroups: [""] + resources: ["secrets"] + verbs: ["get"] + +# Events +- apiGroups: [""] + resources: ["events"] + verbs: ["create", "patch"] + +# Leader election +- apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["get", "list", "watch", "create", 
"update", "patch", "delete"] +``` + +--- + +## Status Conditions (standard k8s convention) + +All status types use `metav1.Condition` for consistency: + +**CachedImage conditions:** +- `Ready` — all target nodes have the image cached. +- `Progressing` — pulls are in progress. +- `Degraded` — some nodes have failed pulls (with message). + +**CachedImageSet conditions:** +- `Ready` — all child CachedImages are ready. +- `Progressing` — children are being created/reconciled. + +**DiscoveryPolicy conditions:** +- `Ready` — last sync was successful. +- `SourceHealthy` — all configured sources are responding. + +--- + +## Observability + +**Prometheus metrics (exposed on /metrics):** + +| Metric | Type | Description | +|--------|------|-------------| +| `drop_cachedimage_nodes_ready` | Gauge | Nodes with image cached per CachedImage | +| `drop_cachedimage_nodes_targeted` | Gauge | Target nodes per CachedImage | +| `drop_pull_duration_seconds` | Histogram | Time to pull an image onto a node | +| `drop_pull_failures_total` | Counter | Failed pull attempts | +| `drop_discovery_sync_duration_seconds` | Histogram | Discovery query duration | +| `drop_discovery_images_found` | Gauge | Number of images discovered per DiscoveryPolicy | +| `drop_active_pulls` | Gauge | Currently active drop Pods | + +**Kubernetes Events:** +- `PullSucceeded` — image successfully cached on node. +- `PullFailed` — image pull failed (with error message). +- `DiscoverySyncFailed` — discovery source query failed. +- `PolicyViolation` — pull rate exceeded (informational). 
+ +--- + +## Error Handling and Resilience + +| Scenario | Behavior | +|----------|----------| +| Drop Pod fails | Record failure in CachedImage status, apply exponential backoff from PullPolicy, retry | +| Node removed from cluster | CachedImage status updated on next reconcile (node drops from targeted set) | +| Node added to cluster | Reconciler picks up new node on next cycle, creates drop Pod if within pacing budget | +| Discovery source down | Keep last known good results, set SourceHealthy=False condition, retry on next syncInterval | +| PullPolicy deleted while referenced | CachedImage reconciler falls back to built-in defaults, emits warning event | +| CachedImageSet deleted | Kubernetes GC cascades deletion to child CachedImage resources (ownerRef) | +| Controller restart | Reconcilers rebuild state from existing CRs and Pods — no external state store needed | + +--- + +## Constraints and Non-Goals + +**Constraints:** +- All resources are cluster-scoped (nodes are cluster-scoped). +- Pulls must never affect node schedulability (non-disruptive guarantee). +- No CRI socket mounting, no privileged containers. +- Single binary, single Deployment, leader-elected. + +**Non-goals (explicitly out of scope):** +- Image garbage collection / cleanup (use Eraser or kubelet GC for that). +- Registry mirroring / caching proxy (use Spegel or registry mirrors). +- Pod scheduling decisions (this operator only pre-caches; it does not influence the scheduler). +- Multi-cluster support (single-cluster operator; run one instance per cluster). 
+ +--- + +## Implementation Phases + +| Phase | Scope | Outcome | +|-------|-------|---------| +| 1 | Project bootstrap + CRDs + `CachedImage` reconciler (static, single node) | Can declare an image and have it pulled onto a specific node | +| 2 | Multi-node targeting + `PullPolicy` pacing | Safe, throttled pulls across multiple nodes | +| 3 | `CachedImageSet` with static image lists | Group images, shared config, ownerRef GC | +| 4 | `DiscoveryPolicy` with Prometheus source | Auto-discover top images from metrics | +| 5 | Registry source + imageTemplate | Discover images from OCI registries | +| 6 | Helm chart, CI/CD, multi-arch images, docs | Production-ready distribution | + +Each phase is independently useful and deployable. No phase depends on later phases. + +--- + +## Validation Summary + +**Does this architecture follow Go best practices?** +- ✅ Standard project layout (Kubebuilder conventions). +- ✅ Interfaces for extensibility (`Source` interface). +- ✅ No globals — dependency injection via reconciler struct fields. +- ✅ Table-driven tests for Pod building, pacing logic. +- ✅ Packages grouped by domain responsibility, not by layer. + +**Does this follow Kubernetes operator best practices?** +- ✅ Idempotent reconciliation — safe to call multiple times. +- ✅ Status subresource for observed state. +- ✅ OwnerReferences for garbage collection. +- ✅ Leader election for single-writer safety. +- ✅ Event predicates to avoid unnecessary reconciliations. +- ✅ Least-privilege RBAC. +- ✅ Standard conditions pattern (`metav1.Condition`). +- ✅ Finalizers only where external cleanup is needed (none needed here — all resources are k8s-native). +- ✅ No watch on all Pods — only owned Pods via `.Owns()`. + +**Is it simple?** +- ✅ Three reconcilers, each with a single clear responsibility. +- ✅ No custom schedulers, no webhooks (for v1), no conversion webhooks. +- ✅ Pacing is shared utility code, not a separate controller. 
+- ✅ Discovery sources implement one interface with one method. +- ✅ Pull mechanism is a standard Pod — no DaemonSet lifecycle complexity. + +**Is it powerful?** +- ✅ Handles static and dynamic image lists. +- ✅ Extensible discovery (any backend that implements `Source`). +- ✅ Per-pool pacing via nodeSelector on PullPolicy. +- ✅ Automatic cleanup via ownerReferences. +- ✅ Observable via Prometheus metrics and k8s events. diff --git a/ai-docs/15-implementation-plan.md b/ai-docs/15-implementation-plan.md new file mode 100644 index 0000000..ea0618a --- /dev/null +++ b/ai-docs/15-implementation-plan.md @@ -0,0 +1,823 @@ +# Implementation Plan + +Detailed, step-by-step implementation plan for the drop operator. Each task includes exact commands, files to create/modify, acceptance criteria, and estimated effort. Tasks are ordered by dependency — later tasks depend on earlier ones completing. + +--- + +## Phase 1: Project Bootstrap + +### Task 1.1: Initialize Kubebuilder Project + +**Goal:** Scaffold Go project with Kubebuilder, establish module and project structure. + +**Commands:** +```bash +# Prerequisites: Go 1.22+, Kubebuilder 4.x +kubebuilder init --domain corewire.io --repo github.com/Breee/drop +``` + +**Files created (by scaffolding):** +- `go.mod` (module `github.com/Breee/drop`) +- `go.sum` +- `Makefile` (Kubebuilder-generated, with controller-gen, envtest, kustomize targets) +- `cmd/main.go` (manager entrypoint with leader election, health probes) +- `config/` (manager, RBAC, CRD kustomize bases) +- `Dockerfile` +- `PROJECT` (Kubebuilder project metadata) +- `.golangci.yml` (add manually — standard strict config) + +**Manual additions after scaffold:** +- Add `.golangci.yml` with `gofmt`, `govet`, `errcheck`, `staticcheck`, `unused`, `gosec` linters. +- Add `Taskfile.yml` (go-task) mirroring Make targets for developer preference. +- Add `.editorconfig` for consistent formatting. 
+- Add `.gitignore` for Go binaries, `bin/`, `testbin/`, `vendor/`, coverage files. + +**Acceptance criteria:** +- [ ] `make build` succeeds (empty operator binary compiles). +- [ ] `make test` succeeds (no tests yet, but envtest setup works). +- [ ] `go vet ./...` passes. +- [ ] `golangci-lint run` passes. + +--- + +### Task 1.2: Scaffold CRD APIs + +**Goal:** Create the four API types with all spec/status fields. + +**Commands:** +```bash +kubebuilder create api --group drop --version v1alpha1 --kind CachedImage --resource --controller +kubebuilder create api --group drop --version v1alpha1 --kind CachedImageSet --resource --controller +kubebuilder create api --group drop --version v1alpha1 --kind PullPolicy --resource --controller=false +kubebuilder create api --group drop --version v1alpha1 --kind DiscoveryPolicy --resource --controller +``` + +**Files to implement (after scaffold, fill in types):** + +#### `api/v1alpha1/cachedimage_types.go` +```go +type CachedImageSpec struct { + // Image is the fully qualified image reference (without tag/digest). + Image string `json:"image"` + // Tag to pull. Mutually exclusive with Digest. + // +optional + Tag string `json:"tag,omitempty"` + // Digest to pull (immutable reference). Mutually exclusive with Tag. + // +optional + Digest string `json:"digest,omitempty"` + // PullPolicy controls whether to pull if image exists on node. + // +kubebuilder:default=IfNotPresent + // +kubebuilder:validation:Enum=IfNotPresent;Always + PullPolicy string `json:"pullPolicy,omitempty"` + // RepullPolicy controls refresh behavior for cached images. + // +kubebuilder:default=Never + // +kubebuilder:validation:Enum=Never;OnSchedule;Always + RepullPolicy string `json:"repullPolicy,omitempty"` + // NodeSelector restricts which nodes to cache the image on. + // +optional + NodeSelector map[string]string `json:"nodeSelector,omitempty"` + // Tolerations allow targeting tainted nodes. 
+ // +optional + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` + // Priority is a pull ordering hint (lower values pulled first). + // +optional + Priority *int32 `json:"priority,omitempty"` + // PolicyRef references a PullPolicy for pacing controls. + // +optional + PolicyRef *PolicyReference `json:"policyRef,omitempty"` +} + +type CachedImageStatus struct { + // ObservedGeneration is the last generation reconciled. + ObservedGeneration int64 `json:"observedGeneration,omitempty"` + // Phase summarizes the overall state. + // +kubebuilder:validation:Enum=Pending;Pulling;Ready;Degraded + Phase string `json:"phase,omitempty"` + // NodesTargeted is the number of nodes that should have this image. + NodesTargeted int32 `json:"nodesTargeted,omitempty"` + // NodesReady is the number of nodes that have successfully pulled the image. + NodesReady int32 `json:"nodesReady,omitempty"` + // LastPulledAt is the timestamp of the most recent successful pull. + // +optional + LastPulledAt *metav1.Time `json:"lastPulledAt,omitempty"` + // Conditions represent the latest available observations. + Conditions []metav1.Condition `json:"conditions,omitempty"` +} + +type PolicyReference struct { + Name string `json:"name"` +} +``` + +#### `api/v1alpha1/cachedimageset_types.go` +```go +type CachedImageSetSpec struct { + // PolicyRef references a PullPolicy for pacing controls. + // +optional + PolicyRef *PolicyReference `json:"policyRef,omitempty"` + // DiscoveryPolicyRef references a DiscoveryPolicy for dynamic image lists. + // +optional + DiscoveryPolicyRef *DiscoveryPolicyReference `json:"discoveryPolicyRef,omitempty"` + // NodeSelector restricts which nodes to cache images on (propagated to children). + // +optional + NodeSelector map[string]string `json:"nodeSelector,omitempty"` + // Tolerations allow targeting tainted nodes (propagated to children). 
+ // +optional + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` + // Images is a static list of images to cache. + // +optional + Images []ImageEntry `json:"images,omitempty"` + // PullPolicy default for child CachedImage resources. + // +kubebuilder:default=IfNotPresent + // +kubebuilder:validation:Enum=IfNotPresent;Always + // +optional + PullPolicy string `json:"pullPolicy,omitempty"` + // RepullPolicy default for child CachedImage resources. + // +kubebuilder:default=Never + // +kubebuilder:validation:Enum=Never;OnSchedule;Always + // +optional + RepullPolicy string `json:"repullPolicy,omitempty"` +} + +type ImageEntry struct { + Image string `json:"image"` + Tag string `json:"tag,omitempty"` + Digest string `json:"digest,omitempty"` +} + +type DiscoveryPolicyReference struct { + Name string `json:"name"` +} + +type CachedImageSetStatus struct { + ObservedGeneration int64 `json:"observedGeneration,omitempty"` + Phase string `json:"phase,omitempty"` + ImagesManaged int32 `json:"imagesManaged,omitempty"` + ImagesReady int32 `json:"imagesReady,omitempty"` + Conditions []metav1.Condition `json:"conditions,omitempty"` +} +``` + +#### `api/v1alpha1/pullpolicy_types.go` +```go +type PullPolicySpec struct { + // MaxConcurrentNodes is the max nodes pulling simultaneously for this policy. + // +kubebuilder:default=1 + // +kubebuilder:validation:Minimum=1 + MaxConcurrentNodes int32 `json:"maxConcurrentNodes,omitempty"` + // MinDelayBetweenPulls is the minimum time between starting pulls on different nodes. + // +kubebuilder:default="10s" + MinDelayBetweenPulls metav1.Duration `json:"minDelayBetweenPulls,omitempty"` + // FailureBackoff configures retry delays on pull failures. + // +optional + FailureBackoff *BackoffConfig `json:"failureBackoff,omitempty"` + // RepullPolicyDefault is the default repull behavior for images referencing this policy. 
+ // +kubebuilder:default=Never + // +kubebuilder:validation:Enum=Never;OnSchedule;Always + RepullPolicyDefault string `json:"repullPolicyDefault,omitempty"` + // NodeSelector scopes this policy to a specific node pool. + // +optional + NodeSelector map[string]string `json:"nodeSelector,omitempty"` + // Tolerations match tainted nodes in the pool. + // +optional + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` +} + +type BackoffConfig struct { + // Initial delay before first retry. + // +kubebuilder:default="30s" + Initial metav1.Duration `json:"initial,omitempty"` + // Max delay cap for exponential backoff. + // +kubebuilder:default="5m" + Max metav1.Duration `json:"max,omitempty"` +} + +// PullPolicy has no status — it is a configuration-only resource. +type PullPolicyStatus struct{} +``` + +#### `api/v1alpha1/discoverypolicy_types.go` +```go +type DiscoveryPolicySpec struct { + // Sources is the list of discovery backends to query. + Sources []DiscoverySource `json:"sources"` + // ImageFilter is a regex to filter discovered images. + // +optional + ImageFilter string `json:"imageFilter,omitempty"` + // SyncInterval is how often to re-query sources. + // +kubebuilder:default="30m" + SyncInterval metav1.Duration `json:"syncInterval,omitempty"` + // MaxImages caps the number of discovered images. + // +kubebuilder:default=50 + // +kubebuilder:validation:Minimum=1 + MaxImages int32 `json:"maxImages,omitempty"` +} + +type DiscoverySource struct { + // Type identifies the backend (prometheus, registry). + // +kubebuilder:validation:Enum=prometheus;registry + Type string `json:"type"` + // Prometheus config (when type=prometheus). + // +optional + Prometheus *PrometheusSource `json:"prometheus,omitempty"` + // Registry config (when type=registry). + // +optional + Registry *RegistrySource `json:"registry,omitempty"` + // SecretRef references a Secret for auth/TLS for this source. 
+ // +optional + SecretRef *corev1.LocalObjectReference `json:"secretRef,omitempty"` +} + +type PrometheusSource struct { + Endpoint string `json:"endpoint"` + Query string `json:"query"` +} + +type RegistrySource struct { + URL string `json:"url"` + Repositories []string `json:"repositories"` + TagFilter string `json:"tagFilter,omitempty"` + TopX int32 `json:"topX,omitempty"` + ImageTemplate string `json:"imageTemplate,omitempty"` +} + +type DiscoveryPolicyStatus struct { + LastSyncTime *metav1.Time `json:"lastSyncTime,omitempty"` + DiscoveredImages []DiscoveredImage `json:"discoveredImages,omitempty"` + Conditions []metav1.Condition `json:"conditions,omitempty"` +} + +type DiscoveredImage struct { + Image string `json:"image"` + Score float64 `json:"score"` + Source string `json:"source"` +} +``` + +**Post-scaffold steps:** +```bash +make generate # deepcopy generators +make manifests # CRD YAML generation +``` + +**Acceptance criteria:** +- [ ] `make generate` succeeds. +- [ ] `make manifests` produces CRD YAML files in `config/crd/bases/`. +- [ ] `make build` compiles with all types defined. +- [ ] CRD YAMLs contain all fields with correct validation markers. +- [ ] `kubectl apply -f config/crd/bases/` succeeds against a kind cluster. + +--- + +### Task 1.3: Implement Pod Builder + +**Goal:** Build drop Pod specs in isolation from controller logic. + +**File:** `internal/podbuilder/builder.go` + +```go +package podbuilder + +// BuildDropPod creates a Pod spec for pulling an image onto a specific node. +func BuildDropPod(ci *v1alpha1.CachedImage, nodeName string, scheme *runtime.Scheme) (*corev1.Pod, error) +``` + +**Implementation details:** +- Set `pod.Spec.NodeName = nodeName`. +- Set container image to `ci.Spec.Image:ci.Spec.Tag` (or `@ci.Spec.Digest`). +- Set `command: ["true"]`, `restartPolicy: Never`. +- Set `imagePullPolicy` from `ci.Spec.PullPolicy`. +- Copy `tolerations` from `ci.Spec.Tolerations`. 
+- Set `ownerReference` to the CachedImage (via `controllerutil.SetControllerReference`). +- Set labels: `app.kubernetes.io/managed-by=drop`, `drop.corewire.io/cachedimage=<name>`, `drop.corewire.io/node=<nodeName>`. +- Set `automountServiceAccountToken: false`, `enableServiceLinks: false`, `terminationGracePeriodSeconds: 0`. +- Set resource requests to zero (pull-only Pod). + +**File:** `internal/podbuilder/builder_test.go` + +**Tests (table-driven):** +- Pod has correct nodeName. +- Pod has correct image ref (tag variant). +- Pod has correct image ref (digest variant). +- Pod has correct imagePullPolicy mapping. +- Pod has ownerReference set. +- Pod has expected labels. +- Pod tolerations match CachedImage tolerations. +- Pod has no resource requests/limits (other than zero). + +**Acceptance criteria:** +- [ ] `go test ./internal/podbuilder/...` passes. +- [ ] 100% branch coverage on builder function. + +--- + +### Task 1.4: Implement Pacing Engine + +**Goal:** Shared pacing logic that CachedImage reconciler calls before creating Pods. + +**File:** `internal/pacing/engine.go` + +```go +package pacing + +type Engine struct { + client client.Client +} + +type Decision struct { + Allowed bool + RequeueIn time.Duration +} + +// CanStartPull checks pacing constraints and returns whether a new pull can start. +func (e *Engine) CanStartPull(ctx context.Context, policy *v1alpha1.PullPolicy, cachedImageName string) (Decision, error) +``` + +**Implementation details:** +- List Pods with label `app.kubernetes.io/managed-by=drop` that are in Running/Pending phase. +- If policy has `nodeSelector`, filter active Pods to those on matching nodes. +- Count active pulls. If `>= policy.Spec.MaxConcurrentNodes` → deny. +- Find most recent Pod creation timestamp among active pulls for this policy scope. +- If `time.Since(lastCreated) < policy.Spec.MinDelayBetweenPulls` → deny with `RequeueIn` = remaining delay. +- Otherwise → allow. 
+ +**File:** `internal/pacing/engine_test.go` + +**Tests:** +- Allows when no active pulls exist. +- Denies when maxConcurrentNodes reached, returns correct requeue duration. +- Denies when minDelayBetweenPulls not elapsed, returns remaining duration. +- Allows when exactly at boundary (maxConcurrentNodes - 1 active). +- Handles nil policy (use defaults). +- Scopes correctly when policy has nodeSelector. + +**Acceptance criteria:** +- [ ] `go test ./internal/pacing/...` passes. +- [ ] Unit tests cover all decision paths. + +--- + +### Task 1.5: Implement CachedImage Reconciler + +**Goal:** Core reconciler that creates drop Pods and tracks node-level completion. + +**File:** `internal/controller/cachedimage_controller.go` + +**Reconcile loop implementation:** +1. Fetch CachedImage; handle not-found (deleted). +2. List nodes matching `spec.nodeSelector` (via `client.List` with label selector). +3. Filter nodes whose taints are tolerated by `spec.tolerations`. +4. Fetch referenced PullPolicy (or use defaults if none referenced / not found). +5. List owned Pods (label selector `drop.corewire.io/cachedimage=<name>`). +6. Build per-node state map: `{node → podStatus}`. +7. For nodes with Succeeded Pod → mark ready, delete Pod (cleanup). +8. For nodes with Failed Pod → record failure, calculate backoff, delete Pod. +9. For nodes with no Pod and not yet ready → check pacing via `pacing.Engine.CanStartPull()`. +10. If allowed → call `podbuilder.BuildDropPod()` → `client.Create()`. +11. Update `CachedImage.Status` (nodesTargeted, nodesReady, phase, conditions). +12. Return `ctrl.Result{RequeueAfter: ...}` based on pacing needs. + +**Controller setup:** +```go +func (r *CachedImageReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&v1alpha1.CachedImage{}). + Owns(&corev1.Pod{}). + WithEventFilter(predicate.GenerationChangedPredicate{}). 
+ Complete(r) +} +``` + +**Note:** `WithEventFilter` applies the predicate to every watch of this controller, including the owned Pods. Pod status updates do not change `metadata.generation`, so with `GenerationChangedPredicate` set this way a Pod reaching Succeeded/Failed does not trigger a reconcile. Scope the predicate to the primary resource instead (`For(&v1alpha1.CachedImage{}, builder.WithPredicates(predicate.GenerationChangedPredicate{}))`) so Pod completions are observed. + +**File:** `internal/controller/cachedimage_controller_test.go` + +**Tests (envtest-based integration):** +- Creating a CachedImage with one matching node → drop Pod created. +- Drop Pod completes → CachedImage status shows nodesReady=1, phase=Ready. +- Drop Pod fails → CachedImage status shows Degraded condition. +- Two nodes match, PullPolicy maxConcurrentNodes=1 → only one Pod at a time. +- NodeSelector filters nodes correctly. +- Deleting CachedImage cleans up Pods. +- Updating CachedImage spec triggers new reconcile. + +**Acceptance criteria:** +- [ ] `make test` passes (envtest integration tests). +- [ ] CachedImage reaches Ready phase when all target nodes complete. +- [ ] Pacing is respected (verified by checking Pod creation timing in tests). + +--- + +## Phase 2: Multi-Node Pacing + PullPolicy + +### Task 2.1: Complete Pacing Integration + +**Goal:** End-to-end verification that PullPolicy controls multi-node rollout speed. + +**Tests to add:** +- 5-node cluster, PullPolicy `maxConcurrentNodes: 2` → never more than 2 active drop Pods. +- PullPolicy `minDelayBetweenPulls: 5s` → Pods created at least 5s apart. +- Failure backoff: Pod fails → next retry respects exponential delay. +- PullPolicy update (e.g. increase maxConcurrentNodes) → immediate effect on next reconcile. + +**Acceptance criteria:** +- [ ] Integration tests pass with timing assertions. +- [ ] No race conditions under `MaxConcurrentReconciles > 1`. + +--- + +### Task 2.2: RepullPolicy (Moving Tags) + +**Goal:** Support refreshing images on schedule for moving tags like `latest`. + +**Implementation in CachedImage reconciler:** +- After a node is marked Ready, check `repullPolicy`: + - `Never` → do nothing until spec changes. + - `OnSchedule` → on next reconcile after syncInterval, create new drop Pod with `imagePullPolicy: Always`. + - `Always` → every reconcile cycle, re-pull (only for specific use cases). 
+- Track `lastPulledAt` per node in status to determine if refresh is due. + +**Acceptance criteria:** +- [ ] `OnSchedule` triggers re-pull after interval. +- [ ] `Never` does not re-pull. +- [ ] `Always` + `imagePullPolicy: Always` forces registry check on each cycle. + +--- + +## Phase 3: CachedImageSet + +### Task 3.1: Implement CachedImageSet Reconciler + +**File:** `internal/controller/cachedimageset_controller.go` + +**Reconcile loop:** +1. Fetch CachedImageSet CR. +2. Build desired image list from `spec.images` (static). +3. List existing child CachedImage resources (ownerReference match). +4. Diff: create new, delete removed, update changed. +5. For each child CachedImage, propagate: `policyRef`, `nodeSelector`, `tolerations`, `pullPolicy`, `repullPolicy`. +6. Set ownerReference on each child → parent CachedImageSet. +7. Update status: imagesManaged, imagesReady (count children with phase=Ready). + +**Controller setup:** +```go +func (r *CachedImageSetReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&v1alpha1.CachedImageSet{}). + Owns(&v1alpha1.CachedImage{}). + WithEventFilter(predicate.GenerationChangedPredicate{}). + Complete(r) +} +``` + +**Tests:** +- CachedImageSet with 3 static images → 3 CachedImage children created. +- Remove one image from set → child CachedImage deleted. +- Delete CachedImageSet → all children garbage collected (ownerRef cascade). +- Config propagation: change nodeSelector on set → children updated. + +**Acceptance criteria:** +- [ ] Static image list CRUD works correctly. +- [ ] OwnerReference cascade deletion works. +- [ ] Status aggregation reflects child states. 
+ +--- + +## Phase 4: DiscoveryPolicy (Prometheus) + +### Task 4.1: Implement Source Interface + Prometheus Source + +**File:** `internal/discovery/source.go` +```go +package discovery + +type Source interface { + Fetch(ctx context.Context) ([]ImageResult, error) +} + +type ImageResult struct { + Image string + Score float64 +} +``` + +**File:** `internal/discovery/prometheus.go` + +**Implementation:** +- Build HTTP client with auth from Secret (basic auth or bearer token). +- Execute `GET /api/v1/query` with `query` parameter. +- Parse standard Prometheus JSON response. +- Extract `image` label from each result → `ImageResult.Image`. +- Extract metric value → `ImageResult.Score`. +- Return sorted results. + +**Tests (unit with httptest):** +- Valid Prometheus response → correct ImageResult list. +- Missing `image` label → skip result, don't error. +- Auth headers applied from Secret data. +- HTTP error → return error (caller handles gracefully). +- Timeout respected. + +**Acceptance criteria:** +- [ ] `go test ./internal/discovery/...` passes. +- [ ] Prometheus source handles real response format correctly. + +--- + +### Task 4.2: Implement DiscoveryPolicy Reconciler + +**File:** `internal/controller/discoverypolicy_controller.go` + +**Reconcile loop:** +1. Fetch DiscoveryPolicy CR. +2. For each source in `spec.sources`: + a. Resolve Secret (if secretRef set). + b. Construct appropriate `Source` implementation. + c. Call `source.Fetch(ctx)`. + d. On error: set condition `SourceHealthy=False`, keep previous status, continue. +3. Merge all results (deduplicate by image, keep highest score). +4. Apply `imageFilter` regex. +5. Sort by score descending, truncate to `maxImages`. +6. Write `status.discoveredImages`. +7. Set conditions (`Ready`, `SourceHealthy`). +8. Return `ctrl.Result{RequeueAfter: syncInterval}`. + +**Tests:** +- Single Prometheus source → discovered images appear in status. +- Source failure → condition set, previous results preserved. 
+- imageFilter excludes non-matching images. +- maxImages truncation works. +- syncInterval causes periodic requeue. + +**Acceptance criteria:** +- [ ] Discovery results appear in status. +- [ ] Transient failure preserves last good results. +- [ ] Conditions reflect source health. + +--- + +### Task 4.3: Connect CachedImageSet to DiscoveryPolicy + +**Modification:** `internal/controller/cachedimageset_controller.go` + +**Changes:** +- If `spec.discoveryPolicyRef` is set, read `DiscoveryPolicy.status.discoveredImages`. +- Convert discovered images to desired CachedImage list. +- Merge with static `spec.images` (static wins on conflict). +- Add watch: `Watches(&v1alpha1.DiscoveryPolicy{}, handler.EnqueueRequestsFromMapFunc(mapDiscoveryToSets))`. + +**The map function:** +```go +func mapDiscoveryToSets(ctx context.Context, obj client.Object) []reconcile.Request { + // List all CachedImageSets that reference this DiscoveryPolicy + // Return reconcile.Request for each +} +``` + +**Tests:** +- DiscoveryPolicy updates status → CachedImageSet reconciles → children updated. +- Image drops from discovery → child CachedImage deleted. +- New image discovered → child CachedImage created. + +**Acceptance criteria:** +- [ ] End-to-end: DiscoveryPolicy discovers images → CachedImageSet materializes children → CachedImage pulls onto nodes. +- [ ] GC works when images leave discovery results. + +--- + +## Phase 5: Registry Source + +### Task 5.1: Implement Registry Source + +**File:** `internal/discovery/registry.go` + +**Implementation:** +- HTTP client with auth from Secret (bearer token or basic auth). +- `GET /v2/<name>/tags/list` for each entry in `repositories` (OCI Distribution API). +- Parse tag list response. +- Apply `tagFilter` regex. +- Sort by semver (if parseable) or lexicographic. +- Take top X. +- Apply `imageTemplate` (Go `text/template`) to construct full image refs. +- Return `[]ImageResult` (score = index-based ranking for recency). + +**Tests:** +- Valid tag list → correct image refs constructed. 
+- tagFilter excludes non-matching tags. +- imageTemplate produces expected refs (GitLab helper pattern). +- Semver sorting works correctly. +- Auth headers applied. +- Pagination handling (if registry returns `Link` header). + +**Acceptance criteria:** +- [ ] `go test ./internal/discovery/...` passes. +- [ ] GitLab helper image pattern works with `imageTemplate`. + +--- + +## Phase 6: Production Readiness + +### Task 6.1: Helm Chart + +**Directory:** `charts/drop/` + +**Structure:** +``` +charts/drop/ +├── Chart.yaml +├── values.yaml +├── templates/ +│ ├── deployment.yaml +│ ├── serviceaccount.yaml +│ ├── clusterrole.yaml +│ ├── clusterrolebinding.yaml +│ ├── _helpers.tpl +│ └── NOTES.txt +└── crds/ + └── (symlinked or copied from config/crd/bases/) +``` + +**values.yaml key settings:** +- `image.repository`, `image.tag` +- `replicaCount: 1` (leader election handles HA) +- `resources` (sensible defaults for controller) +- `leaderElection.enabled: true` +- `metrics.enabled: true` +- `serviceMonitor.enabled: false` (opt-in) + +**Acceptance criteria:** +- [ ] `helm lint charts/drop` passes. +- [ ] `helm template drop charts/drop` produces valid YAML. +- [ ] `helm install` on kind cluster deploys working operator. + +--- + +### Task 6.2: CI Pipeline (GitHub Actions) + +**File:** `.github/workflows/ci.yml` + +**Jobs:** +1. **lint** — `golangci-lint run` +2. **test** — `make test` (unit + envtest) +3. **build** — `make build` (compile binary) +4. **e2e** — Create kind cluster → install CRDs → run Kyverno Chainsaw tests +5. **docker** — Build multi-arch image (`linux/amd64`, `linux/arm64`) via `docker buildx` + +**File:** `.github/workflows/release.yml` + +**Trigger:** on tag `v*` + +**Jobs:** +1. Run CI pipeline (lint, test, build, e2e). +2. Build + push multi-arch image to `ghcr.io/breee/drop:<tag>`. +3. Package Helm chart → push to GHCR OCI registry. +4. Create GitHub Release with changelog (generated from conventional commits via `git-cliff` or similar). 
+ +**Acceptance criteria:** +- [ ] CI passes on PRs. +- [ ] Release produces multi-arch image on GHCR. +- [ ] Helm chart is pullable from GHCR OCI. + +--- + +### Task 6.3: E2E Tests (Kyverno Chainsaw) + +**Directory:** `test/e2e/` + +**Scenario files (Chainsaw YAML):** + +1. `test/e2e/static-pull/chainsaw-test.yaml` — Create CachedImage → verify drop Pod created → verify status Ready. +2. `test/e2e/pull-policy/chainsaw-test.yaml` — Create PullPolicy + 2 CachedImages → verify sequential pulls. +3. `test/e2e/image-set/chainsaw-test.yaml` — Create CachedImageSet with static images → verify children created. +4. `test/e2e/discovery/chainsaw-test.yaml` — Create DiscoveryPolicy (mock Prometheus) → verify discovered images in status. +5. `test/e2e/cleanup/chainsaw-test.yaml` — Delete CachedImageSet → verify children and Pods cleaned up. + +**Acceptance criteria:** +- [ ] All Chainsaw scenarios pass against kind cluster. +- [ ] Tests complete within 5 minutes. + +--- + +### Task 6.4: Dockerfile (Multi-Arch) + +**File:** `Dockerfile` + +```dockerfile +FROM --platform=$BUILDPLATFORM golang:1.22 AS builder +ARG TARGETOS TARGETARCH +WORKDIR /workspace +COPY go.mod go.sum ./ +RUN go mod download +COPY . . +RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -o manager cmd/main.go + +FROM gcr.io/distroless/static:nonroot +WORKDIR / +COPY --from=builder /workspace/manager . +USER 65532:65532 +ENTRYPOINT ["/manager"] +``` + +**Acceptance criteria:** +- [ ] Builds for `linux/amd64` and `linux/arm64`. +- [ ] Final image is < 50MB. +- [ ] Runs as non-root. + +--- + +### Task 6.5: Documentation (Hugo Hextra) + +**Directory:** `docs/` + +**Pages:** +- `docs/content/_index.md` — Landing page. +- `docs/content/getting-started.md` — Quickstart with Helm. +- `docs/content/crds/cachedimage.md` — CRD reference. +- `docs/content/crds/cachedimageset.md` — CRD reference. +- `docs/content/crds/pullpolicy.md` — CRD reference. +- `docs/content/crds/discoverypolicy.md` — CRD reference. 
+- `docs/content/guides/static-images.md` — How to cache specific images. +- `docs/content/guides/discovery.md` — How to set up Prometheus discovery. +- `docs/content/architecture.md` — High-level architecture for users. + +**Acceptance criteria:** +- [ ] `hugo serve` renders docs locally. +- [ ] CRD reference docs generated/synced from code comments. + +--- + +## Dependency Graph + +``` +Task 1.1 (bootstrap) + └─► Task 1.2 (CRD APIs) + ├─► Task 1.3 (Pod builder) + ├─► Task 1.4 (Pacing engine) + └─► Task 1.5 (CachedImage reconciler) ◄── depends on 1.3 + 1.4 + └─► Task 2.1 (Pacing integration tests) + └─► Task 2.2 (RepullPolicy) + └─► Task 3.1 (CachedImageSet reconciler) + └─► Task 4.1 (Source interface + Prometheus) + └─► Task 4.2 (DiscoveryPolicy reconciler) + └─► Task 4.3 (Connect Set ↔ Discovery) + └─► Task 5.1 (Registry source) + +Task 6.1 (Helm) ◄── depends on Task 1.5+ (needs working operator) +Task 6.2 (CI) ◄── depends on Task 1.1 (needs compilable project) +Task 6.3 (E2E) ◄── depends on Task 1.5+ (needs reconciler) +Task 6.4 (Dockerfile) ◄── depends on Task 1.1 +Task 6.5 (Docs) ◄── can start anytime, references CRD types +``` + +--- + +## Effort Estimates + +| Task | Effort | Complexity | +|------|--------|------------| +| 1.1 Bootstrap | Small | Low — scaffolding | +| 1.2 CRD APIs | Medium | Low — type definitions | +| 1.3 Pod builder | Small | Low — single function | +| 1.4 Pacing engine | Medium | Medium — timing logic | +| 1.5 CachedImage reconciler | Large | High — core reconciler | +| 2.1 Pacing integration | Medium | Medium — timing tests | +| 2.2 RepullPolicy | Small | Low — add condition | +| 3.1 CachedImageSet | Medium | Medium — child management | +| 4.1 Prometheus source | Medium | Medium — HTTP + parsing | +| 4.2 DiscoveryPolicy reconciler | Medium | Medium — multi-source | +| 4.3 Connect Set ↔ Discovery | Small | Low — wire existing | +| 5.1 Registry source | Medium | Medium — OCI API | +| 6.1 Helm | Small | Low — templating | +| 6.2 CI | 
Medium | Low — standard GHA | +| 6.3 E2E | Medium | Medium — scenario design | +| 6.4 Dockerfile | Small | Low — standard | +| 6.5 Docs | Medium | Low — content creation | + +--- + +## Quality Gates (Per Task) + +Every task must pass before moving to the next: + +1. **Compiles** — `make build` succeeds. +2. **Lints** — `golangci-lint run` passes. +3. **Unit tests** — `make test` passes with new tests. +4. **No regressions** — all existing tests still pass. +5. **CRD validation** — `make manifests` produces valid CRDs. + +For Phase 6 tasks additionally: +6. **E2E** — Chainsaw scenarios pass on kind. +7. **Helm** — `helm lint` + `helm template` pass. +8. **Image** — `docker build` succeeds for both architectures. + +--- + +## Review Checklist + +This plan meets the project's standards: + +- ✅ **Simple architecture** — three reconcilers, each doing one thing. No webhooks, no custom schedulers, no abstraction layers beyond what's needed. +- ✅ **No premature optimization** — pacing uses Pod listing (informer-cached), no external databases or caches. Adds complexity only when proven necessary. +- ✅ **Go best practices** — interfaces for extensibility, table-driven tests, dependency injection, standard project layout, no globals. +- ✅ **Kubernetes operator best practices** — idempotent reconciliation, ownerRefs for GC, status subresource, leader election, least-privilege RBAC, event predicates. +- ✅ **Testable** — every component testable in isolation (pod builder, pacing, sources) and integrated (envtest, Chainsaw). +- ✅ **Incrementally shippable** — Phase 1 alone is useful (static image caching). Each phase adds value independently. +- ✅ **No guesses** — pull mechanism (nodeName Pod), pacing (informer-based counting), discovery (Source interface) are all patterns used by production Kubernetes operators (kube-fledged, eraser, etc.). 
diff --git a/ai-docs/16-docs-redesign-proposal.md b/ai-docs/16-docs-redesign-proposal.md new file mode 100644 index 0000000..0aec7e5 --- /dev/null +++ b/ai-docs/16-docs-redesign-proposal.md @@ -0,0 +1,228 @@ +# Documentation Redesign Proposal + +## Problem + +Current landing page lists features nobody cares about until they already know the project. A visitor needs to answer two questions in <5 seconds: + +1. **What does this do?** → A diagram worth 1000 words +2. **Where do I go?** → Depends on who I am + +## Landing Page Design + +### Hero: One Diagram + +```mermaid +flowchart LR + subgraph You["Your Cluster"] + CR["CachedImage CR"] --> Ctrl["Drop Operator"] + Ctrl --> Pod1["Pod (node-1)"] + Ctrl --> Pod2["Pod (node-2)"] + Ctrl --> Pod3["Pod (node-3)"] + Pod1 -.->|"kubelet pulls"| Img["nginx:latest"] + Pod2 -.->|"kubelet pulls"| Img + Pod3 -.->|"kubelet pulls"| Img + end + Pod1 -->|"exits"| Done["✓ Image cached"] + Pod2 -->|"exits"| Done + Pod3 -->|"exits"| Done +``` + +Below the diagram, one sentence: + +> **Drop creates short-lived Pods on each node. The kubelet pulls the image, the Pod exits. No privileges, no DaemonSets.** + +### Navigation: Three Personas + +``` +┌─────────────────────────────────────────────────────────┐ +│ I want to... │ +├───────────────┬───────────────────┬─────────────────────┤ +│ USE Drop │ DEVELOP Drop │ INTEGRATE (Agent) │ +│ │ │ │ +│ • Install │ • Architecture │ • llms.txt │ +│ • Configure │ • CRD Reference │ • llms-full.txt │ +│ • Monitor │ • Contributing │ • Markdown API │ +│ │ • Testing │ • Agent instruct. 
│ +└───────────────┴───────────────────┴─────────────────────┘ +``` + +## Proposed Site Structure + +``` +/drop/ ← Landing: diagram + persona links +/drop/docs/ ← Docs index (short, links only) +/drop/docs/install/ ← Helm install, prerequisites +/drop/docs/usage/ ← CachedImage, CachedImageSet, PullPolicy examples +/drop/docs/discovery/ ← DiscoveryPolicy guide +/drop/docs/monitoring/ ← Metrics, events, dashboards +/drop/docs/reference/crds/ ← Generated field reference +/drop/docs/reference/errors/ ← Status conditions lookup +/drop/docs/reference/metrics/ ← Prometheus metrics table +/drop/docs/reference/arch/ ← Package graph, sequence diagrams +/drop/llms.txt ← Site index for AI agents (auto-generated by Hextra) +/drop/llms-full.txt ← Complete reference in one file +``` + +### What changed vs. current + +| Current | Proposed | Why | +|---------|----------|-----| +| `getting-started.md` (install + usage mixed) | Split into `install/` and `usage/` | Different questions at different times | +| `observability.md` | `monitoring/` | Clearer name | +| `kamera.md` at top level | Remove from docs (it's a future evaluation, not user-facing) | Noise | +| 6 feature cards on homepage | 1 diagram + 3 persona links | Shows vs. tells | +| `_generated_crds` URLs | `reference/crds/` (aliases already done) | Clean | + +## Landing Page Content (Markdown) + +```markdown +--- +title: Drop +layout: hextra-home +--- + +
+{{< hextra/hero-headline >}} + Drop +{{< /hextra/hero-headline >}} +
+ +
+{{< hextra/hero-subtitle >}} + Pre-cache container images on Kubernetes nodes. +{{< /hextra/hero-subtitle >}} +
+ + +```mermaid +flowchart LR + CR[CachedImage] --> Op[Drop Operator] + Op --> P1[Pod node-1] + Op --> P2[Pod node-2] + Op --> P3[Pod node-3] + P1 -.->|pull| I[image] + P2 -.->|pull| I + P3 -.->|pull| I + P1 --> X1[✓ cached] + P2 --> X2[✓ cached] + P3 --> X3[✓ cached] +``` + +> Create a CachedImage CR → operator creates a Pod per node → kubelet pulls the image → Pod exits → image is warm on every node. No privileges required. + +--- + +## I want to... + +{{< hextra/feature-grid >}} + {{< hextra/feature-card + title="Use Drop" + subtitle="Install, create CachedImages, configure pacing and discovery." + link="docs/install/" + >}} + {{< hextra/feature-card + title="Develop Drop" + subtitle="Architecture, CRD reference, testing, contributing." + link="docs/reference/arch/" + >}} + {{< hextra/feature-card + title="Feed to AI Agent" + subtitle="llms.txt, Markdown API, full reference in one file." + link="llms-full.txt" + >}} +{{< /hextra/feature-grid >}} +``` + +## Sidebar Navigation (proposed) + +```yaml +# Weight ordering in frontmatter +docs/_index.md # weight: 0 — just links, no prose +docs/install.md # weight: 1 — prerequisites + helm +docs/usage.md # weight: 2 — CachedImage, CachedImageSet, PullPolicy examples +docs/discovery.md # weight: 3 — DiscoveryPolicy +docs/monitoring.md # weight: 4 — metrics, events, conditions +docs/reference/_index # weight: 5 — section header +docs/reference/crds # weight: 1 — generated +docs/reference/errors # weight: 2 — generated +docs/reference/metrics # weight: 3 — generated +docs/reference/arch # weight: 4 — generated +``` + +## Key Principles + +1. **Diagram first** — one image that shows the mechanism. No "features" list. +2. **Persona routing** — 3 cards that route you based on intent, not topic. +3. **Flat + shallow** — max 2 levels deep. Everything reachable in 2 clicks. +4. **No noise** — Kamera (future), AI-friendliness meta-docs don't belong in user docs. +5. 
**Examples everywhere** — every CRD page starts with a working YAML before the field table. +6. **One file for agents** — `llms-full.txt` served on the site = entire project context in one GET. + +## Implementation Steps + +1. [ ] Create the Mermaid diagram as an SVG (for the landing page image fallback) +2. [ ] Rewrite `_index.md` (landing) with diagram + persona cards +3. [ ] Split `getting-started.md` → `install.md` + `usage.md` +4. [ ] Rename `observability.md` → `monitoring.md` +5. [ ] Remove `kamera.md` from docs (move to ai-docs/ or a "future" section) +6. [ ] Update sidebar weights +7. [ ] Verify all links resolve with Hugo aliases +8. [ ] Run `make docs-gen` to regenerate with new structure + +## Gaps / Open Questions + +### 1. Mermaid in hextra-home layout +Hextra's `hextra-home` layout may not process Mermaid code fences the same as regular content pages. Options: +- Use a `{{< mermaid >}}` shortcode (if Hextra supports it in that layout) +- Pre-render as SVG and embed as `<img>` (guaranteed to work, also better for llms.txt/markdown output) +- **Recommendation:** Pre-render SVG, store in `docs/static/img/how-it-works.svg` + +### 2. "Develop Drop" has no landing page +The persona card links to `reference/arch/` but a developer first needs: clone → install tools → run tests → submit PR. Options: +- Add `docs/contributing.md` (build from source, dev workflow, test commands) +- Or link to CONTRIBUTING.md in the repo (GitHub renders it) +- **Recommendation:** Add a short `docs/developing.md` that covers `make codegen && make test && make lint` + +### 3. Redirects for renamed pages +Renaming `getting-started` → `install` + `usage` and `observability` → `monitoring` breaks existing links (README, external blogs, bookmarks). Add Hugo `aliases` to the NEW pages' frontmatter listing the OLD paths: +```yaml +# In install.md +aliases: + - /drop/docs/getting-started/ +``` + +### 4. 
llms.txt template hardcodes old paths +The repo-root `llms.txt` template in `templates.go` has: +``` +| [Getting Started](docs/getting-started/) | ... | +| [CRD Reference](docs/reference/_generated_crds/) | ... | +``` +These need to be updated to the new paths after restructuring. + +### 5. "Feed to AI Agent" card links to raw file +`llms-full.txt` is plain text — clicking it just dumps text in the browser. Better options: +- Link to a dedicated `docs/for-agents.md` page explaining the endpoints +- Or keep it (agents don't click HTML links — they fetch URLs, and this is the right one) +- **Recommendation:** Keep as-is. The card subtitle already explains what it is. Humans who click it see exactly what an agent sees — that's the point. + +### 6. docs/_index.md purpose +Currently has "Core Concepts" and "How It Works" — content that overlaps with the landing page diagram. After redesign: +- Make it a pure navigation hub: short intro sentence + auto-generated section list +- The "how it works" explanation lives on the landing page diagram only +- Core concepts (CRD list) move to `usage.md` + +### 7. Missing llmsDescription for new pages +Every new/renamed page needs `llmsDescription` frontmatter: +- `install.md` — "Helm install, prerequisites, namespace setup" +- `usage.md` — "CachedImage, CachedImageSet, PullPolicy examples with YAML" +- `monitoring.md` — "Prometheus metrics, events, status conditions, Grafana" +- `developing.md` — "Build, test, lint, codegen commands for contributors" + +### 8. Search index +Hextra FlexSearch indexes page content automatically. Renaming files doesn't break it — Hugo rebuilds the index. No action needed, but verify after implementation. + +### 9. Diagram for AI agents +The Mermaid diagram is great for humans but invisible to agents reading markdown output. The one-line description below it is what agents actually consume. 
Make sure the alt-text / description is sufficient: +> "CachedImage CR → Drop Operator → Pod per node → kubelet pulls image → Pod exits → image cached" + +This should appear in the page's `llmsDescription` frontmatter. diff --git a/ai-docs/README.md b/ai-docs/README.md new file mode 100644 index 0000000..33fbd31 --- /dev/null +++ b/ai-docs/README.md @@ -0,0 +1,22 @@ +# AI Docs + +Living design documents for the drop operator. Historical planning docs have been archived to `docs/decisions/`. + +## Current Files + +- `progress.md` — implementation tracking checklist +- `05-ai-friendly-docs.md` — documentation generation strategy and conventions +- `13-discovery-architecture.md` — discovery reconciliation flow, query contract, source types +- `14-architecture.md` — system architecture: reconcilers, pull mechanism, pacing +- `15-implementation-plan.md` — tasks, acceptance criteria, dependencies + +## Generated Docs (DO NOT EDIT) + +All generated documentation lives at the repo root and in `docs/content/docs/reference/`: +- `knowledge.yaml` — structured intermediate (full project model) +- `llms.txt` / `llms-full.txt` — for USE agents +- `.github/copilot-instructions.md` / `.cursorrules` / `AGENTS.md` — for CODE agents +- `docs/content/docs/reference/_generated_*.md` — for humans (Hugo) +- `docs/doc-generation.md` — Mermaid diagram of the generation flow + +Regenerate with: `make docs-gen` diff --git a/ai-docs/progress.md b/ai-docs/progress.md new file mode 100644 index 0000000..83641cf --- /dev/null +++ b/ai-docs/progress.md @@ -0,0 +1,35 @@ +# Progress Tracker + +- [x] Create AI docs structure and feature-sliced plan files +- [x] Decide CRD naming: `CachedImage`, `CachedImageSet`, `PullPolicy`, `DiscoveryPolicy` (cluster-scoped) +- [x] Consolidate all docs to use decided naming and structure +- [x] Design overall system architecture (reconcilers, pull mechanism, pacing, project layout) +- [x] Create detailed implementation plan with tasks, acceptance criteria, and 
dependencies +- [x] **Phase 1:** Bootstrap Go operator project using Kubebuilder (controller-runtime) +- [x] **Phase 1:** Define CRDs (`CachedImage`, `CachedImageSet`, `PullPolicy`, `DiscoveryPolicy`) in `drop.corewire.io/v1alpha1` +- [x] **Phase 1:** Implement Pod builder (drop Pod construction) +- [x] **Phase 1:** Implement pacing engine (shared rate-limiting logic) +- [x] **Phase 1:** Implement `CachedImage` reconciler (core pull loop) +- [x] **Phase 2:** Multi-node pacing integration tests +- [ ] **Phase 2:** RepullPolicy for moving tags (reconciler-level requeue) +- [x] **Phase 3:** Implement `CachedImageSet` reconciler (static image lists, child management) +- [x] **Phase 4:** Implement Source interface + Prometheus source +- [x] **Phase 4:** Implement `DiscoveryPolicy` reconciler +- [x] **Phase 4:** Connect CachedImageSet ↔ DiscoveryPolicy +- [x] **Phase 5:** Implement registry source + imageTemplate +- [x] **Phase 6:** Helm chart packaging and publishing +- [x] **Phase 6:** CI pipeline (lint, test, build, e2e, release) +- [x] **Phase 6:** Multi-arch container builds (`linux/amd64`, `linux/arm64`) to GHCR +- [x] AI-friendly docs (llms.txt, llms-full.txt) +- [x] Hugo Hextra docs site (docs/ directory with getting-started, CRDs, discovery, observability) +- [x] Helm chart ServiceMonitor + metrics Service +- [x] Helm chart cert-manager Certificate integration +- [x] Custom Prometheus metrics (drop_images_cached_total, drop_pull_duration_seconds, etc.) 
+- [x] Kubernetes events on CachedImage (PullStarted, PullSucceeded, PullFailed) +- [x] Developer tooling (Tiltfile, pre-commit, enhanced Makefile, demo script) +- [x] E2E test scaffolding with Kyverno Chainsaw (5 scenarios) +- [x] Kamera evaluation documentation (post-MVP decision) +- [ ] Hugo Hextra docs generation CI workflow +- [ ] RepullPolicy implementation (requeueAfter for moving tags) +- [ ] Add a base of instructions to all instruction files so coding agents do not waste time + diff --git a/api/v1alpha1/cachedimage_types.go b/api/v1alpha1/cachedimage_types.go new file mode 100644 index 0000000..1b14c2b --- /dev/null +++ b/api/v1alpha1/cachedimage_types.go @@ -0,0 +1,132 @@ +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// CachedImageSpec defines the desired state of CachedImage. +type CachedImageSpec struct { + // Image is the fully qualified image reference (registry/repository). + // +kubebuilder:validation:MinLength=1 + Image string `json:"image"` + // Tag to pull. Mutually exclusive with Digest. + // +optional + Tag string `json:"tag,omitempty"` + // Digest to pull (immutable reference). Mutually exclusive with Tag. + // +optional + Digest string `json:"digest,omitempty"` + // ImagePullPolicy controls when kubelet pulls the image. + // Defaults to Always (checks upstream digest, only downloads if changed). 
+ // Set to IfNotPresent to skip the registry check when the tag already exists locally. + // +kubebuilder:validation:Enum=Always;IfNotPresent;Never + // +kubebuilder:default=Always + // +optional + ImagePullPolicy corev1.PullPolicy `json:"imagePullPolicy,omitempty"` + // ImagePullSecrets are references to secrets for pulling from private registries. + // +optional + ImagePullSecrets []corev1.LocalObjectReference `json:"imagePullSecrets,omitempty"` + // NodeSelector restricts which nodes to cache the image on. + // +optional + NodeSelector map[string]string `json:"nodeSelector,omitempty"` + // Tolerations allow targeting tainted nodes. + // +optional + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` + // Priority is a pull ordering hint (lower values pulled first). + // +optional + Priority *int32 `json:"priority,omitempty"` + // PolicyRef references a PullPolicy for pacing controls. + // +optional + PolicyRef *PolicyReference `json:"policyRef,omitempty"` +} + +// PolicyReference is a reference to a PullPolicy resource. +type PolicyReference struct { + // Name of the PullPolicy resource. + // +kubebuilder:validation:MinLength=1 + Name string `json:"name"` +} + +// CachedImageStatus defines the observed state of CachedImage. +type CachedImageStatus struct { + // ObservedGeneration is the last generation reconciled. + ObservedGeneration int64 `json:"observedGeneration,omitempty"` + // Phase summarizes the overall state. + // +kubebuilder:validation:Enum=Pending;Pulling;Ready;Degraded + Phase string `json:"phase,omitempty"` + // Ready is a human-readable "nodesReady/nodesTargeted" fraction for display. + Ready string `json:"ready,omitempty"` + // ResolvedDigest is the sha256 digest of the image as reported by the container runtime after pull. + // +optional + ResolvedDigest string `json:"resolvedDigest,omitempty"` + // NodesTargeted is the number of nodes that should have this image. 
+ NodesTargeted int32 `json:"nodesTargeted,omitempty"` + // NodesReady is the number of nodes that have successfully pulled the image. + NodesReady int32 `json:"nodesReady,omitempty"` + // CachedNodes is the list of node names that have successfully cached the image. + // +optional + CachedNodes []string `json:"cachedNodes,omitempty"` + // ConsecutiveFailures counts sequential reconcile failures for backoff calculation. + // +optional + ConsecutiveFailures int32 `json:"consecutiveFailures,omitempty"` + // LastPulledAt is the timestamp of the most recent successful pull. + // +optional + LastPulledAt *metav1.Time `json:"lastPulledAt,omitempty"` + // LastAttemptedAt is the timestamp of the most recent pull attempt (success or failure). + // +optional + LastAttemptedAt *metav1.Time `json:"lastAttemptedAt,omitempty"` + // Conditions represent the latest available observations. + // +optional + Conditions []metav1.Condition `json:"conditions,omitempty"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:resource:scope=Cluster,categories=drop +// +kubebuilder:printcolumn:name="Image",type=string,JSONPath=`.spec.image` +// +kubebuilder:printcolumn:name="Tag",type=string,JSONPath=`.spec.tag` +// +kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].reason` +// +kubebuilder:printcolumn:name="Ready",type=string,JSONPath=`.status.ready` +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` +// +kubebuilder:printcolumn:name="Digest",type=string,JSONPath=`.status.resolvedDigest`,priority=1 +// +kubebuilder:printcolumn:name="Set",type=string,JSONPath=`.metadata.labels.drop\.corewire\.io/imageset`,description="Parent CachedImageSet",priority=1 +// +kubebuilder:printcolumn:name="Message",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].message`,priority=1 +// 
+kubebuilder:printcolumn:name="Policy",type=string,JSONPath=`.spec.policyRef.name`,priority=1 + +// CachedImage is the Schema for the cachedimages API. +type CachedImage struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec CachedImageSpec `json:"spec,omitempty"` + Status CachedImageStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// CachedImageList contains a list of CachedImage. +type CachedImageList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []CachedImage `json:"items"` +} + +func init() { + SchemeBuilder.Register(&CachedImage{}, &CachedImageList{}) +} diff --git a/api/v1alpha1/cachedimageset_types.go b/api/v1alpha1/cachedimageset_types.go new file mode 100644 index 0000000..8c778a2 --- /dev/null +++ b/api/v1alpha1/cachedimageset_types.go @@ -0,0 +1,117 @@ +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// CachedImageSetSpec defines the desired state of CachedImageSet. +type CachedImageSetSpec struct { + // PolicyRef references a PullPolicy for pacing controls. + // +optional + PolicyRef *PolicyReference `json:"policyRef,omitempty"` + // DiscoveryPolicyRef references a DiscoveryPolicy for dynamic image lists. 
+ // +optional + DiscoveryPolicyRef *DiscoveryPolicyReference `json:"discoveryPolicyRef,omitempty"` + // ImagePullPolicy controls when kubelet pulls the image (propagated to children). + // +kubebuilder:validation:Enum=Always;IfNotPresent;Never + // +kubebuilder:default=Always + // +optional + ImagePullPolicy corev1.PullPolicy `json:"imagePullPolicy,omitempty"` + // ImagePullSecrets are references to secrets for pulling from private registries (propagated to children). + // +optional + ImagePullSecrets []corev1.LocalObjectReference `json:"imagePullSecrets,omitempty"` + // NodeSelector restricts which nodes to cache images on (propagated to children). + // +optional + NodeSelector map[string]string `json:"nodeSelector,omitempty"` + // Tolerations allow targeting tainted nodes (propagated to children). + // +optional + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` + // Images is a static list of images to cache. + // +optional + Images []ImageEntry `json:"images,omitempty"` +} + +// ImageEntry defines a single image to include in a set. +type ImageEntry struct { + // Image is the fully qualified image reference (registry/repository). + // +kubebuilder:validation:MinLength=1 + Image string `json:"image"` + // Tag to pull. + // +optional + Tag string `json:"tag,omitempty"` + // Digest to pull. + // +optional + Digest string `json:"digest,omitempty"` +} + +// DiscoveryPolicyReference is a reference to a DiscoveryPolicy resource. +type DiscoveryPolicyReference struct { + // Name of the DiscoveryPolicy resource. + // +kubebuilder:validation:MinLength=1 + Name string `json:"name"` +} + +// CachedImageSetStatus defines the observed state of CachedImageSet. +type CachedImageSetStatus struct { + // ObservedGeneration is the last generation reconciled. + ObservedGeneration int64 `json:"observedGeneration,omitempty"` + // Phase summarizes the overall state. 
+ // +kubebuilder:validation:Enum=Pending;Ready;Degraded + Phase string `json:"phase,omitempty"` + // ImagesManaged is the number of CachedImage children managed by this set. + ImagesManaged int32 `json:"imagesManaged,omitempty"` + // ImagesReady is the number of children in Ready phase. + ImagesReady int32 `json:"imagesReady,omitempty"` + // Conditions represent the latest available observations. + // +optional + Conditions []metav1.Condition `json:"conditions,omitempty"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:resource:scope=Cluster,categories=drop +// +kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].reason` +// +kubebuilder:printcolumn:name="Ready",type=string,JSONPath=`.status.imagesReady` +// +kubebuilder:printcolumn:name="Managed",type=integer,JSONPath=`.status.imagesManaged` +// +kubebuilder:printcolumn:name="Source",type=string,JSONPath=`.spec.discoveryPolicyRef.name` +// +kubebuilder:printcolumn:name="Message",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].message`,priority=1 +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` + +// CachedImageSet is the Schema for the cachedimagesets API. +type CachedImageSet struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec CachedImageSetSpec `json:"spec,omitempty"` + Status CachedImageSetStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// CachedImageSetList contains a list of CachedImageSet. 
+type CachedImageSetList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []CachedImageSet `json:"items"` +} + +func init() { + SchemeBuilder.Register(&CachedImageSet{}, &CachedImageSetList{}) +} diff --git a/api/v1alpha1/discoverypolicy_types.go b/api/v1alpha1/discoverypolicy_types.go new file mode 100644 index 0000000..d3e2574 --- /dev/null +++ b/api/v1alpha1/discoverypolicy_types.go @@ -0,0 +1,156 @@ +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// DiscoveryPolicySpec defines the desired state of DiscoveryPolicy. +type DiscoveryPolicySpec struct { + // Sources is the list of discovery backends to query. + // +kubebuilder:validation:MinItems=1 + Sources []DiscoverySource `json:"sources"` + // ImageFilter is a regex to filter discovered images. + // +optional + ImageFilter string `json:"imageFilter,omitempty"` + // SyncInterval is how often to re-query sources. + // +kubebuilder:default="30m" + SyncInterval metav1.Duration `json:"syncInterval,omitempty"` + // MaxImages caps the number of discovered images. + // +kubebuilder:default=50 + // +kubebuilder:validation:Minimum=1 + MaxImages int32 `json:"maxImages,omitempty"` +} + +// DiscoverySource defines a single discovery backend. +type DiscoverySource struct { + // Type identifies the backend. 
+ // +kubebuilder:validation:Enum=prometheus;registry + Type string `json:"type"` + // Prometheus config (when type=prometheus). + // +optional + Prometheus *PrometheusSource `json:"prometheus,omitempty"` + // Registry config (when type=registry). + // +optional + Registry *RegistrySource `json:"registry,omitempty"` + // SecretRef references a Secret for auth/TLS for this source. + // +optional + SecretRef *corev1.LocalObjectReference `json:"secretRef,omitempty"` +} + +// PrometheusSource defines Prometheus query configuration. +type PrometheusSource struct { + // Endpoint is the Prometheus API URL. + // +kubebuilder:validation:MinLength=1 + Endpoint string `json:"endpoint"` + // Query is the PromQL query that must return an 'image' label. + // +kubebuilder:validation:MinLength=1 + Query string `json:"query"` + // Lookback is the time window to aggregate over, in Go duration syntax (e.g. "168h" for 7 days, "24h"). + // When set, uses query_range and sums values to rank by total usage. + // When unset, uses an instant query (point-in-time). + // +optional + Lookback *metav1.Duration `json:"lookback,omitempty"` + // Step is the query resolution step for range queries. + // +kubebuilder:default="5m" + // +optional + Step string `json:"step,omitempty"` +} + +// RegistrySource defines OCI registry tag listing configuration. +type RegistrySource struct { + // URL is the registry base URL. + // +kubebuilder:validation:MinLength=1 + URL string `json:"url"` + // Repositories is the list of repositories to query. + // +kubebuilder:validation:MinItems=1 + Repositories []string `json:"repositories"` + // TagFilter is a regex to filter tags. + // +optional + TagFilter string `json:"tagFilter,omitempty"` + // TopX limits the number of tags to fetch per repository. + // +optional + // +kubebuilder:validation:Minimum=1 + TopX int32 `json:"topX,omitempty"` + // ImageTemplate is a Go text/template for constructing the full image reference. 
+ // Available variables: .Registry, .Repository, .Tag + // +optional + ImageTemplate string `json:"imageTemplate,omitempty"` +} + +// DiscoveryPolicyStatus defines the observed state of DiscoveryPolicy. +type DiscoveryPolicyStatus struct { + // LastSyncTime is the timestamp of the last successful sync. + // +optional + LastSyncTime *metav1.Time `json:"lastSyncTime,omitempty"` + // DiscoveredImages is the list of discovered images from all sources. + // +optional + DiscoveredImages []DiscoveredImage `json:"discoveredImages,omitempty"` + // ImageCount is the number of discovered images. + // +optional + ImageCount int32 `json:"imageCount,omitempty"` + // SourceCount is the number of configured sources. + // +optional + SourceCount int32 `json:"sourceCount,omitempty"` + // Conditions represent the latest available observations. + // +optional + Conditions []metav1.Condition `json:"conditions,omitempty"` +} + +// DiscoveredImage represents a single discovered image with metadata. +type DiscoveredImage struct { + // Image is the fully qualified image reference. + Image string `json:"image"` + // Score is the ranking score from the source (higher = more relevant). + Score int64 `json:"score"` + // Source identifies which discovery source produced this image. 
+ Source string `json:"source"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:resource:scope=Cluster,categories=drop +// +kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].reason` +// +kubebuilder:printcolumn:name="Sources",type=integer,JSONPath=`.status.sourceCount` +// +kubebuilder:printcolumn:name="Images",type=integer,JSONPath=`.status.imageCount` +// +kubebuilder:printcolumn:name="LastSync",type=date,JSONPath=`.status.lastSyncTime` +// +kubebuilder:printcolumn:name="Message",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].message`,priority=1 +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` + +// DiscoveryPolicy is the Schema for the discoverypolicies API. +type DiscoveryPolicy struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec DiscoveryPolicySpec `json:"spec,omitempty"` + Status DiscoveryPolicyStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// DiscoveryPolicyList contains a list of DiscoveryPolicy. +type DiscoveryPolicyList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []DiscoveryPolicy `json:"items"` +} + +func init() { + SchemeBuilder.Register(&DiscoveryPolicy{}, &DiscoveryPolicyList{}) +} diff --git a/api/v1alpha1/groupversion_info.go b/api/v1alpha1/groupversion_info.go new file mode 100644 index 0000000..429de25 --- /dev/null +++ b/api/v1alpha1/groupversion_info.go @@ -0,0 +1,36 @@ +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package v1alpha1 contains API Schema definitions for the drop v1alpha1 API group. +// +kubebuilder:object:generate=true +// +groupName=drop.corewire.io +package v1alpha1 + +import ( + "k8s.io/apimachinery/pkg/runtime/schema" + "sigs.k8s.io/controller-runtime/pkg/scheme" +) + +var ( + // GroupVersion is group version used to register these objects. + GroupVersion = schema.GroupVersion{Group: "drop.corewire.io", Version: "v1alpha1"} + + // SchemeBuilder is used to add go types to the GroupVersionKind scheme. + SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} + + // AddToScheme adds the types in this group-version to the given scheme. + AddToScheme = SchemeBuilder.AddToScheme +) diff --git a/api/v1alpha1/pullpolicy_types.go b/api/v1alpha1/pullpolicy_types.go new file mode 100644 index 0000000..8d6a477 --- /dev/null +++ b/api/v1alpha1/pullpolicy_types.go @@ -0,0 +1,84 @@ +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package v1alpha1 + +import ( + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// PullPolicySpec defines pacing and behavior configuration for image pulls. +type PullPolicySpec struct { + // MaxConcurrentNodes is the max nodes pulling simultaneously for this policy. + // +kubebuilder:default=1 + // +kubebuilder:validation:Minimum=1 + MaxConcurrentNodes int32 `json:"maxConcurrentNodes,omitempty"` + // MinDelayBetweenPulls is the minimum time between starting pulls on different nodes. + // +kubebuilder:default="10s" + MinDelayBetweenPulls metav1.Duration `json:"minDelayBetweenPulls,omitempty"` + // FailureBackoff configures retry delays on pull failures. + // +optional + FailureBackoff *BackoffConfig `json:"failureBackoff,omitempty"` + // RepullInterval is how often to re-pull cached images. Zero or unset means never re-pull. + // +optional + RepullInterval *metav1.Duration `json:"repullInterval,omitempty"` + // NodeSelector scopes this policy to a specific node pool. + // +optional + NodeSelector map[string]string `json:"nodeSelector,omitempty"` + // Tolerations match tainted nodes in the pool. + // +optional + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` +} + +// BackoffConfig defines retry backoff behavior. +type BackoffConfig struct { + // Initial delay before first retry. + // +kubebuilder:default="30s" + Initial metav1.Duration `json:"initial,omitempty"` + // Max delay cap for exponential backoff. 
+ // +kubebuilder:default="5m" + Max metav1.Duration `json:"max,omitempty"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:resource:scope=Cluster,categories=drop +// +kubebuilder:printcolumn:name="MaxNodes",type=integer,JSONPath=`.spec.maxConcurrentNodes` +// +kubebuilder:printcolumn:name="MinDelay",type=string,JSONPath=`.spec.minDelayBetweenPulls` +// +kubebuilder:printcolumn:name="RepullInterval",type=string,JSONPath=`.spec.repullInterval` +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` + +// PullPolicy is the Schema for the pullpolicies API. +// It is a configuration-only resource with no status. +type PullPolicy struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec PullPolicySpec `json:"spec,omitempty"` +} + +// +kubebuilder:object:root=true + +// PullPolicyList contains a list of PullPolicy. +type PullPolicyList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []PullPolicy `json:"items"` +} + +func init() { + SchemeBuilder.Register(&PullPolicy{}, &PullPolicyList{}) +} diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go new file mode 100644 index 0000000..328c0ef --- /dev/null +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -0,0 +1,653 @@ +//go:build !ignore_autogenerated + +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by controller-gen. DO NOT EDIT. 
+ +package v1alpha1 + +import ( + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *BackoffConfig) DeepCopyInto(out *BackoffConfig) { + *out = *in + out.Initial = in.Initial + out.Max = in.Max +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new BackoffConfig. +func (in *BackoffConfig) DeepCopy() *BackoffConfig { + if in == nil { + return nil + } + out := new(BackoffConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CachedImage) DeepCopyInto(out *CachedImage) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CachedImage. +func (in *CachedImage) DeepCopy() *CachedImage { + if in == nil { + return nil + } + out := new(CachedImage) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *CachedImage) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *CachedImageList) DeepCopyInto(out *CachedImageList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]CachedImage, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CachedImageList. +func (in *CachedImageList) DeepCopy() *CachedImageList { + if in == nil { + return nil + } + out := new(CachedImageList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *CachedImageList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CachedImageSet) DeepCopyInto(out *CachedImageSet) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CachedImageSet. +func (in *CachedImageSet) DeepCopy() *CachedImageSet { + if in == nil { + return nil + } + out := new(CachedImageSet) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *CachedImageSet) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *CachedImageSetList) DeepCopyInto(out *CachedImageSetList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]CachedImageSet, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CachedImageSetList. +func (in *CachedImageSetList) DeepCopy() *CachedImageSetList { + if in == nil { + return nil + } + out := new(CachedImageSetList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *CachedImageSetList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CachedImageSetSpec) DeepCopyInto(out *CachedImageSetSpec) { + *out = *in + if in.PolicyRef != nil { + in, out := &in.PolicyRef, &out.PolicyRef + *out = new(PolicyReference) + **out = **in + } + if in.DiscoveryPolicyRef != nil { + in, out := &in.DiscoveryPolicyRef, &out.DiscoveryPolicyRef + *out = new(DiscoveryPolicyReference) + **out = **in + } + if in.ImagePullSecrets != nil { + in, out := &in.ImagePullSecrets, &out.ImagePullSecrets + *out = make([]v1.LocalObjectReference, len(*in)) + copy(*out, *in) + } + if in.NodeSelector != nil { + in, out := &in.NodeSelector, &out.NodeSelector + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + if in.Tolerations != nil { + in, out := &in.Tolerations, &out.Tolerations + *out = make([]v1.Toleration, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.Images != nil { + in, out := &in.Images, &out.Images + *out = make([]ImageEntry, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an 
autogenerated deepcopy function, copying the receiver, creating a new CachedImageSetSpec. +func (in *CachedImageSetSpec) DeepCopy() *CachedImageSetSpec { + if in == nil { + return nil + } + out := new(CachedImageSetSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CachedImageSetStatus) DeepCopyInto(out *CachedImageSetStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]metav1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CachedImageSetStatus. +func (in *CachedImageSetStatus) DeepCopy() *CachedImageSetStatus { + if in == nil { + return nil + } + out := new(CachedImageSetStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *CachedImageSpec) DeepCopyInto(out *CachedImageSpec) { + *out = *in + if in.ImagePullSecrets != nil { + in, out := &in.ImagePullSecrets, &out.ImagePullSecrets + *out = make([]v1.LocalObjectReference, len(*in)) + copy(*out, *in) + } + if in.NodeSelector != nil { + in, out := &in.NodeSelector, &out.NodeSelector + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + if in.Tolerations != nil { + in, out := &in.Tolerations, &out.Tolerations + *out = make([]v1.Toleration, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.Priority != nil { + in, out := &in.Priority, &out.Priority + *out = new(int32) + **out = **in + } + if in.PolicyRef != nil { + in, out := &in.PolicyRef, &out.PolicyRef + *out = new(PolicyReference) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CachedImageSpec. +func (in *CachedImageSpec) DeepCopy() *CachedImageSpec { + if in == nil { + return nil + } + out := new(CachedImageSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CachedImageStatus) DeepCopyInto(out *CachedImageStatus) { + *out = *in + if in.CachedNodes != nil { + in, out := &in.CachedNodes, &out.CachedNodes + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.LastPulledAt != nil { + in, out := &in.LastPulledAt, &out.LastPulledAt + *out = (*in).DeepCopy() + } + if in.LastAttemptedAt != nil { + in, out := &in.LastAttemptedAt, &out.LastAttemptedAt + *out = (*in).DeepCopy() + } + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]metav1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CachedImageStatus. 
+func (in *CachedImageStatus) DeepCopy() *CachedImageStatus { + if in == nil { + return nil + } + out := new(CachedImageStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DiscoveredImage) DeepCopyInto(out *DiscoveredImage) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DiscoveredImage. +func (in *DiscoveredImage) DeepCopy() *DiscoveredImage { + if in == nil { + return nil + } + out := new(DiscoveredImage) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DiscoveryPolicy) DeepCopyInto(out *DiscoveryPolicy) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DiscoveryPolicy. +func (in *DiscoveryPolicy) DeepCopy() *DiscoveryPolicy { + if in == nil { + return nil + } + out := new(DiscoveryPolicy) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *DiscoveryPolicy) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *DiscoveryPolicyList) DeepCopyInto(out *DiscoveryPolicyList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]DiscoveryPolicy, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DiscoveryPolicyList. +func (in *DiscoveryPolicyList) DeepCopy() *DiscoveryPolicyList { + if in == nil { + return nil + } + out := new(DiscoveryPolicyList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *DiscoveryPolicyList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DiscoveryPolicyReference) DeepCopyInto(out *DiscoveryPolicyReference) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DiscoveryPolicyReference. +func (in *DiscoveryPolicyReference) DeepCopy() *DiscoveryPolicyReference { + if in == nil { + return nil + } + out := new(DiscoveryPolicyReference) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DiscoveryPolicySpec) DeepCopyInto(out *DiscoveryPolicySpec) { + *out = *in + if in.Sources != nil { + in, out := &in.Sources, &out.Sources + *out = make([]DiscoverySource, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + out.SyncInterval = in.SyncInterval +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DiscoveryPolicySpec. 
+func (in *DiscoveryPolicySpec) DeepCopy() *DiscoveryPolicySpec { + if in == nil { + return nil + } + out := new(DiscoveryPolicySpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DiscoveryPolicyStatus) DeepCopyInto(out *DiscoveryPolicyStatus) { + *out = *in + if in.LastSyncTime != nil { + in, out := &in.LastSyncTime, &out.LastSyncTime + *out = (*in).DeepCopy() + } + if in.DiscoveredImages != nil { + in, out := &in.DiscoveredImages, &out.DiscoveredImages + *out = make([]DiscoveredImage, len(*in)) + copy(*out, *in) + } + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]metav1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DiscoveryPolicyStatus. +func (in *DiscoveryPolicyStatus) DeepCopy() *DiscoveryPolicyStatus { + if in == nil { + return nil + } + out := new(DiscoveryPolicyStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DiscoverySource) DeepCopyInto(out *DiscoverySource) { + *out = *in + if in.Prometheus != nil { + in, out := &in.Prometheus, &out.Prometheus + *out = new(PrometheusSource) + (*in).DeepCopyInto(*out) + } + if in.Registry != nil { + in, out := &in.Registry, &out.Registry + *out = new(RegistrySource) + (*in).DeepCopyInto(*out) + } + if in.SecretRef != nil { + in, out := &in.SecretRef, &out.SecretRef + *out = new(v1.LocalObjectReference) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DiscoverySource. 
+func (in *DiscoverySource) DeepCopy() *DiscoverySource { + if in == nil { + return nil + } + out := new(DiscoverySource) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ImageEntry) DeepCopyInto(out *ImageEntry) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ImageEntry. +func (in *ImageEntry) DeepCopy() *ImageEntry { + if in == nil { + return nil + } + out := new(ImageEntry) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PolicyReference) DeepCopyInto(out *PolicyReference) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PolicyReference. +func (in *PolicyReference) DeepCopy() *PolicyReference { + if in == nil { + return nil + } + out := new(PolicyReference) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PrometheusSource) DeepCopyInto(out *PrometheusSource) { + *out = *in + if in.Lookback != nil { + in, out := &in.Lookback, &out.Lookback + *out = new(metav1.Duration) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PrometheusSource. +func (in *PrometheusSource) DeepCopy() *PrometheusSource { + if in == nil { + return nil + } + out := new(PrometheusSource) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *PullPolicy) DeepCopyInto(out *PullPolicy) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PullPolicy. +func (in *PullPolicy) DeepCopy() *PullPolicy { + if in == nil { + return nil + } + out := new(PullPolicy) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *PullPolicy) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PullPolicyList) DeepCopyInto(out *PullPolicyList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]PullPolicy, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PullPolicyList. +func (in *PullPolicyList) DeepCopy() *PullPolicyList { + if in == nil { + return nil + } + out := new(PullPolicyList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *PullPolicyList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *PullPolicySpec) DeepCopyInto(out *PullPolicySpec) { + *out = *in + out.MinDelayBetweenPulls = in.MinDelayBetweenPulls + if in.FailureBackoff != nil { + in, out := &in.FailureBackoff, &out.FailureBackoff + *out = new(BackoffConfig) + **out = **in + } + if in.RepullInterval != nil { + in, out := &in.RepullInterval, &out.RepullInterval + *out = new(metav1.Duration) + **out = **in + } + if in.NodeSelector != nil { + in, out := &in.NodeSelector, &out.NodeSelector + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + if in.Tolerations != nil { + in, out := &in.Tolerations, &out.Tolerations + *out = make([]v1.Toleration, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PullPolicySpec. +func (in *PullPolicySpec) DeepCopy() *PullPolicySpec { + if in == nil { + return nil + } + out := new(PullPolicySpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RegistrySource) DeepCopyInto(out *RegistrySource) { + *out = *in + if in.Repositories != nil { + in, out := &in.Repositories, &out.Repositories + *out = make([]string, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RegistrySource. +func (in *RegistrySource) DeepCopy() *RegistrySource { + if in == nil { + return nil + } + out := new(RegistrySource) + in.DeepCopyInto(out) + return out +} diff --git a/charts/drop/.helmignore b/charts/drop/.helmignore new file mode 100644 index 0000000..dcba78d --- /dev/null +++ b/charts/drop/.helmignore @@ -0,0 +1,10 @@ +# Patterns to ignore when building packages. 
+.git +.gitignore +.dockerignore +*.md +docs/ +ai-docs/ +hack/ +test/ +bin/ diff --git a/charts/drop/Chart.yaml b/charts/drop/Chart.yaml new file mode 100644 index 0000000..8b37e77 --- /dev/null +++ b/charts/drop/Chart.yaml @@ -0,0 +1,18 @@ +apiVersion: v2 +name: drop +description: A Kubernetes operator that pre-pulls container images onto nodes +type: application +version: 0.1.0 +appVersion: "0.1.0" +kubeVersion: ">=1.28.0-0" +keywords: + - kubernetes + - operator + - image-caching + - pre-pull +home: https://github.com/Breee/drop +sources: + - https://github.com/Breee/drop +maintainers: + - name: Breee + url: https://github.com/Breee diff --git a/charts/drop/dashboards/drop-operator.json b/charts/drop/dashboards/drop-operator.json new file mode 100644 index 0000000..6d89adb --- /dev/null +++ b/charts/drop/dashboards/drop-operator.json @@ -0,0 +1,249 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "schemaVersion": 39, + "tags": ["drop", "operator", "kubernetes"], + "templating": { + "list": [ + { + "name": "image", + "type": "query", + "datasource": "Prometheus", + "query": "label_values(drop_images_cached_total, image)", + "refresh": 2, + "includeAll": true, + "allValue": ".*", + "current": { "text": "All", "value": "$__all" } + }, + { + "name": "policy", + "type": "query", + "datasource": "Prometheus", + "query": "label_values(drop_discovery_images_found, policy)", + "refresh": 2, + "includeAll": true, + "allValue": ".*", + "current": { "text": "All", "value": "$__all" } + } + ] + }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "Drop Operator", + "uid": "drop-operator", + "version": 2, + "refresh": "10s", + "panels": [ + { + "id": 1, + "title": "Active Pulls", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 0 }, + "datasource": "Prometheus", + "targets": [{ "expr": "drop_active_pulls", "legendFormat": 
"active" }], + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 3 }, + { "color": "red", "value": 5 } + ] + } + } + } + }, + { + "id": 2, + "title": "Pull Rate (success/min)", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 0 }, + "datasource": "Prometheus", + "targets": [{ "expr": "sum(rate(drop_images_cached_total[5m])) * 60", "legendFormat": "pulls/min" }], + "fieldConfig": { + "defaults": { + "thresholds": { "steps": [{ "color": "green", "value": null }] } + } + } + }, + { + "id": 3, + "title": "Error Rate (errors/min)", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 0 }, + "datasource": "Prometheus", + "targets": [{ "expr": "sum(rate(drop_pull_errors_total[5m])) * 60", "legendFormat": "errors/min" }], + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 5 } + ] + } + } + } + }, + { + "id": 4, + "title": "Discovered Images", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 0 }, + "datasource": "Prometheus", + "targets": [{ "expr": "sum(drop_discovery_images_found)", "legendFormat": "total" }], + "fieldConfig": { + "defaults": { + "thresholds": { "steps": [{ "color": "blue", "value": null }] } + } + } + }, + { + "id": 5, + "title": "Discovery Source Health", + "type": "stat", + "gridPos": { "h": 4, "w": 8, "x": 16, "y": 0 }, + "datasource": "Prometheus", + "targets": [{ "expr": "drop_discovery_source_health", "legendFormat": "{{policy}} ({{source_type}})" }], + "fieldConfig": { + "defaults": { + "mappings": [ + { + "type": "value", + "options": { + "0": { "text": "DOWN", "color": "red" }, + "1": { "text": "UP", "color": "green" } + } + } + ] + } + } + }, + { + "id": 10, + "title": "Images Cached Over Time", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, + "datasource": "Prometheus", + 
"targets": [{ "expr": "sum by (image) (rate(drop_images_cached_total{image=~\"$image\"}[5m]))", "legendFormat": "{{image}}" }], + "fieldConfig": { + "defaults": { "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } } + } + }, + { + "id": 11, + "title": "Pull Errors Over Time", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, + "datasource": "Prometheus", + "targets": [{ "expr": "sum by (image) (rate(drop_pull_errors_total{image=~\"$image\"}[5m]))", "legendFormat": "{{image}}" }], + "fieldConfig": { + "defaults": { + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }, + "color": { "mode": "palette-classic" } + } + } + }, + { + "id": 20, + "title": "Pull Duration (p50 / p95 / p99)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, + "datasource": "Prometheus", + "targets": [ + { "expr": "histogram_quantile(0.50, sum by (le) (rate(drop_pull_duration_seconds_bucket{image=~\"$image\"}[5m])))", "legendFormat": "p50" }, + { "expr": "histogram_quantile(0.95, sum by (le) (rate(drop_pull_duration_seconds_bucket{image=~\"$image\"}[5m])))", "legendFormat": "p95" }, + { "expr": "histogram_quantile(0.99, sum by (le) (rate(drop_pull_duration_seconds_bucket{image=~\"$image\"}[5m])))", "legendFormat": "p99" } + ], + "fieldConfig": { + "defaults": { "unit": "s", "custom": { "drawStyle": "line", "fillOpacity": 5, "lineWidth": 2 } } + } + }, + { + "id": 21, + "title": "Active Pulls Over Time", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 }, + "datasource": "Prometheus", + "targets": [{ "expr": "drop_active_pulls", "legendFormat": "active pods" }], + "fieldConfig": { + "defaults": { "custom": { "drawStyle": "line", "fillOpacity": 20, "lineWidth": 2 } } + } + }, + { + "id": 30, + "title": "Discovery: Images Found by Policy", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 }, + "datasource": "Prometheus", + "targets": [{ "expr": 
"drop_discovery_images_found{policy=~\"$policy\"}", "legendFormat": "{{policy}} ({{source_type}})" }], + "fieldConfig": { + "defaults": { "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } } + } + }, + { + "id": 31, + "title": "Discovery: Source Latency (p95)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 }, + "datasource": "Prometheus", + "targets": [{ "expr": "histogram_quantile(0.95, sum by (le, policy) (rate(drop_discovery_source_latency_seconds_bucket{policy=~\"$policy\"}[5m])))", "legendFormat": "{{policy}}" }], + "fieldConfig": { + "defaults": { "unit": "s", "custom": { "drawStyle": "line", "fillOpacity": 5, "lineWidth": 2 } } + } + }, + { + "id": 40, + "title": "Reconcile Rate by Controller", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 28 }, + "datasource": "Prometheus", + "targets": [{ "expr": "sum by (controller, result) (rate(controller_runtime_reconcile_total[5m])) * 60", "legendFormat": "{{controller}} ({{result}})" }], + "fieldConfig": { + "defaults": { + "unit": "ops/min", + "custom": { "drawStyle": "bars", "fillOpacity": 50, "lineWidth": 1, "stacking": { "mode": "normal" } } + } + } + }, + { + "id": 41, + "title": "Reconcile Errors", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 28 }, + "datasource": "Prometheus", + "targets": [{ "expr": "sum by (controller) (rate(controller_runtime_reconcile_errors_total[5m]))", "legendFormat": "{{controller}}" }], + "fieldConfig": { + "defaults": { + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }, + "color": { "fixedColor": "red", "mode": "fixed" } + } + } + }, + { + "id": 50, + "title": "Images Cached per Node", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 }, + "datasource": "Prometheus", + "targets": [{ "expr": "sum by (node) (drop_images_cached_total{image=~\"$image\"})", "legendFormat": "{{node}}" }], + "fieldConfig": { + "defaults": { "custom": { "drawStyle": "line", 
"fillOpacity": 20, "lineWidth": 2 } } + } + }, + { + "id": 51, + "title": "Top Images (table)", + "type": "table", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 }, + "datasource": "Prometheus", + "targets": [{ "expr": "sum by (image, node) (drop_images_cached_total{image=~\"$image\"})", "format": "table", "instant": true }], + "transformations": [{ "id": "organize", "options": { "excludeByName": { "Time": true } } }] + } + ] +} diff --git a/charts/drop/templates/_helpers.tpl b/charts/drop/templates/_helpers.tpl new file mode 100644 index 0000000..8bc2624 --- /dev/null +++ b/charts/drop/templates/_helpers.tpl @@ -0,0 +1,60 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "drop.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +*/}} +{{- define "drop.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "drop.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "drop.labels" -}} +helm.sh/chart: {{ include "drop.chart" . }} +{{ include "drop.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "drop.selectorLabels" -}} +app.kubernetes.io/name: {{ include "drop.name" . 
}}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end }}
+
+{{/*
+Create the name of the service account to use
+*/}}
+{{- define "drop.serviceAccountName" -}}
+{{- if .Values.serviceAccount.create }}
+{{- default (include "drop.fullname" .) .Values.serviceAccount.name }}
+{{- else }}
+{{- default "default" .Values.serviceAccount.name }}
+{{- end }}
+{{- end }}
diff --git a/charts/drop/templates/certificate.yaml b/charts/drop/templates/certificate.yaml
new file mode 100644
index 0000000..db26910
--- /dev/null
+++ b/charts/drop/templates/certificate.yaml
@@ -0,0 +1,17 @@
+{{- if .Values.certManager.enabled }}
+apiVersion: cert-manager.io/v1
+kind: Certificate
+metadata:
+  name: {{ include "drop.fullname" . }}-metrics-cert
+  labels:
+    {{- include "drop.labels" . | nindent 4 }}
+spec:
+  secretName: {{ include "drop.fullname" . }}-metrics-tls
+  issuerRef:
+    {{- toYaml .Values.certManager.issuerRef | nindent 4 }}
+  dnsNames:
+    - {{ include "drop.fullname" . }}-metrics.{{ .Release.Namespace }}.svc
+    - {{ include "drop.fullname" . }}-metrics.{{ .Release.Namespace }}.svc.cluster.local
+  duration: 8760h  # 1 year
+  renewBefore: 720h  # 30 days
+{{- end }}
diff --git a/charts/drop/templates/clusterrole.yaml b/charts/drop/templates/clusterrole.yaml
new file mode 100644
index 0000000..2ab75da
--- /dev/null
+++ b/charts/drop/templates/clusterrole.yaml
@@ -0,0 +1,65 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ include "drop.fullname" . }}
+  labels:
+    {{- include "drop.labels" . | nindent 4 }}
+rules:
+  # Custom resources of the drop.corewire.io API group.
+  - apiGroups: ["drop.corewire.io"]
+    resources: ["cachedimages"]
+    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
+  - apiGroups: ["drop.corewire.io"]
+    resources: ["cachedimages/status"]
+    verbs: ["get", "update", "patch"]
+  - apiGroups: ["drop.corewire.io"]
+    resources: ["cachedimages/finalizers"]
+    verbs: ["update"]
+  - apiGroups: ["drop.corewire.io"]
+    resources: ["cachedimagesets"]
+    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
+  - apiGroups: ["drop.corewire.io"]
+    resources: ["cachedimagesets/status"]
+    verbs: ["get", "update", "patch"]
+  - apiGroups: ["drop.corewire.io"]
+    resources: ["cachedimagesets/finalizers"]
+    verbs: ["update"]
+  - apiGroups: ["drop.corewire.io"]
+    resources: ["pullpolicies"]
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["drop.corewire.io"]
+    resources: ["discoverypolicies"]
+    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
+  - apiGroups: ["drop.corewire.io"]
+    resources: ["discoverypolicies/status"]
+    verbs: ["get", "update", "patch"]
+  - apiGroups: ["drop.corewire.io"]
+    resources: ["discoverypolicies/finalizers"]
+    verbs: ["update"]
+  # Core resources the manager reads or manages.
+  - apiGroups: [""]
+    resources: ["pods"]
+    verbs: ["get", "list", "watch", "create", "delete"]
+  - apiGroups: [""]
+    resources: ["nodes"]
+    verbs: ["get", "list", "watch"]
+  - apiGroups: [""]
+    resources: ["secrets"]
+    verbs: ["get", "list", "watch"]
+  # Leases used for leader election.
+  - apiGroups: ["coordination.k8s.io"]
+    resources: ["leases"]
+    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
+  # Events emitted through the manager's event recorder.
+  - apiGroups: [""]
+    resources: ["events"]
+    verbs: ["create", "patch"]
+  # Required by the secured metrics endpoint: the manager's
+  # WithAuthenticationAndAuthorization filter validates scraper tokens via
+  # TokenReview and authorizes them via SubjectAccessReview.
+  - apiGroups: ["authentication.k8s.io"]
+    resources: ["tokenreviews"]
+    verbs: ["create"]
+  - apiGroups: ["authorization.k8s.io"]
+    resources: ["subjectaccessreviews"]
+    verbs: ["create"]
diff --git a/charts/drop/templates/clusterrolebinding.yaml b/charts/drop/templates/clusterrolebinding.yaml
new file mode 100644
index 0000000..e5f3643
--- /dev/null
+++ b/charts/drop/templates/clusterrolebinding.yaml
@@ -0,0 +1,14 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ include "drop.fullname" .
}} + labels: + {{- include "drop.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "drop.fullname" . }} +subjects: + - kind: ServiceAccount + name: {{ include "drop.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} diff --git a/charts/drop/templates/deployment.yaml b/charts/drop/templates/deployment.yaml new file mode 100644 index 0000000..77f89b9 --- /dev/null +++ b/charts/drop/templates/deployment.yaml @@ -0,0 +1,89 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "drop.fullname" . }} + labels: + {{- include "drop.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + {{- include "drop.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "drop.selectorLabels" . | nindent 8 }} + spec: + serviceAccountName: {{ include "drop.serviceAccountName" . }} + securityContext: + runAsNonRoot: true + containers: + - name: manager + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + args: + {{- if .Values.leaderElection.enabled }} + - --leader-elect + {{- end }} + {{- if .Values.metrics.enabled }} + - --metrics-bind-address=:8443 + {{- if not .Values.metrics.secureServing }} + - --metrics-secure=false + {{- end }} + {{- else }} + - --metrics-bind-address=0 + {{- end }} + {{- if .Values.certManager.enabled }} + - --metrics-cert-path=/tmp/k8s-metrics-server/metrics-certs + {{- end }} + - --health-probe-bind-address=:8081 + ports: + - name: metrics + containerPort: 8443 + protocol: TCP + - name: health + containerPort: 8081 + protocol: TCP + livenessProbe: + httpGet: + path: /healthz + port: health + initialDelaySeconds: 15 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /readyz + port: health + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + {{- toYaml .Values.resources | nindent 12 }} + 
securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + {{- if .Values.certManager.enabled }} + volumeMounts: + - name: metrics-certs + mountPath: /tmp/k8s-metrics-server/metrics-certs + readOnly: true + {{- end }} + {{- if .Values.certManager.enabled }} + volumes: + - name: metrics-certs + secret: + secretName: {{ include "drop.fullname" . }}-metrics-tls + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/charts/drop/templates/metrics-service.yaml b/charts/drop/templates/metrics-service.yaml new file mode 100644 index 0000000..ea9ca2d --- /dev/null +++ b/charts/drop/templates/metrics-service.yaml @@ -0,0 +1,16 @@ +{{- if .Values.metrics.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "drop.fullname" . }}-metrics + labels: + {{- include "drop.labels" . | nindent 4 }} +spec: + ports: + - name: https-metrics + port: 8443 + targetPort: metrics + protocol: TCP + selector: + {{- include "drop.selectorLabels" . | nindent 4 }} +{{- end }} diff --git a/charts/drop/templates/serviceaccount.yaml b/charts/drop/templates/serviceaccount.yaml new file mode 100644 index 0000000..4ef4df3 --- /dev/null +++ b/charts/drop/templates/serviceaccount.yaml @@ -0,0 +1,12 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "drop.serviceAccountName" . }} + labels: + {{- include "drop.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }}
+  {{- end }}
+{{- end }}
diff --git a/charts/drop/templates/servicemonitor.yaml b/charts/drop/templates/servicemonitor.yaml
new file mode 100644
index 0000000..1ec5a09
--- /dev/null
+++ b/charts/drop/templates/servicemonitor.yaml
@@ -0,0 +1,34 @@
+{{- /*
+ServiceMonitor for the manager's metrics Service. It is rendered only when the
+metrics endpoint itself is enabled, and the scrape scheme follows
+metrics.secureServing so a plain-HTTP endpoint is not scraped over TLS.
+*/ -}}
+{{- if and .Values.serviceMonitor.enabled .Values.metrics.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "drop.fullname" . }}
+  labels:
+    {{- include "drop.labels" . | nindent 4 }}
+    {{- with .Values.serviceMonitor.additionalLabels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+spec:
+  endpoints:
+    - port: https-metrics
+      interval: {{ .Values.serviceMonitor.interval }}
+      {{- if .Values.serviceMonitor.scrapeTimeout }}
+      scrapeTimeout: {{ .Values.serviceMonitor.scrapeTimeout }}
+      {{- end }}
+      {{- if .Values.metrics.secureServing }}
+      scheme: https
+      tlsConfig:
+        insecureSkipVerify: true
+      bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+      {{- else }}
+      scheme: http
+      {{- end }}
+  selector:
+    matchLabels:
+      {{- include "drop.selectorLabels" . | nindent 6 }}
+{{- end }}
diff --git a/charts/drop/values.yaml b/charts/drop/values.yaml
new file mode 100644
index 0000000..19429a4
--- /dev/null
+++ b/charts/drop/values.yaml
@@ -0,0 +1,45 @@
+# Default values for drop.
+replicaCount: 1
+
+image:
+  repository: ghcr.io/breee/drop
+  pullPolicy: IfNotPresent
+  tag: ""  # Defaults to Chart appVersion
+
+serviceAccount:
+  create: true
+  annotations: {}
+  name: ""
+
+resources:
+  limits:
+    cpu: 500m
+    memory: 128Mi
+  requests:
+    cpu: 10m
+    memory: 64Mi
+
+leaderElection:
+  enabled: true
+
+metrics:
+  enabled: true
+  secureServing: true
+
+serviceMonitor:
+  enabled: false
+  interval: 30s
+  scrapeTimeout: ""
+  additionalLabels: {}
+
+# cert-manager integration for metrics TLS certificates.
+# Assumes cert-manager is installed in the cluster.
+certManager:
+  enabled: false
+  issuerRef:
+    name: selfsigned-issuer
+    kind: ClusterIssuer
+
+nodeSelector: {}
+tolerations: []
+affinity: {}
diff --git a/cmd/main.go b/cmd/main.go
new file mode 100644
index 0000000..aacd452
--- /dev/null
+++ b/cmd/main.go
@@ -0,0 +1,274 @@
+/*
+Copyright 2026.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package main
+
+import (
+	"crypto/tls"
+	"flag"
+	"os"
+	"path/filepath"
+
+	// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
+	// to ensure that exec-entrypoint and run can make use of them.
+	_ "k8s.io/client-go/plugin/pkg/client/auth"
+
+	"k8s.io/apimachinery/pkg/runtime"
+	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
+	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/certwatcher"
+	"sigs.k8s.io/controller-runtime/pkg/healthz"
+	"sigs.k8s.io/controller-runtime/pkg/log/zap"
+	"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
+	metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
+	"sigs.k8s.io/controller-runtime/pkg/webhook"
+
+	dropv1alpha1 "github.com/Breee/drop/api/v1alpha1"
+	"github.com/Breee/drop/internal/controller"
+	_ "github.com/Breee/drop/internal/metrics" // Register custom metrics
+	"github.com/Breee/drop/internal/pacing"
+	// +kubebuilder:scaffold:imports
+)
+
+// scheme collects the API types registered in init and is passed to the
+// manager; setupLog is the logger used while the process starts up.
+var (
+	scheme   = runtime.NewScheme()
+	setupLog = ctrl.Log.WithName("setup")
+)
+
+// init registers the client-go built-in types and the drop v1alpha1 API types
+// with the package-level scheme.
+func init() {
+	utilruntime.Must(clientgoscheme.AddToScheme(scheme))
+
+	utilruntime.Must(dropv1alpha1.AddToScheme(scheme))
+	// +kubebuilder:scaffold:scheme
+}
+
+// main parses the command-line flags, prepares the TLS options and optional
+// certificate watchers for the webhook and metrics servers, creates the
+// manager, registers the three reconcilers and the health/readiness checks, and
+// runs the manager. Any set-up failure is logged and exits with status 1.
+// nolint:gocyclo
+func main() {
+	var metricsAddr string
+	var metricsCertPath, metricsCertName, metricsCertKey string
+	var webhookCertPath, webhookCertName, webhookCertKey string
+	var enableLeaderElection bool
+	var probeAddr string
+	var secureMetrics bool
+	var enableHTTP2 bool
+	var podNamespace string
+	var tlsOpts []func(*tls.Config)
+	flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+
+		"Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.")
+	flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
+	flag.BoolVar(&enableLeaderElection, "leader-elect", false,
+		"Enable leader election for controller manager. "+
+			"Enabling this will ensure there is only one active controller manager.")
+	flag.BoolVar(&secureMetrics, "metrics-secure", true,
+		"If set, the metrics endpoint is served securely via HTTPS. Use --metrics-secure=false to use HTTP instead.")
+	flag.StringVar(&podNamespace, "pod-namespace", "drop-system",
+		"The namespace where drop Pods are created.")
+	flag.StringVar(&webhookCertPath, "webhook-cert-path", "", "The directory that contains the webhook certificate.")
+	flag.StringVar(&webhookCertName, "webhook-cert-name", "tls.crt", "The name of the webhook certificate file.")
+	flag.StringVar(&webhookCertKey, "webhook-cert-key", "tls.key", "The name of the webhook key file.")
+	flag.StringVar(&metricsCertPath, "metrics-cert-path", "",
+		"The directory that contains the metrics server certificate.")
+	flag.StringVar(&metricsCertName, "metrics-cert-name", "tls.crt", "The name of the metrics server certificate file.")
+	flag.StringVar(&metricsCertKey, "metrics-cert-key", "tls.key", "The name of the metrics server key file.")
+	flag.BoolVar(&enableHTTP2, "enable-http2", false,
+		"If set, HTTP/2 will be enabled for the metrics and webhook servers")
+	opts := zap.Options{
+		Development: true,
+	}
+	opts.BindFlags(flag.CommandLine)
+	flag.Parse()
+
+	ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
+
+	// If the enable-http2 flag is false (the default), http/2 is disabled
+	// because of its vulnerabilities. More specifically, disabling http/2
+	// protects against the HTTP/2 Stream Cancellation and Rapid Reset CVEs.
+	// For more information see:
+	// - https://github.com/advisories/GHSA-qppj-fm5r-hxr3
+	// - https://github.com/advisories/GHSA-4374-p667-p6c8
+	disableHTTP2 := func(c *tls.Config) {
+		setupLog.Info("disabling http/2")
+		c.NextProtos = []string{"http/1.1"}
+	}
+
+	if !enableHTTP2 {
+		tlsOpts = append(tlsOpts, disableHTTP2)
+	}
+
+	// Create watchers for the metrics and webhook certificates
+	var metricsCertWatcher, webhookCertWatcher *certwatcher.CertWatcher
+
+	// Initial webhook TLS options
+	webhookTLSOpts := tlsOpts
+
+	if len(webhookCertPath) > 0 {
+		setupLog.Info("Initializing webhook certificate watcher using provided certificates",
+			"webhook-cert-path", webhookCertPath, "webhook-cert-name", webhookCertName, "webhook-cert-key", webhookCertKey)
+
+		var err error
+		webhookCertWatcher, err = certwatcher.New(
+			filepath.Join(webhookCertPath, webhookCertName),
+			filepath.Join(webhookCertPath, webhookCertKey),
+		)
+		if err != nil {
+			setupLog.Error(err, "Failed to initialize webhook certificate watcher")
+			os.Exit(1)
+		}
+
+		webhookTLSOpts = append(webhookTLSOpts, func(config *tls.Config) {
+			config.GetCertificate = webhookCertWatcher.GetCertificate
+		})
+	}
+
+	webhookServer := webhook.NewServer(webhook.Options{
+		TLSOpts: webhookTLSOpts,
+	})
+
+	// Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server.
+	// More info:
+	// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.20.4/pkg/metrics/server
+	// - https://book.kubebuilder.io/reference/metrics.html
+	metricsServerOptions := metricsserver.Options{
+		BindAddress:   metricsAddr,
+		SecureServing: secureMetrics,
+		TLSOpts:       tlsOpts,
+	}
+
+	if secureMetrics {
+		// FilterProvider is used to protect the metrics endpoint with authn/authz.
+		// These configurations ensure that only authorized users and service accounts
+		// can access the metrics endpoint. The RBAC rules are configured in 'config/rbac/kustomization.yaml'. More info:
+		// https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.20.4/pkg/metrics/filters#WithAuthenticationAndAuthorization
+		metricsServerOptions.FilterProvider = filters.WithAuthenticationAndAuthorization
+	}
+
+	// If the certificate is not specified, controller-runtime will automatically
+	// generate self-signed certificates for the metrics server. While convenient for development and testing,
+	// this setup is not recommended for production.
+	//
+	// TODO(user): If you enable certManager, uncomment the following lines:
+	// - [METRICS-WITH-CERTS] at config/default/kustomization.yaml to generate and use certificates
+	// managed by cert-manager for the metrics server.
+	// - [PROMETHEUS-WITH-CERTS] at config/prometheus/kustomization.yaml for TLS certification.
+	if len(metricsCertPath) > 0 {
+		setupLog.Info("Initializing metrics certificate watcher using provided certificates",
+			"metrics-cert-path", metricsCertPath, "metrics-cert-name", metricsCertName, "metrics-cert-key", metricsCertKey)
+
+		var err error
+		metricsCertWatcher, err = certwatcher.New(
+			filepath.Join(metricsCertPath, metricsCertName),
+			filepath.Join(metricsCertPath, metricsCertKey),
+		)
+		if err != nil {
+			setupLog.Error(err, "Failed to initialize metrics certificate watcher")
+			os.Exit(1)
+		}
+
+		metricsServerOptions.TLSOpts = append(metricsServerOptions.TLSOpts, func(config *tls.Config) {
+			config.GetCertificate = metricsCertWatcher.GetCertificate
+		})
+	}
+
+	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
+		Scheme:                 scheme,
+		Metrics:                metricsServerOptions,
+		WebhookServer:          webhookServer,
+		HealthProbeBindAddress: probeAddr,
+		LeaderElection:         enableLeaderElection,
+		LeaderElectionID:       "b889acf8.corewire.io",
+		// LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily
+		// when the Manager ends. This requires the binary to immediately end when the
+		// Manager is stopped, otherwise, this setting is unsafe. Setting this significantly
+		// speeds up voluntary leader transitions as the new leader doesn't have to wait
+		// LeaseDuration time first.
+		//
+		// In the default scaffold provided, the program ends immediately after
+		// the manager stops, so it would be fine to enable this option. However,
+		// if you perform, or intend to perform, any operation such as cleanups
+		// after the manager stops, then its usage might be unsafe.
+		// LeaderElectionReleaseOnCancel: true,
+	})
+	if err != nil {
+		setupLog.Error(err, "unable to start manager")
+		os.Exit(1)
+	}
+
+	if err = (&controller.CachedImageReconciler{
+		Client:       mgr.GetClient(),
+		Scheme:       mgr.GetScheme(),
+		PacingEngine: pacing.NewEngine(mgr.GetClient(), podNamespace),
+		Recorder:     mgr.GetEventRecorderFor("cachedimage-controller"),
+		PodNamespace: podNamespace,
+	}).SetupWithManager(mgr); err != nil {
+		setupLog.Error(err, "unable to create controller", "controller", "CachedImage")
+		os.Exit(1)
+	}
+	if err = (&controller.CachedImageSetReconciler{
+		Client: mgr.GetClient(),
+		Scheme: mgr.GetScheme(),
+	}).SetupWithManager(mgr); err != nil {
+		setupLog.Error(err, "unable to create controller", "controller", "CachedImageSet")
+		os.Exit(1)
+	}
+	if err = (&controller.DiscoveryPolicyReconciler{
+		Client: mgr.GetClient(),
+		Scheme: mgr.GetScheme(),
+	}).SetupWithManager(mgr); err != nil {
+		setupLog.Error(err, "unable to create controller", "controller", "DiscoveryPolicy")
+		os.Exit(1)
+	}
+	// +kubebuilder:scaffold:builder
+
+	if metricsCertWatcher != nil {
+		setupLog.Info("Adding metrics certificate watcher to manager")
+		if err := mgr.Add(metricsCertWatcher); err != nil {
+			setupLog.Error(err, "unable to add metrics certificate watcher to manager")
+			os.Exit(1)
+		}
+	}
+
+	if webhookCertWatcher != nil {
+		setupLog.Info("Adding webhook certificate watcher to manager")
+		if err := mgr.Add(webhookCertWatcher); err != nil {
+			setupLog.Error(err, "unable to add webhook certificate watcher to manager")
+			os.Exit(1)
+		}
+	}
+
+	if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
+		setupLog.Error(err, "unable to set up health check")
+		os.Exit(1)
+	}
+	if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
+		setupLog.Error(err, "unable to set up ready check")
+		os.Exit(1)
+	}
+
+	setupLog.Info("starting manager")
+	if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
+		setupLog.Error(err, "problem running manager")
+		os.Exit(1)
+	}
+}
diff --git a/config/crd/bases/drop.corewire.io_cachedimages.yaml b/config/crd/bases/drop.corewire.io_cachedimages.yaml
new file mode 100644
index 0000000..c24ecb1
--- /dev/null
+++ b/config/crd/bases/drop.corewire.io_cachedimages.yaml
@@ -0,0 +1,298 @@
+---
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  annotations:
+    controller-gen.kubebuilder.io/version: v0.17.2
+  name: cachedimages.drop.corewire.io
+spec:
+  group: drop.corewire.io
+  names:
+    categories:
+    - drop
+    kind: CachedImage
+    listKind: CachedImageList
+    plural: cachedimages
+    singular: cachedimage
+  scope: Cluster
+  versions:
+  - additionalPrinterColumns:
+    - jsonPath: .spec.image
+      name: Image
+      type: string
+    - jsonPath: .spec.tag
+      name: Tag
+      type: string
+    - jsonPath: .status.conditions[?(@.type=="Ready")].reason
+      name: Status
+      type: string
+    - jsonPath: .status.ready
+      name: Ready
+      type: string
+    - jsonPath: .metadata.creationTimestamp
+      name: Age
+      type: date
+    - jsonPath: .status.resolvedDigest
+      name: Digest
+      priority: 1
+      type: string
+    - description: Parent CachedImageSet
+      jsonPath: .metadata.labels.drop\.corewire\.io/imageset
+      name: Set
+      priority: 1
+      type: string
+    - jsonPath: .status.conditions[?(@.type=="Ready")].message
+      name: Message
+      priority: 1
+      type: string
+    - jsonPath: .spec.policyRef.name
+      name: Policy
+      priority: 1
+      type: string
+    name: v1alpha1
+    schema:
+      openAPIV3Schema:
+        description: CachedImage is the Schema for the cachedimages API.
+ properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: CachedImageSpec defines the desired state of CachedImage. + properties: + digest: + description: Digest to pull (immutable reference). Mutually exclusive + with Tag. + type: string + image: + description: Image is the fully qualified image reference (registry/repository). + minLength: 1 + type: string + imagePullPolicy: + default: Always + description: |- + ImagePullPolicy controls when kubelet pulls the image. + Defaults to Always (checks upstream digest, only downloads if changed). + Set to IfNotPresent to skip the registry check when the tag already exists locally. + enum: + - Always + - IfNotPresent + - Never + type: string + imagePullSecrets: + description: ImagePullSecrets are references to secrets for pulling + from private registries. + items: + description: |- + LocalObjectReference contains enough information to let you locate the + referenced object inside the same namespace. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + type: array + nodeSelector: + additionalProperties: + type: string + description: NodeSelector restricts which nodes to cache the image + on. + type: object + policyRef: + description: PolicyRef references a PullPolicy for pacing controls. + properties: + name: + description: Name of the PullPolicy resource. + minLength: 1 + type: string + required: + - name + type: object + priority: + description: Priority is a pull ordering hint (lower values pulled + first). + format: int32 + type: integer + tag: + description: Tag to pull. Mutually exclusive with Digest. + type: string + tolerations: + description: Tolerations allow targeting tainted nodes. + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). 
Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array + required: + - image + type: object + status: + description: CachedImageStatus defines the observed state of CachedImage. + properties: + cachedNodes: + description: CachedNodes is the list of node names that have successfully + cached the image. + items: + type: string + type: array + conditions: + description: Conditions represent the latest available observations. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. 
+ The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + consecutiveFailures: + description: ConsecutiveFailures counts sequential reconcile failures + for backoff calculation. + format: int32 + type: integer + lastAttemptedAt: + description: LastAttemptedAt is the timestamp of the most recent pull + attempt (success or failure). + format: date-time + type: string + lastPulledAt: + description: LastPulledAt is the timestamp of the most recent successful + pull. + format: date-time + type: string + nodesReady: + description: NodesReady is the number of nodes that have successfully + pulled the image. + format: int32 + type: integer + nodesTargeted: + description: NodesTargeted is the number of nodes that should have + this image. + format: int32 + type: integer + observedGeneration: + description: ObservedGeneration is the last generation reconciled. + format: int64 + type: integer + phase: + description: Phase summarizes the overall state. + enum: + - Pending + - Pulling + - Ready + - Degraded + type: string + ready: + description: Ready is a human-readable "nodesReady/nodesTargeted" + fraction for display. + type: string + resolvedDigest: + description: ResolvedDigest is the sha256 digest of the image as reported + by the container runtime after pull. 
+ type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/bases/drop.corewire.io_cachedimagesets.yaml b/config/crd/bases/drop.corewire.io_cachedimagesets.yaml new file mode 100644 index 0000000..30adc2e --- /dev/null +++ b/config/crd/bases/drop.corewire.io_cachedimagesets.yaml @@ -0,0 +1,265 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.17.2 + name: cachedimagesets.drop.corewire.io +spec: + group: drop.corewire.io + names: + categories: + - drop + kind: CachedImageSet + listKind: CachedImageSetList + plural: cachedimagesets + singular: cachedimageset + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .status.conditions[?(@.type=="Ready")].reason + name: Status + type: string + - jsonPath: .status.imagesReady + name: Ready + type: string + - jsonPath: .status.imagesManaged + name: Managed + type: integer + - jsonPath: .spec.discoveryPolicyRef.name + name: Source + type: string + - jsonPath: .status.conditions[?(@.type=="Ready")].message + name: Message + priority: 1 + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: CachedImageSet is the Schema for the cachedimagesets API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: CachedImageSetSpec defines the desired state of CachedImageSet. + properties: + discoveryPolicyRef: + description: DiscoveryPolicyRef references a DiscoveryPolicy for dynamic + image lists. + properties: + name: + description: Name of the DiscoveryPolicy resource. + minLength: 1 + type: string + required: + - name + type: object + imagePullPolicy: + default: Always + description: ImagePullPolicy controls when kubelet pulls the image + (propagated to children). + enum: + - Always + - IfNotPresent + - Never + type: string + imagePullSecrets: + description: ImagePullSecrets are references to secrets for pulling + from private registries (propagated to children). + items: + description: |- + LocalObjectReference contains enough information to let you locate the + referenced object inside the same namespace. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + type: array + images: + description: Images is a static list of images to cache. + items: + description: ImageEntry defines a single image to include in a set. + properties: + digest: + description: Digest to pull. + type: string + image: + description: Image is the fully qualified image reference (registry/repository). + minLength: 1 + type: string + tag: + description: Tag to pull. 
+ type: string + required: + - image + type: object + type: array + nodeSelector: + additionalProperties: + type: string + description: NodeSelector restricts which nodes to cache images on + (propagated to children). + type: object + policyRef: + description: PolicyRef references a PullPolicy for pacing controls. + properties: + name: + description: Name of the PullPolicy resource. + minLength: 1 + type: string + required: + - name + type: object + tolerations: + description: Tolerations allow targeting tainted nodes (propagated + to children). + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple <key,value,effect> using the matching operator <operator>. + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string.
+ type: string + type: object + type: array + type: object + status: + description: CachedImageSetStatus defines the observed state of CachedImageSet. + properties: + conditions: + description: Conditions represent the latest available observations. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. 
+ maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + imagesManaged: + description: ImagesManaged is the number of CachedImage children managed + by this set. + format: int32 + type: integer + imagesReady: + description: ImagesReady is the number of children in Ready phase. + format: int32 + type: integer + observedGeneration: + description: ObservedGeneration is the last generation reconciled. + format: int64 + type: integer + phase: + description: Phase summarizes the overall state. + enum: + - Pending + - Ready + - Degraded + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/bases/drop.corewire.io_discoverypolicies.yaml b/config/crd/bases/drop.corewire.io_discoverypolicies.yaml new file mode 100644 index 0000000..d4dad33 --- /dev/null +++ b/config/crd/bases/drop.corewire.io_discoverypolicies.yaml @@ -0,0 +1,273 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.17.2 + name: discoverypolicies.drop.corewire.io +spec: + group: drop.corewire.io + names: + categories: + - drop + kind: DiscoveryPolicy + listKind: DiscoveryPolicyList + plural: discoverypolicies + singular: discoverypolicy + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .status.conditions[?(@.type=="Ready")].reason + name: Status + type: string + - jsonPath: .status.sourceCount + name: Sources + type: integer + - jsonPath: .status.imageCount + name: Images + type: integer + - jsonPath: .status.lastSyncTime + name: LastSync + type: date + - jsonPath: .status.conditions[?(@.type=="Ready")].message + name: Message + priority: 1 + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: 
date + name: v1alpha1 + schema: + openAPIV3Schema: + description: DiscoveryPolicy is the Schema for the discoverypolicies API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: DiscoveryPolicySpec defines the desired state of DiscoveryPolicy. + properties: + imageFilter: + description: ImageFilter is a regex to filter discovered images. + type: string + maxImages: + default: 50 + description: MaxImages caps the number of discovered images. + format: int32 + minimum: 1 + type: integer + sources: + description: Sources is the list of discovery backends to query. + items: + description: DiscoverySource defines a single discovery backend. + properties: + prometheus: + description: Prometheus config (when type=prometheus). + properties: + endpoint: + description: Endpoint is the Prometheus API URL. + minLength: 1 + type: string + lookback: + description: |- + Lookback is the time window to aggregate over (e.g. "7d", "24h"). + When set, uses query_range and sums values to rank by total usage. + When unset, uses an instant query (point-in-time). + type: string + query: + description: Query is the PromQL query that must return + an 'image' label. + minLength: 1 + type: string + step: + default: 5m + description: Step is the query resolution step for range + queries. 
+ type: string + required: + - endpoint + - query + type: object + registry: + description: Registry config (when type=registry). + properties: + imageTemplate: + description: |- + ImageTemplate is a Go text/template for constructing the full image reference. + Available variables: .Registry, .Repository, .Tag + type: string + repositories: + description: Repositories is the list of repositories to + query. + items: + type: string + minItems: 1 + type: array + tagFilter: + description: TagFilter is a regex to filter tags. + type: string + topX: + description: TopX limits the number of tags to fetch per + repository. + format: int32 + minimum: 1 + type: integer + url: + description: URL is the registry base URL. + minLength: 1 + type: string + required: + - repositories + - url + type: object + secretRef: + description: SecretRef references a Secret for auth/TLS for + this source. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + type: + description: Type identifies the backend. + enum: + - prometheus + - registry + type: string + required: + - type + type: object + minItems: 1 + type: array + syncInterval: + default: 30m + description: SyncInterval is how often to re-query sources. + type: string + required: + - sources + type: object + status: + description: DiscoveryPolicyStatus defines the observed state of DiscoveryPolicy. + properties: + conditions: + description: Conditions represent the latest available observations. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. 
+ properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + discoveredImages: + description: DiscoveredImages is the list of discovered images from + all sources. 
+ items: + description: DiscoveredImage represents a single discovered image + with metadata. + properties: + image: + description: Image is the fully qualified image reference. + type: string + score: + description: Score is the ranking score from the source (higher + = more relevant). + format: int64 + type: integer + source: + description: Source identifies which discovery source produced + this image. + type: string + required: + - image + - score + - source + type: object + type: array + imageCount: + description: ImageCount is the number of discovered images. + format: int32 + type: integer + lastSyncTime: + description: LastSyncTime is the timestamp of the last successful + sync. + format: date-time + type: string + sourceCount: + description: SourceCount is the number of configured sources. + format: int32 + type: integer + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/bases/drop.corewire.io_pullpolicies.yaml b/config/crd/bases/drop.corewire.io_pullpolicies.yaml new file mode 100644 index 0000000..e98302b --- /dev/null +++ b/config/crd/bases/drop.corewire.io_pullpolicies.yaml @@ -0,0 +1,136 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.17.2 + name: pullpolicies.drop.corewire.io +spec: + group: drop.corewire.io + names: + categories: + - drop + kind: PullPolicy + listKind: PullPolicyList + plural: pullpolicies + singular: pullpolicy + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .spec.maxConcurrentNodes + name: MaxNodes + type: integer + - jsonPath: .spec.minDelayBetweenPulls + name: MinDelay + type: string + - jsonPath: .spec.repullInterval + name: RepullInterval + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + PullPolicy is the Schema for the pullpolicies API. 
+ It is a configuration-only resource with no status. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: PullPolicySpec defines pacing and behavior configuration + for image pulls. + properties: + failureBackoff: + description: FailureBackoff configures retry delays on pull failures. + properties: + initial: + default: 30s + description: Initial delay before first retry. + type: string + max: + default: 5m + description: Max delay cap for exponential backoff. + type: string + type: object + maxConcurrentNodes: + default: 1 + description: MaxConcurrentNodes is the max nodes pulling simultaneously + for this policy. + format: int32 + minimum: 1 + type: integer + minDelayBetweenPulls: + default: 10s + description: MinDelayBetweenPulls is the minimum time between starting + pulls on different nodes. + type: string + nodeSelector: + additionalProperties: + type: string + description: NodeSelector scopes this policy to a specific node pool. + type: object + repullInterval: + description: RepullInterval is how often to re-pull cached images. + Zero or unset means never re-pull. + type: string + tolerations: + description: Tolerations match tainted nodes in the pool. 
+ items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple <key,value,effect> using the matching operator <operator>. + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array + type: object + type: object + served: true + storage: true + subresources: {} diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml new file mode 100644 index 0000000..5cb47bb --- /dev/null +++ b/config/crd/kustomization.yaml @@ -0,0 +1,19 @@ +# This kustomization.yaml is not intended to be run by itself, +# since it depends on service name and namespace that are out of this kustomize package.
+# It should be run by config/default +resources: +- bases/drop.corewire.io_cachedimages.yaml +- bases/drop.corewire.io_cachedimagesets.yaml +- bases/drop.corewire.io_pullpolicies.yaml +- bases/drop.corewire.io_discoverypolicies.yaml +# +kubebuilder:scaffold:crdkustomizeresource + +patches: +# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix. +# patches here are for enabling the conversion webhook for each CRD +# +kubebuilder:scaffold:crdkustomizewebhookpatch + +# [WEBHOOK] To enable webhook, uncomment the following section +# the following config is for teaching kustomize how to do kustomization for CRDs. +#configurations: +#- kustomizeconfig.yaml diff --git a/config/crd/kustomizeconfig.yaml b/config/crd/kustomizeconfig.yaml new file mode 100644 index 0000000..ec5c150 --- /dev/null +++ b/config/crd/kustomizeconfig.yaml @@ -0,0 +1,19 @@ +# This file is for teaching kustomize how to substitute name and namespace reference in CRD +nameReference: +- kind: Service + version: v1 + fieldSpecs: + - kind: CustomResourceDefinition + version: v1 + group: apiextensions.k8s.io + path: spec/conversion/webhook/clientConfig/service/name + +namespace: +- kind: CustomResourceDefinition + version: v1 + group: apiextensions.k8s.io + path: spec/conversion/webhook/clientConfig/service/namespace + create: false + +varReference: +- path: metadata/annotations diff --git a/config/default/cert_metrics_manager_patch.yaml b/config/default/cert_metrics_manager_patch.yaml new file mode 100644 index 0000000..d975015 --- /dev/null +++ b/config/default/cert_metrics_manager_patch.yaml @@ -0,0 +1,30 @@ +# This patch adds the args, volumes, and ports to allow the manager to use the metrics-server certs. 
+ +# Add the volumeMount for the metrics-server certs +- op: add + path: /spec/template/spec/containers/0/volumeMounts/- + value: + mountPath: /tmp/k8s-metrics-server/metrics-certs + name: metrics-certs + readOnly: true + +# Add the --metrics-cert-path argument for the metrics server +- op: add + path: /spec/template/spec/containers/0/args/- + value: --metrics-cert-path=/tmp/k8s-metrics-server/metrics-certs + +# Add the metrics-server certs volume configuration +- op: add + path: /spec/template/spec/volumes/- + value: + name: metrics-certs + secret: + secretName: metrics-server-cert + optional: false + items: + - key: ca.crt + path: ca.crt + - key: tls.crt + path: tls.crt + - key: tls.key + path: tls.key diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml new file mode 100644 index 0000000..f5e120d --- /dev/null +++ b/config/default/kustomization.yaml @@ -0,0 +1,234 @@ +# Adds namespace to all resources. +namespace: drop-system + +# Value of this field is prepended to the +# names of all resources, e.g. a deployment named +# "wordpress" becomes "alices-wordpress". +# Note that it should also match with the prefix (text before '-') of the namespace +# field above. +namePrefix: drop- + +# Labels to add to all resources and selectors. +#labels: +#- includeSelectors: true +# pairs: +# someName: someValue + +resources: +- ../crd +- ../rbac +- ../manager +# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in +# crd/kustomization.yaml +#- ../webhook +# [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required. +#- ../certmanager +# [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'. +#- ../prometheus +# [METRICS] Expose the controller manager metrics service. +- metrics_service.yaml +# [NETWORK POLICY] Protect the /metrics endpoint and Webhook Server with NetworkPolicy. 
+# Only Pod(s) running in a namespace labeled with 'metrics: enabled' will be able to gather the metrics. +# Only CR(s) which require webhooks and are applied on namespaces labeled with 'webhooks: enabled' will +# be able to communicate with the Webhook Server. +#- ../network-policy + +# Uncomment the patches line if you enable Metrics +patches: +# [METRICS] The following patch will enable the metrics endpoint using HTTPS and the port :8443. +# More info: https://book.kubebuilder.io/reference/metrics +- path: manager_metrics_patch.yaml + target: + kind: Deployment + +# Uncomment the patches line if you enable Metrics and CertManager +# [METRICS-WITH-CERTS] To enable metrics protected with certManager, uncomment the following line. +# This patch will protect the metrics with certManager self-signed certs. +#- path: cert_metrics_manager_patch.yaml +# target: +# kind: Deployment + +# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in +# crd/kustomization.yaml +#- path: manager_webhook_patch.yaml +# target: +# kind: Deployment + +# [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER' prefix. +# Uncomment the following replacements to add the cert-manager CA injection annotations +#replacements: +# - source: # Uncomment the following block to enable certificates for metrics +# kind: Service +# version: v1 +# name: controller-manager-metrics-service +# fieldPath: metadata.name +# targets: +# - select: +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: metrics-certs +# fieldPaths: +# - spec.dnsNames.0 +# - spec.dnsNames.1 +# options: +# delimiter: '.'
+# index: 0 +# create: true +# - select: # Uncomment the following to set the Service name for TLS config in Prometheus ServiceMonitor +# kind: ServiceMonitor +# group: monitoring.coreos.com +# version: v1 +# name: controller-manager-metrics-monitor +# fieldPaths: +# - spec.endpoints.0.tlsConfig.serverName +# options: +# delimiter: '.' +# index: 0 +# create: true +# +# - source: +# kind: Service +# version: v1 +# name: controller-manager-metrics-service +# fieldPath: metadata.namespace +# targets: +# - select: +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: metrics-certs +# fieldPaths: +# - spec.dnsNames.0 +# - spec.dnsNames.1 +# options: +# delimiter: '.' +# index: 1 +# create: true +# - select: # Uncomment the following to set the Service namespace for TLS in Prometheus ServiceMonitor +# kind: ServiceMonitor +# group: monitoring.coreos.com +# version: v1 +# name: controller-manager-metrics-monitor +# fieldPaths: +# - spec.endpoints.0.tlsConfig.serverName +# options: +# delimiter: '.' +# index: 1 +# create: true +# +# - source: # Uncomment the following block if you have any webhook +# kind: Service +# version: v1 +# name: webhook-service +# fieldPath: .metadata.name # Name of the service +# targets: +# - select: +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: serving-cert +# fieldPaths: +# - .spec.dnsNames.0 +# - .spec.dnsNames.1 +# options: +# delimiter: '.' +# index: 0 +# create: true +# - source: +# kind: Service +# version: v1 +# name: webhook-service +# fieldPath: .metadata.namespace # Namespace of the service +# targets: +# - select: +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: serving-cert +# fieldPaths: +# - .spec.dnsNames.0 +# - .spec.dnsNames.1 +# options: +# delimiter: '.' 
+# index: 1 +# create: true +# +# - source: # Uncomment the following block if you have a ValidatingWebhook (--programmatic-validation) +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: serving-cert # This name should match the one in certificate.yaml +# fieldPath: .metadata.namespace # Namespace of the certificate CR +# targets: +# - select: +# kind: ValidatingWebhookConfiguration +# fieldPaths: +# - .metadata.annotations.[cert-manager.io/inject-ca-from] +# options: +# delimiter: '/' +# index: 0 +# create: true +# - source: +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: serving-cert +# fieldPath: .metadata.name +# targets: +# - select: +# kind: ValidatingWebhookConfiguration +# fieldPaths: +# - .metadata.annotations.[cert-manager.io/inject-ca-from] +# options: +# delimiter: '/' +# index: 1 +# create: true +# +# - source: # Uncomment the following block if you have a DefaultingWebhook (--defaulting ) +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: serving-cert +# fieldPath: .metadata.namespace # Namespace of the certificate CR +# targets: +# - select: +# kind: MutatingWebhookConfiguration +# fieldPaths: +# - .metadata.annotations.[cert-manager.io/inject-ca-from] +# options: +# delimiter: '/' +# index: 0 +# create: true +# - source: +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: serving-cert +# fieldPath: .metadata.name +# targets: +# - select: +# kind: MutatingWebhookConfiguration +# fieldPaths: +# - .metadata.annotations.[cert-manager.io/inject-ca-from] +# options: +# delimiter: '/' +# index: 1 +# create: true +# +# - source: # Uncomment the following block if you have a ConversionWebhook (--conversion) +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: serving-cert +# fieldPath: .metadata.namespace # Namespace of the certificate CR +# targets: # Do not remove or uncomment the following scaffold marker; required to generate code for target CRD. 
+# +kubebuilder:scaffold:crdkustomizecainjectionns +# - source: +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: serving-cert +# fieldPath: .metadata.name +# targets: # Do not remove or uncomment the following scaffold marker; required to generate code for target CRD. +# +kubebuilder:scaffold:crdkustomizecainjectionname diff --git a/config/default/manager_metrics_patch.yaml b/config/default/manager_metrics_patch.yaml new file mode 100644 index 0000000..2aaef65 --- /dev/null +++ b/config/default/manager_metrics_patch.yaml @@ -0,0 +1,4 @@ +# This patch adds the args to allow exposing the metrics endpoint using HTTPS +- op: add + path: /spec/template/spec/containers/0/args/0 + value: --metrics-bind-address=:8443 diff --git a/config/default/metrics_service.yaml b/config/default/metrics_service.yaml new file mode 100644 index 0000000..863324f --- /dev/null +++ b/config/default/metrics_service.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + control-plane: controller-manager + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: controller-manager-metrics-service + namespace: system +spec: + ports: + - name: https + port: 8443 + protocol: TCP + targetPort: 8443 + selector: + control-plane: controller-manager + app.kubernetes.io/name: drop diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml new file mode 100644 index 0000000..5c5f0b8 --- /dev/null +++ b/config/manager/kustomization.yaml @@ -0,0 +1,2 @@ +resources: +- manager.yaml diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml new file mode 100644 index 0000000..78e298e --- /dev/null +++ b/config/manager/manager.yaml @@ -0,0 +1,98 @@ +apiVersion: v1 +kind: Namespace +metadata: + labels: + control-plane: controller-manager + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: system +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: controller-manager + 
namespace: system + labels: + control-plane: controller-manager + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize +spec: + selector: + matchLabels: + control-plane: controller-manager + app.kubernetes.io/name: drop + replicas: 1 + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + labels: + control-plane: controller-manager + app.kubernetes.io/name: drop + spec: + # TODO(user): Uncomment the following code to configure the nodeAffinity expression + # according to the platforms which are supported by your solution. + # It is considered best practice to support multiple architectures. You can + # build your manager image using the makefile target docker-buildx. + # affinity: + # nodeAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # nodeSelectorTerms: + # - matchExpressions: + # - key: kubernetes.io/arch + # operator: In + # values: + # - amd64 + # - arm64 + # - ppc64le + # - s390x + # - key: kubernetes.io/os + # operator: In + # values: + # - linux + securityContext: + # Projects are configured by default to adhere to the "restricted" Pod Security Standards. + # This ensures that deployments meet the highest security requirements for Kubernetes. + # For more details, see: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + containers: + - command: + - /manager + args: + - --leader-elect + - --health-probe-bind-address=:8081 + image: controller:latest + name: manager + ports: [] + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - "ALL" + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + # TODO(user): Configure the resources accordingly based on the project requirements. 
+ # More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + resources: + limits: + cpu: 500m + memory: 128Mi + requests: + cpu: 10m + memory: 64Mi + volumeMounts: [] + volumes: [] + serviceAccountName: controller-manager + terminationGracePeriodSeconds: 10 diff --git a/config/network-policy/allow-metrics-traffic.yaml b/config/network-policy/allow-metrics-traffic.yaml new file mode 100644 index 0000000..3e217da --- /dev/null +++ b/config/network-policy/allow-metrics-traffic.yaml @@ -0,0 +1,27 @@ +# This NetworkPolicy allows ingress traffic +# from Pods running in namespaces labeled with 'metrics: enabled'. Only Pods in those +# namespaces are able to gather data from the metrics endpoint. +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + labels: + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: allow-metrics-traffic + namespace: system +spec: + podSelector: + matchLabels: + control-plane: controller-manager + app.kubernetes.io/name: drop + policyTypes: + - Ingress + ingress: + # This allows ingress traffic from any namespace with the label metrics: enabled + - from: + - namespaceSelector: + matchLabels: + metrics: enabled # Only from namespaces with this label + ports: + - port: 8443 + protocol: TCP diff --git a/config/network-policy/kustomization.yaml b/config/network-policy/kustomization.yaml new file mode 100644 index 0000000..ec0fb5e --- /dev/null +++ b/config/network-policy/kustomization.yaml @@ -0,0 +1,2 @@ +resources: +- allow-metrics-traffic.yaml diff --git a/config/prometheus/kustomization.yaml b/config/prometheus/kustomization.yaml new file mode 100644 index 0000000..fdc5481 --- /dev/null +++ b/config/prometheus/kustomization.yaml @@ -0,0 +1,11 @@ +resources: +- monitor.yaml + +# [PROMETHEUS-WITH-CERTS] The following patch configures the ServiceMonitor in ../prometheus +# to securely reference certificates created and managed by cert-manager. 
+# Additionally, ensure that you uncomment the [METRICS WITH CERTMANAGER] patch under config/default/kustomization.yaml +# to mount the "metrics-server-cert" secret in the Manager Deployment. +#patches: +# - path: monitor_tls_patch.yaml +# target: +# kind: ServiceMonitor diff --git a/config/prometheus/monitor.yaml b/config/prometheus/monitor.yaml new file mode 100644 index 0000000..3477bc1 --- /dev/null +++ b/config/prometheus/monitor.yaml @@ -0,0 +1,27 @@ +# Prometheus Monitor Service (Metrics) +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + control-plane: controller-manager + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: controller-manager-metrics-monitor + namespace: system +spec: + endpoints: + - path: /metrics + port: https # Ensure this is the name of the port that exposes HTTPS metrics + scheme: https + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + tlsConfig: + # TODO(user): The option insecureSkipVerify: true is not recommended for production since it disables + # certificate verification, exposing the system to potential man-in-the-middle attacks. + # For production environments, it is recommended to use cert-manager for automatic TLS certificate management. + # To apply this configuration, enable cert-manager and use the patch located at config/prometheus/monitor_tls_patch.yaml, + # which securely references the certificate from the 'metrics-server-cert' secret. 
+ insecureSkipVerify: true + selector: + matchLabels: + control-plane: controller-manager + app.kubernetes.io/name: drop diff --git a/config/prometheus/monitor_tls_patch.yaml b/config/prometheus/monitor_tls_patch.yaml new file mode 100644 index 0000000..5bf84ce --- /dev/null +++ b/config/prometheus/monitor_tls_patch.yaml @@ -0,0 +1,19 @@ +# Patch for Prometheus ServiceMonitor to enable secure TLS configuration +# using certificates managed by cert-manager +- op: replace + path: /spec/endpoints/0/tlsConfig + value: + # SERVICE_NAME and SERVICE_NAMESPACE will be substituted by kustomize + serverName: SERVICE_NAME.SERVICE_NAMESPACE.svc + insecureSkipVerify: false + ca: + secret: + name: metrics-server-cert + key: ca.crt + cert: + secret: + name: metrics-server-cert + key: tls.crt + keySecret: + name: metrics-server-cert + key: tls.key diff --git a/config/rbac/cachedimage_admin_role.yaml b/config/rbac/cachedimage_admin_role.yaml new file mode 100644 index 0000000..3bcc772 --- /dev/null +++ b/config/rbac/cachedimage_admin_role.yaml @@ -0,0 +1,27 @@ +# This rule is not used by the project drop itself. +# It is provided to allow the cluster admin to help manage permissions for users. +# +# Grants full permissions ('*') over drop.corewire.io. +# This role is intended for users authorized to modify roles and bindings within the cluster, +# enabling them to delegate specific permissions to other users or groups as needed. 
+ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: cachedimage-admin-role +rules: +- apiGroups: + - drop.corewire.io + resources: + - cachedimages + verbs: + - '*' +- apiGroups: + - drop.corewire.io + resources: + - cachedimages/status + verbs: + - get diff --git a/config/rbac/cachedimage_editor_role.yaml b/config/rbac/cachedimage_editor_role.yaml new file mode 100644 index 0000000..7a23a4b --- /dev/null +++ b/config/rbac/cachedimage_editor_role.yaml @@ -0,0 +1,33 @@ +# This rule is not used by the project drop itself. +# It is provided to allow the cluster admin to help manage permissions for users. +# +# Grants permissions to create, update, and delete resources within the drop.corewire.io. +# This role is intended for users who need to manage these resources +# but should not control RBAC or manage permissions for others. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: cachedimage-editor-role +rules: +- apiGroups: + - drop.corewire.io + resources: + - cachedimages + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - drop.corewire.io + resources: + - cachedimages/status + verbs: + - get diff --git a/config/rbac/cachedimage_viewer_role.yaml b/config/rbac/cachedimage_viewer_role.yaml new file mode 100644 index 0000000..8e8c17d --- /dev/null +++ b/config/rbac/cachedimage_viewer_role.yaml @@ -0,0 +1,29 @@ +# This rule is not used by the project drop itself. +# It is provided to allow the cluster admin to help manage permissions for users. +# +# Grants read-only access to drop.corewire.io resources. +# This role is intended for users who need visibility into these resources +# without permissions to modify them. It is ideal for monitoring purposes and limited-access viewing. 
+ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: cachedimage-viewer-role +rules: +- apiGroups: + - drop.corewire.io + resources: + - cachedimages + verbs: + - get + - list + - watch +- apiGroups: + - drop.corewire.io + resources: + - cachedimages/status + verbs: + - get diff --git a/config/rbac/cachedimageset_admin_role.yaml b/config/rbac/cachedimageset_admin_role.yaml new file mode 100644 index 0000000..0005080 --- /dev/null +++ b/config/rbac/cachedimageset_admin_role.yaml @@ -0,0 +1,27 @@ +# This rule is not used by the project drop itself. +# It is provided to allow the cluster admin to help manage permissions for users. +# +# Grants full permissions ('*') over drop.corewire.io. +# This role is intended for users authorized to modify roles and bindings within the cluster, +# enabling them to delegate specific permissions to other users or groups as needed. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: cachedimageset-admin-role +rules: +- apiGroups: + - drop.corewire.io + resources: + - cachedimagesets + verbs: + - '*' +- apiGroups: + - drop.corewire.io + resources: + - cachedimagesets/status + verbs: + - get diff --git a/config/rbac/cachedimageset_editor_role.yaml b/config/rbac/cachedimageset_editor_role.yaml new file mode 100644 index 0000000..d971497 --- /dev/null +++ b/config/rbac/cachedimageset_editor_role.yaml @@ -0,0 +1,33 @@ +# This rule is not used by the project drop itself. +# It is provided to allow the cluster admin to help manage permissions for users. +# +# Grants permissions to create, update, and delete resources within the drop.corewire.io. +# This role is intended for users who need to manage these resources +# but should not control RBAC or manage permissions for others. 
+ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: cachedimageset-editor-role +rules: +- apiGroups: + - drop.corewire.io + resources: + - cachedimagesets + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - drop.corewire.io + resources: + - cachedimagesets/status + verbs: + - get diff --git a/config/rbac/cachedimageset_viewer_role.yaml b/config/rbac/cachedimageset_viewer_role.yaml new file mode 100644 index 0000000..95b3290 --- /dev/null +++ b/config/rbac/cachedimageset_viewer_role.yaml @@ -0,0 +1,29 @@ +# This rule is not used by the project drop itself. +# It is provided to allow the cluster admin to help manage permissions for users. +# +# Grants read-only access to drop.corewire.io resources. +# This role is intended for users who need visibility into these resources +# without permissions to modify them. It is ideal for monitoring purposes and limited-access viewing. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: cachedimageset-viewer-role +rules: +- apiGroups: + - drop.corewire.io + resources: + - cachedimagesets + verbs: + - get + - list + - watch +- apiGroups: + - drop.corewire.io + resources: + - cachedimagesets/status + verbs: + - get diff --git a/config/rbac/discoverypolicy_admin_role.yaml b/config/rbac/discoverypolicy_admin_role.yaml new file mode 100644 index 0000000..f10d35d --- /dev/null +++ b/config/rbac/discoverypolicy_admin_role.yaml @@ -0,0 +1,27 @@ +# This rule is not used by the project drop itself. +# It is provided to allow the cluster admin to help manage permissions for users. +# +# Grants full permissions ('*') over drop.corewire.io. 
+# This role is intended for users authorized to modify roles and bindings within the cluster, +# enabling them to delegate specific permissions to other users or groups as needed. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: discoverypolicy-admin-role +rules: +- apiGroups: + - drop.corewire.io + resources: + - discoverypolicies + verbs: + - '*' +- apiGroups: + - drop.corewire.io + resources: + - discoverypolicies/status + verbs: + - get diff --git a/config/rbac/discoverypolicy_editor_role.yaml b/config/rbac/discoverypolicy_editor_role.yaml new file mode 100644 index 0000000..34a7a55 --- /dev/null +++ b/config/rbac/discoverypolicy_editor_role.yaml @@ -0,0 +1,33 @@ +# This rule is not used by the project drop itself. +# It is provided to allow the cluster admin to help manage permissions for users. +# +# Grants permissions to create, update, and delete resources within the drop.corewire.io. +# This role is intended for users who need to manage these resources +# but should not control RBAC or manage permissions for others. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: discoverypolicy-editor-role +rules: +- apiGroups: + - drop.corewire.io + resources: + - discoverypolicies + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - drop.corewire.io + resources: + - discoverypolicies/status + verbs: + - get diff --git a/config/rbac/discoverypolicy_viewer_role.yaml b/config/rbac/discoverypolicy_viewer_role.yaml new file mode 100644 index 0000000..48d68bc --- /dev/null +++ b/config/rbac/discoverypolicy_viewer_role.yaml @@ -0,0 +1,29 @@ +# This rule is not used by the project drop itself. +# It is provided to allow the cluster admin to help manage permissions for users. 
+# +# Grants read-only access to drop.corewire.io resources. +# This role is intended for users who need visibility into these resources +# without permissions to modify them. It is ideal for monitoring purposes and limited-access viewing. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: discoverypolicy-viewer-role +rules: +- apiGroups: + - drop.corewire.io + resources: + - discoverypolicies + verbs: + - get + - list + - watch +- apiGroups: + - drop.corewire.io + resources: + - discoverypolicies/status + verbs: + - get diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml new file mode 100644 index 0000000..3afcc63 --- /dev/null +++ b/config/rbac/kustomization.yaml @@ -0,0 +1,37 @@ +resources: +# All RBAC will be applied under this service account in +# the deployment namespace. You may comment out this resource +# if your manager will use a service account that exists at +# runtime. Be sure to update RoleBinding and ClusterRoleBinding +# subjects if changing service account names. +- service_account.yaml +- role.yaml +- role_binding.yaml +- leader_election_role.yaml +- leader_election_role_binding.yaml +# The following RBAC configurations are used to protect +# the metrics endpoint with authn/authz. These configurations +# ensure that only authorized users and service accounts +# can access the metrics endpoint. Comment the following +# permissions if you want to disable this protection. +# More info: https://book.kubebuilder.io/reference/metrics.html +- metrics_auth_role.yaml +- metrics_auth_role_binding.yaml +- metrics_reader_role.yaml +# For each CRD, "Admin", "Editor" and "Viewer" roles are scaffolded by +# default, aiding admins in cluster management. Those roles are +# not used by the project itself. You can comment the following lines +# if you do not want those helpers to be installed with your Project. 
+- discoverypolicy_admin_role.yaml +- discoverypolicy_editor_role.yaml +- discoverypolicy_viewer_role.yaml +- pullpolicy_admin_role.yaml +- pullpolicy_editor_role.yaml +- pullpolicy_viewer_role.yaml +- cachedimageset_admin_role.yaml +- cachedimageset_editor_role.yaml +- cachedimageset_viewer_role.yaml +- cachedimage_admin_role.yaml +- cachedimage_editor_role.yaml +- cachedimage_viewer_role.yaml + diff --git a/config/rbac/leader_election_role.yaml b/config/rbac/leader_election_role.yaml new file mode 100644 index 0000000..e239de9 --- /dev/null +++ b/config/rbac/leader_election_role.yaml @@ -0,0 +1,40 @@ +# permissions to do leader election. +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + labels: + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: leader-election-role +rules: +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch diff --git a/config/rbac/leader_election_role_binding.yaml b/config/rbac/leader_election_role_binding.yaml new file mode 100644 index 0000000..3db0a7f --- /dev/null +++ b/config/rbac/leader_election_role_binding.yaml @@ -0,0 +1,15 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + labels: + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: leader-election-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: leader-election-role +subjects: +- kind: ServiceAccount + name: controller-manager + namespace: system diff --git a/config/rbac/metrics_auth_role.yaml b/config/rbac/metrics_auth_role.yaml new file mode 100644 index 0000000..32d2e4e --- /dev/null +++ b/config/rbac/metrics_auth_role.yaml @@ -0,0 +1,17 @@ +apiVersion: 
rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: metrics-auth-role +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create diff --git a/config/rbac/metrics_auth_role_binding.yaml b/config/rbac/metrics_auth_role_binding.yaml new file mode 100644 index 0000000..e775d67 --- /dev/null +++ b/config/rbac/metrics_auth_role_binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: metrics-auth-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: metrics-auth-role +subjects: +- kind: ServiceAccount + name: controller-manager + namespace: system diff --git a/config/rbac/metrics_reader_role.yaml b/config/rbac/metrics_reader_role.yaml new file mode 100644 index 0000000..51a75db --- /dev/null +++ b/config/rbac/metrics_reader_role.yaml @@ -0,0 +1,9 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: metrics-reader +rules: +- nonResourceURLs: + - "/metrics" + verbs: + - get diff --git a/config/rbac/pullpolicy_admin_role.yaml b/config/rbac/pullpolicy_admin_role.yaml new file mode 100644 index 0000000..337b7f0 --- /dev/null +++ b/config/rbac/pullpolicy_admin_role.yaml @@ -0,0 +1,27 @@ +# This rule is not used by the project drop itself. +# It is provided to allow the cluster admin to help manage permissions for users. +# +# Grants full permissions ('*') over drop.corewire.io. +# This role is intended for users authorized to modify roles and bindings within the cluster, +# enabling them to delegate specific permissions to other users or groups as needed. 
+ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: pullpolicy-admin-role +rules: +- apiGroups: + - drop.corewire.io + resources: + - pullpolicies + verbs: + - '*' +- apiGroups: + - drop.corewire.io + resources: + - pullpolicies/status + verbs: + - get diff --git a/config/rbac/pullpolicy_editor_role.yaml b/config/rbac/pullpolicy_editor_role.yaml new file mode 100644 index 0000000..7ee2512 --- /dev/null +++ b/config/rbac/pullpolicy_editor_role.yaml @@ -0,0 +1,33 @@ +# This rule is not used by the project drop itself. +# It is provided to allow the cluster admin to help manage permissions for users. +# +# Grants permissions to create, update, and delete resources within the drop.corewire.io. +# This role is intended for users who need to manage these resources +# but should not control RBAC or manage permissions for others. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: pullpolicy-editor-role +rules: +- apiGroups: + - drop.corewire.io + resources: + - pullpolicies + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - drop.corewire.io + resources: + - pullpolicies/status + verbs: + - get diff --git a/config/rbac/pullpolicy_viewer_role.yaml b/config/rbac/pullpolicy_viewer_role.yaml new file mode 100644 index 0000000..e0f472c --- /dev/null +++ b/config/rbac/pullpolicy_viewer_role.yaml @@ -0,0 +1,29 @@ +# This rule is not used by the project drop itself. +# It is provided to allow the cluster admin to help manage permissions for users. +# +# Grants read-only access to drop.corewire.io resources. +# This role is intended for users who need visibility into these resources +# without permissions to modify them. It is ideal for monitoring purposes and limited-access viewing. 
+ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: pullpolicy-viewer-role +rules: +- apiGroups: + - drop.corewire.io + resources: + - pullpolicies + verbs: + - get + - list + - watch +- apiGroups: + - drop.corewire.io + resources: + - pullpolicies/status + verbs: + - get diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml new file mode 100644 index 0000000..76ec601 --- /dev/null +++ b/config/rbac/role.yaml @@ -0,0 +1,72 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: manager-role +rules: +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch +- apiGroups: + - "" + resources: + - nodes + - secrets + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - pods + verbs: + - create + - delete + - get + - list + - watch +- apiGroups: + - drop.corewire.io + resources: + - cachedimages + - cachedimagesets + - discoverypolicies + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - drop.corewire.io + resources: + - cachedimages/finalizers + - cachedimagesets/finalizers + - discoverypolicies/finalizers + verbs: + - update +- apiGroups: + - drop.corewire.io + resources: + - cachedimages/status + - cachedimagesets/status + - discoverypolicies/status + verbs: + - get + - patch + - update +- apiGroups: + - drop.corewire.io + resources: + - pullpolicies + verbs: + - get + - list + - watch diff --git a/config/rbac/role_binding.yaml b/config/rbac/role_binding.yaml new file mode 100644 index 0000000..475e845 --- /dev/null +++ b/config/rbac/role_binding.yaml @@ -0,0 +1,15 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: manager-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: manager-role 
+subjects: +- kind: ServiceAccount + name: controller-manager + namespace: system diff --git a/config/rbac/service_account.yaml b/config/rbac/service_account.yaml new file mode 100644 index 0000000..03bbd08 --- /dev/null +++ b/config/rbac/service_account.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: controller-manager + namespace: system diff --git a/config/samples/drop_v1alpha1_cachedimage.yaml b/config/samples/drop_v1alpha1_cachedimage.yaml new file mode 100644 index 0000000..fb30ce1 --- /dev/null +++ b/config/samples/drop_v1alpha1_cachedimage.yaml @@ -0,0 +1,9 @@ +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + labels: + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: cachedimage-sample +spec: + # TODO(user): Add fields here diff --git a/config/samples/drop_v1alpha1_cachedimageset.yaml b/config/samples/drop_v1alpha1_cachedimageset.yaml new file mode 100644 index 0000000..26e51d4 --- /dev/null +++ b/config/samples/drop_v1alpha1_cachedimageset.yaml @@ -0,0 +1,9 @@ +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImageSet +metadata: + labels: + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: cachedimageset-sample +spec: + # TODO(user): Add fields here diff --git a/config/samples/drop_v1alpha1_discoverypolicy.yaml b/config/samples/drop_v1alpha1_discoverypolicy.yaml new file mode 100644 index 0000000..3bf771b --- /dev/null +++ b/config/samples/drop_v1alpha1_discoverypolicy.yaml @@ -0,0 +1,9 @@ +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + labels: + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: discoverypolicy-sample +spec: + # TODO(user): Add fields here diff --git a/config/samples/drop_v1alpha1_pullpolicy.yaml b/config/samples/drop_v1alpha1_pullpolicy.yaml new file mode 100644 index 0000000..e409b06 --- 
/dev/null +++ b/config/samples/drop_v1alpha1_pullpolicy.yaml @@ -0,0 +1,9 @@ +apiVersion: drop.corewire.io/v1alpha1 +kind: PullPolicy +metadata: + labels: + app.kubernetes.io/name: drop + app.kubernetes.io/managed-by: kustomize + name: pullpolicy-sample +spec: + # TODO(user): Add fields here diff --git a/config/samples/kustomization.yaml b/config/samples/kustomization.yaml new file mode 100644 index 0000000..6818798 --- /dev/null +++ b/config/samples/kustomization.yaml @@ -0,0 +1,7 @@ +## Append samples of your project ## +resources: +- drop_v1alpha1_cachedimage.yaml +- drop_v1alpha1_cachedimageset.yaml +- drop_v1alpha1_pullpolicy.yaml +- drop_v1alpha1_discoverypolicy.yaml +# +kubebuilder:scaffold:manifestskustomizesamples diff --git a/docs/.hugo_build.lock b/docs/.hugo_build.lock new file mode 100644 index 0000000..e69de29 diff --git a/docs/assets/css/custom.css b/docs/assets/css/custom.css new file mode 100644 index 0000000..bccd6ee --- /dev/null +++ b/docs/assets/css/custom.css @@ -0,0 +1,10 @@ +/* Mermaid diagrams: fill available width */ +pre.mermaid svg { + max-width: 100% !important; + height: auto !important; +} + +/* Asciinema player: ensure full width in containers */ +.ap-player { + width: 100% !important; +} diff --git a/docs/content/_index.md b/docs/content/_index.md new file mode 100644 index 0000000..5577018 --- /dev/null +++ b/docs/content/_index.md @@ -0,0 +1,63 @@ +--- +title: Drop +layout: hextra-home +description: Kubernetes operator that pre-caches container images on cluster nodes. +llmsDescription: | + Drop is a Kubernetes operator that pre-caches container images on cluster + nodes. CachedImage CR → Drop Operator → Pod per node → kubelet pulls image + → Pod exits → image cached. CRDs: CachedImage, CachedImageSet, PullPolicy, + DiscoveryPolicy. API group drop.corewire.io/v1alpha1, all cluster-scoped. + No privileged containers — uses kubelet image pulls only. +--- + +
+{{< hextra/hero-headline >}} + Drop +{{< /hextra/hero-headline >}} +
+ +
+{{< hextra/hero-subtitle >}} + Pre-cache container images on Kubernetes nodes. +{{< /hextra/hero-subtitle >}} +
+ +{{< tabs items="Apply + Status,Pods + Nodes,Events" >}} + +{{< tab >}} +{{< asciinema file="casts/apply.cast" autoplay="true" loop="true" speed="0.75" >}} +{{< /tab >}} + +{{< tab >}} +{{< asciinema file="casts/pods.cast" autoplay="true" loop="true" speed="0.75" >}} +{{< /tab >}} + +{{< tab >}} +{{< asciinema file="casts/events.cast" autoplay="true" loop="true" speed="0.75" >}} +{{< /tab >}} + +{{< /tabs >}} + +> Create a CachedImage → operator spawns a Pod per node → kubelet pulls the image → Pod exits → image is warm. No privileges, no DaemonSets. + +--- + +## I want to... + +{{< hextra/feature-grid >}} + {{< hextra/feature-card + title="Use Drop" + subtitle="Install, create CachedImages, configure pacing and discovery." + link="docs/install/" + >}} + {{< hextra/feature-card + title="Develop Drop" + subtitle="Architecture, CRD reference, build and test commands." + link="docs/developing/" + >}} + {{< hextra/feature-card + title="Feed to AI Agent" + subtitle="llms.txt, Markdown API, full reference in one request." + link="docs/for-ai-agents/" + >}} +{{< /hextra/feature-grid >}} diff --git a/docs/content/docs/_index.md b/docs/content/docs/_index.md new file mode 100644 index 0000000..5246c08 --- /dev/null +++ b/docs/content/docs/_index.md @@ -0,0 +1,36 @@ +--- +title: Documentation +weight: 1 +description: Drop operator documentation. +llmsDescription: | + Documentation index for the drop Kubernetes operator. Sections: install, + usage (CachedImage/CachedImageSet/PullPolicy examples), discovery + (DiscoveryPolicy), monitoring (metrics/events), reference (CRD fields, + errors, metrics, architecture), developing (build/test/contribute). +--- + +Drop pre-caches container images on Kubernetes nodes using short-lived Pods. + +## Why + +When many CI jobs or workloads start simultaneously, Kubernetes nodes face a thundering herd of image pulls. 
Concurrent pods on the same node all pulling the same large image saturate bandwidth, stall containerd, and cascade into failures. + +| Problem | Impact | +|---------|--------| +| **Thundering herd** | Parallel pulls of the same image destabilize nodes | +| **Registry overload** | Sudden pull surges hit rate limits or cause outages | +| **Cold-start latency** | Large images delay workloads that need them immediately | + +Drop pre-caches images *before* workloads need them, paces pulls to stay within safe limits, and automatically discovers which images matter most. + +## Sections + +| Section | What you'll find | +|---------|-----------------| +| [Installation](install/) | Helm install, prerequisites | +| [Usage](usage/) | CachedImage, CachedImageSet, PullPolicy examples | +| [Discovery](discovery/) | Automatic image discovery with DiscoveryPolicy | +| [Monitoring](monitoring/) | Prometheus metrics, events, status conditions | +| [Reference](reference/) | CRD field reference, errors, metrics, architecture | +| [Developing](developing/) | Build, test, lint, project structure | +| [For AI Agents](for-ai-agents/) | llms.txt, Markdown API, generation architecture | diff --git a/docs/content/docs/crds.md b/docs/content/docs/crds.md new file mode 100644 index 0000000..f87378f --- /dev/null +++ b/docs/content/docs/crds.md @@ -0,0 +1,115 @@ +--- +title: CRD Reference +weight: 2 +description: Overview of all drop Custom Resource Definitions. +llmsDescription: | + Overview of drop CRDs under drop.corewire.io/v1alpha1. CachedImage caches + a single image, CachedImageSet caches a list via imageListSpec or + discoveryPolicyRef, PullPolicy configures pull behaviour (nodeSelector, + imagePullSecrets, scheduling), DiscoveryPolicy discovers images from external + sources (Prometheus, OCI registry). All cluster-scoped. +--- + +All CRDs are cluster-scoped under `drop.corewire.io/v1alpha1`. + +## CachedImage + +Declares a single container image to cache on target nodes. 
+ +| Field | Type | Description | +|-------|------|-------------| +| `spec.image` | string | **Required.** Full image reference (e.g., `docker.io/library/nginx:1.25`) | +| `spec.nodeSelector` | map[string]string | Label selector for target nodes | +| `spec.tolerations` | []Toleration | Tolerations for tainted nodes | +| `spec.policyRef.name` | string | Reference to a PullPolicy for pacing | +| `spec.pullPolicy` | string | `Always` or `IfNotPresent` (default: `IfNotPresent`) | +| `spec.repullInterval` | duration | Re-pull interval for moving tags (e.g., `24h`) | + +### Status + +| Field | Type | Description | +|-------|------|-------------| +| `status.phase` | string | `Pending`, `Pulling`, `Ready`, or `Degraded` | +| `status.nodesTargeted` | int32 | Number of nodes matching selector | +| `status.nodesReady` | int32 | Number of nodes with image cached | +| `status.lastPulledAt` | time | Timestamp of last successful pull | +| `status.conditions` | []Condition | Standard conditions (Ready) | + +## CachedImageSet + +Manages a collection of CachedImage resources. + +| Field | Type | Description | +|-------|------|-------------| +| `spec.images` | []string | Static list of image references | +| `spec.discoveryPolicyRef.name` | string | Reference to a DiscoveryPolicy | +| `spec.nodeSelector` | map[string]string | Inherited by child CachedImages | +| `spec.tolerations` | []Toleration | Inherited by child CachedImages | +| `spec.policyRef.name` | string | Inherited by child CachedImages | + +## PullPolicy + +Controls pacing for image pulls across the cluster. + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `spec.maxConcurrentNodes` | int32 | `1` | Max nodes pulling simultaneously | +| `spec.minDelayBetweenPulls` | duration | `10s` | Minimum delay between starting new pulls | +| `spec.failureBackoff` | duration | `5m` | Wait time after failure before retry | + +## DiscoveryPolicy + +Discovers images from external sources. 
+ +| Field | Type | Description | +|-------|------|-------------| +| `spec.interval` | duration | How often to query sources (e.g., `1h`) | +| `spec.topX` | int32 | Maximum number of images to discover | +| `spec.imageFilter` | string | Regex filter applied to discovered images | +| `spec.sources` | []Source | List of discovery sources | + +### Source Types + +#### Prometheus + +```yaml +sources: + - type: prometheus + prometheus: + endpoint: https://prometheus.example.com + query: 'count(container_image_pull_total) by (image)' + secretRef: + name: prometheus-creds +``` + +The query must return an `image` label. The metric value becomes the ranking score. + +#### Registry + +```yaml +sources: + - type: registry + registry: + url: https://registry.example.com + repositories: + - my-org/my-app + tagFilter: "^v\\d+\\.\\d+\\.\\d+$" + topX: 5 + imageTemplate: "registry.example.com/{{ .Repository }}:{{ .Tag }}" + secretRef: + name: registry-creds +``` + +### Secret Format + +Secrets referenced by `secretRef` support these well-known keys: + +| Key | Description | +|-----|-------------| +| `token` | Bearer token for Authorization header | +| `username` | Username for basic auth | +| `password` | Password for basic auth | +| `ca.crt` | CA certificate for TLS verification | +| `tls.crt` | Client certificate for mTLS | +| `tls.key` | Client key for mTLS | +| `headers.<name>` | Custom HTTP header value (`<name>` is the header name) | diff --git a/docs/content/docs/developing.md b/docs/content/docs/developing.md new file mode 100644 index 0000000..a40e697 --- /dev/null +++ b/docs/content/docs/developing.md @@ -0,0 +1,20 @@ +--- +title: Developer Guide +weight: 6 +description: Everything you need to build, debug, test, and extend Drop. +llmsDescription: | + Developer guide index. Links to architecture, local dev setup, build commands, + testing, debugging, extending (new CRDs), code conventions, and release process.
+--- + +This guide covers everything needed to work on Drop — from first checkout to shipping a release. + +{{< cards >}} + {{< card link="developing/architecture" title="Architecture" subtitle="Package graph, reconciler flows, design decisions" >}} + {{< card link="developing/setup" title="Local Dev Setup" subtitle="Prerequisites, kind cluster, Tilt" >}} + {{< card link="developing/testing" title="Testing" subtitle="envtest, Chainsaw e2e, patterns" >}} + {{< card link="developing/debugging" title="Debugging" subtitle="Logs, common issues, pacing diagnostics, Delve" >}} + {{< card link="developing/extending" title="Extending" subtitle="Adding a new CRD step-by-step" >}} + {{< card link="developing/conventions" title="Conventions" subtitle="Naming, status patterns, import order, don'ts" >}} + {{< card link="developing/releasing" title="Releasing" subtitle="Tag-triggered CI, multi-arch builds, Helm OCI" >}} +{{< /cards >}} diff --git a/docs/content/docs/developing/_index.md b/docs/content/docs/developing/_index.md new file mode 100644 index 0000000..a940b81 --- /dev/null +++ b/docs/content/docs/developing/_index.md @@ -0,0 +1,4 @@ +--- +title: Developer Guide +weight: 6 +--- diff --git a/docs/content/docs/developing/architecture.md b/docs/content/docs/developing/architecture.md new file mode 100644 index 0000000..7775d73 --- /dev/null +++ b/docs/content/docs/developing/architecture.md @@ -0,0 +1,121 @@ +--- +title: Architecture +weight: 1 +description: How the operator is structured internally. +llmsDescription: | + Architecture of drop operator. Three reconcilers (CachedImage, CachedImageSet, + DiscoveryPolicy), shared pacing engine, pure pod builder, discovery sources + (Prometheus, Registry). All CRDs cluster-scoped. Pods use nodeName + command: ["true"]. +--- + +Drop is a Kubernetes operator that pre-caches container images on cluster nodes by creating short-lived Pods. +It uses **kubelet-based image pulls** (no CRI socket, no privileged containers). 
+ +## High-Level Flow + +``` +CachedImageSet ──owns──▶ CachedImage[] ──creates──▶ Pod (per node) + ▲ │ + │ image pulled by +DiscoveryPolicy ──discovers───┘ kubelet + │ + ├── PrometheusSource (PromQL query) + └── RegistrySource (OCI tag list) +``` + +## Package Dependency Graph + +``` +cmd/main.go + └── internal/controller/ + ├── cachedimage_controller.go (core pull loop) + ├── cachedimageset_controller.go (child management) + └── discoverypolicy_controller.go (image discovery) + │ + ├── internal/pacing/ (rate-limiting engine) + ├── internal/podbuilder/ (pure Pod construction) + ├── internal/discovery/ (source interface + impls) + └── internal/metrics/ (Prometheus counters/gauges) + +api/v1alpha1/ (CRD type definitions — imported by all) +``` + +## Reconciler Responsibilities + +### CachedImage Controller + +The core pull loop. For each CachedImage: +1. Resolve target nodes (by nodeSelector + toleration compatibility) +2. Fetch referenced PullPolicy for pacing config +3. Build per-node state from owned Pods +4. Mark nodes for re-pull if repull interval elapsed +5. Process Pod states (succeeded → mark ready, failed → mark degraded) +6. Schedule pulls respecting pacing engine +7. Update status with phase, ready count, conditions +8. Requeue based on backoff or repull interval + +### CachedImageSet Controller + +Child management. For each CachedImageSet: +1. Build desired image list (static + discovered via DiscoveryPolicy) +2. List existing child CachedImages (by ownerReference) +3. Diff: create missing, delete unwanted children +4. Update status: count ready, propagate failure reasons + +### DiscoveryPolicy Controller + +Image discovery. For each DiscoveryPolicy: +1. Query each source (Prometheus or Registry), measure latency +2. Merge results, deduplicate by highest score +3. Apply image filter (regex) +4. Sort by score, truncate to maxImages +5. Set status: DiscoveredImages, conditions +6. 
Requeue after `spec.interval` + +## Key Design Decisions + +| Decision | Rationale | +|----------|-----------| +| One controller per CRD | Single responsibility; easier to reason about | +| Shared pacing engine | Prevents thundering herd across all CachedImages | +| Pod builder is a pure function | No k8s client = easy to unit test | +| `command: ["true"]` Pods | Kubelet pulls the image, Pod exits immediately | +| `nodeName` placement | Guarantees scheduling to the target node | +| Cluster-scoped CRDs | Images are node-level; namespaces don't apply | +| `metav1.Condition` status | Standard K8s pattern for Ready/Degraded states | +| ownerReferences | CachedImageSet→CachedImage, CachedImage→Pod for GC | + +## Pacing Engine + +Located in `internal/pacing/`. Shared across all CachedImage reconciliations. + +Blocks new pulls when: +- Active (Pending/Running) Pods ≥ `maxConcurrentNodes` +- Time since last Pod creation < `minDelayBetweenPulls` + +Pods stuck in `ErrImagePull`/`ImagePullBackOff` are excluded from the active count. + +## Pod Builder + +Located in `internal/podbuilder/`. A pure function (`BuildDropPod`) with no k8s client dependency. + +Produces Pods with: +- Labels: `app.kubernetes.io/managed-by=drop`, `drop.corewire.io/cachedimage=<name>`, `drop.corewire.io/node=<node>` +- `command: ["true"]` (no-op, image pull is the side effect) +- `RestartPolicy: Never`, `AutomountServiceAccountToken: false` +- `TerminationGracePeriodSeconds: 0` +- Tolerations + ImagePullSecrets propagated from CachedImage + +## Discovery Sources + +Located in `internal/discovery/`. Implements the `Source` interface: + +```go +type Source interface { + Fetch(ctx context.Context) ([]ImageResult, error) +} +``` + +**PrometheusSource:** Queries Prometheus for container images (requires `image` label in results). Supports instant and range queries. + +**RegistrySource:** Lists tags from an OCI registry via `/v2/<name>/tags/list`. Filters by regex, limits to TopX most recent.
diff --git a/docs/content/docs/developing/conventions.md b/docs/content/docs/developing/conventions.md new file mode 100644 index 0000000..76bf4d6 --- /dev/null +++ b/docs/content/docs/developing/conventions.md @@ -0,0 +1,79 @@ +--- +title: Code Conventions +weight: 6 +description: Naming, patterns, and rules for contributing. +llmsDescription: | + Code conventions for drop. CRDs PascalCase, cluster-scoped. Status uses + metav1.Condition type "Ready". Pod builder is pure function. Pacing in + internal/pacing/ only. Table-driven tests. Import order: stdlib, k8s, project. +--- + +## Naming + +- CRD kinds: PascalCase (`CachedImage`, not `Cached_Image`) +- API group: `drop.corewire.io/v1alpha1` +- Controller files: `<kind>_controller.go` (lowercase) +- Test files: `<kind>_controller_test.go` + +## Status Patterns + +Always use `metav1.Condition` with type `"Ready"`: + +```go +meta.SetStatusCondition(&obj.Status.Conditions, metav1.Condition{ + Type: "Ready", + Status: metav1.ConditionTrue, + Reason: "AllNodesCached", + Message: "Image cached on all target nodes", + ObservedGeneration: obj.Generation, +}) +``` + +Phase progression: `Pending` → `Pulling` → `Ready` (or `Degraded`).
+ +## Error Classification + +Controllers classify errors into condition reasons: +- `DNSError`, `ConnectionRefused`, `Timeout`, `AuthenticationFailed`, `NotFound`, `RateLimited` + +## Pod Construction Rules + +- Always use `podbuilder.BuildDropPod()` — never construct Pods inline +- Pods get labels: `app.kubernetes.io/managed-by=drop`, `drop.corewire.io/cachedimage=<name>`, `drop.corewire.io/node=<node>` +- `RestartPolicy: Never` +- `AutomountServiceAccountToken: false` +- `TerminationGracePeriodSeconds: 0` + +## Import Order + +```go +import ( + // stdlib + "context" + "fmt" + + // k8s / controller-runtime + "sigs.k8s.io/controller-runtime/pkg/client" + + // project + dropv1alpha1 "github.com/Breee/drop/api/v1alpha1" + "github.com/Breee/drop/internal/pacing" +) +``` + +## Test Patterns + +- Table-driven tests preferred +- envtest for controllers (real API server, no kubelet) +- `httptest.NewServer` for discovery source mocks +- No mocking the k8s client directly — use envtest + +## Don'ts + +- Don't add CRI socket access or privileged containers +- Don't put pacing logic outside `internal/pacing/` +- Don't create namespaced CRDs +- Don't manually edit generated files (`zz_generated.deepcopy.go`, `config/crd/bases/`) +- Don't manually edit `llms.txt`, `llms-full.txt`, `.cursorrules`, `AGENTS.md` — run `make docs-gen` +- Don't construct Pods outside of `podbuilder.BuildDropPod()` +- Don't use `client.Mock` — use envtest instead diff --git a/docs/content/docs/developing/debugging.md b/docs/content/docs/developing/debugging.md new file mode 100644 index 0000000..4092cbd --- /dev/null +++ b/docs/content/docs/developing/debugging.md @@ -0,0 +1,105 @@ +--- +title: Debugging +weight: 4 +description: Logs, common issues, pacing diagnostics, and Delve. +llmsDescription: | + Debugging guide for drop. Check operator logs, inspect CachedImage status, + list drop Pods.
Common issues: Pending pods (nodeSelector), ErrImagePull (auth), + stuck Pulling (pacing), Degraded (consecutive failures). Use Delve for local debugging. +--- + +## Operator Logs + +```bash +kubectl logs -n drop-system deploy/drop-controller-manager -f +``` + +The operator logs structured JSON. Look for `"controller"` and `"reconcileID"` fields to trace a specific reconciliation. + +## Inspect a CachedImage + +```bash +kubectl get cachedimage <name> -o yaml +``` + +Key status fields: +- `phase`: Pending → Pulling → Ready (or Degraded) +- `conditions[type=Ready]`: The definitive health signal +- `cachedNodes`: Which nodes have the image +- `nodesTargeted` / `nodesReady`: Progress tracking +- `consecutiveFailures`: Backoff trigger + +## Inspect Drop Pods + +```bash +kubectl get pods -l app.kubernetes.io/managed-by=drop -o wide +``` + +Pods should be `Succeeded` (image pulled) or `Failed` (pull error). Check events for details: + +```bash +kubectl describe pod <pod-name> +``` + +## Common Issues + +| Symptom | Cause | Fix | +|---------|-------|-----| +| Pod stuck `Pending` | Node selector doesn't match any node | Check `nodeSelector` on CachedImage | +| Pod `ErrImagePull` | Wrong image name or missing auth | Check `imagePullSecrets`, verify image ref exists | +| CachedImage stays `Pulling` | Pacing engine throttling | Check PullPolicy `maxConcurrentNodes` / `minDelayBetweenPulls` | +| CachedImage `Degraded` | Consecutive failures exceeded | Check Pod events, increase backoff in PullPolicy | +| DiscoveryPolicy no images | Prometheus query returns empty | Run query manually in Prometheus UI, check for `image` label | +| DiscoveryPolicy `DNSError` | Source endpoint unreachable | Check network policies, DNS, service name | + +## Pacing Engine Diagnostics + +The pacing engine (in `internal/pacing/`) blocks new pulls when: +1. Active (Pending/Running) Pods ≥ `maxConcurrentNodes` +2.
Time since last Pod creation < `minDelayBetweenPulls` + +Pods stuck in `ErrImagePull`/`ImagePullBackOff` are **excluded** from the active count (so they don't block other pulls). + +To check pacing state: +```bash +# List active (non-terminal) drop pods +kubectl get pods -l app.kubernetes.io/managed-by=drop --field-selector=status.phase!=Succeeded,status.phase!=Failed + +# Check the metric +curl -s localhost:8443/metrics | grep drop_active_pulls +``` + +## Delve Debugging + +```bash +# Run the operator locally with delve: +dlv debug ./cmd/ -- --metrics-bind-address=:8443 + +# Or attach to a running process: +dlv attach <pid> +``` + +When running locally, the operator uses your `~/.kube/config` context. + +### Useful breakpoints + +| Location | Why | +|----------|-----| +| `cachedimage_controller.go:Reconcile` | Entry point for the core loop | +| `pacing.go:CanStartPull` | Pacing decision point | +| `builder.go:BuildDropPod` | Pod spec construction | +| `discoverypolicy_controller.go:buildSource` | Source creation | + +## Metrics for Debugging + +```bash +curl -s localhost:8443/metrics | grep drop_ +``` + +| Metric | What it tells you | +|--------|-------------------| +| `drop_active_pulls` | How many Pods are in-flight right now | +| `drop_pull_errors_total` | Which images/nodes are failing | +| `drop_pull_duration_seconds` | How long pulls take | +| `drop_reconcile_total{result="error"}` | Controller errors | +| `drop_discovery_source_health` | Whether sources are reachable | diff --git a/docs/content/docs/developing/extending.md b/docs/content/docs/developing/extending.md new file mode 100644 index 0000000..9869676 --- /dev/null +++ b/docs/content/docs/developing/extending.md @@ -0,0 +1,150 @@ +--- +title: Extending +weight: 5 +description: Step-by-step guide to adding a new CRD. +llmsDescription: | + How to add a new CRD to drop.
Steps: define types in api/v1alpha1/, run make codegen, + write controller in internal/controller/, register in cmd/main.go, add tests (envtest + e2e), + create sample, run make docs-gen. All CRDs must be cluster-scoped. +--- + +## Adding a New CRD + +### 1. Define the types + +Create `api/v1alpha1/<kind>_types.go`: + +```go +package v1alpha1 + +import metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + +// +kubebuilder:object:root=true +// +kubebuilder:resource:scope=Cluster +// +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="Phase",type=string,JSONPath=`.status.phase` +// +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp" +type MyCRD struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + Spec MyCRDSpec `json:"spec,omitempty"` + Status MyCRDStatus `json:"status,omitempty"` +} + +type MyCRDSpec struct { + // +kubebuilder:validation:Required + SomeField string `json:"someField"` +} + +type MyCRDStatus struct { + Phase string `json:"phase,omitempty"` + Conditions []metav1.Condition `json:"conditions,omitempty"` +} + +// +kubebuilder:object:root=true +type MyCRDList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []MyCRD `json:"items"` +} + +func init() { + SchemeBuilder.Register(&MyCRD{}, &MyCRDList{}) +} +``` + +**Rules:** +- Must be cluster-scoped (`+kubebuilder:resource:scope=Cluster`) +- Status must include `[]metav1.Condition` +- Register in `init()` via `SchemeBuilder` + +### 2. Generate code + +```bash +make codegen +``` + +This produces: +- `api/v1alpha1/zz_generated.deepcopy.go` (updated) +- `config/crd/bases/drop.corewire.io_mycrds.yaml` +- RBAC roles in `config/rbac/` + +### 3.
Write the controller + +Create `internal/controller/<kind>_controller.go`: + +```go +package controller + +import ( + "context" + + "k8s.io/apimachinery/pkg/runtime" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + + dropv1alpha1 "github.com/Breee/drop/api/v1alpha1" +) + +type MyCRDReconciler struct { + client.Client + Scheme *runtime.Scheme +} + +// +kubebuilder:rbac:groups=drop.corewire.io,resources=mycrds,verbs=get;list;watch;update;patch +// +kubebuilder:rbac:groups=drop.corewire.io,resources=mycrds/status,verbs=get;update;patch + +func (r *MyCRDReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + log := log.FromContext(ctx) + + var obj dropv1alpha1.MyCRD + if err := r.Get(ctx, req.NamespacedName, &obj); err != nil { + return ctrl.Result{}, client.IgnoreNotFound(err) + } + + log.Info("reconciling", "name", obj.Name) + + // Business logic here + + return ctrl.Result{}, nil +} + +func (r *MyCRDReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&dropv1alpha1.MyCRD{}). + Complete(r) +} +``` + +### 4. Register in cmd/main.go + +```go +if err = (&controller.MyCRDReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), +}).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "MyCRD") + os.Exit(1) +} +``` + +### 5. Add tests + +**Unit test** — `internal/controller/<kind>_controller_test.go`: +- Use envtest suite +- Create the resource, trigger reconciliation, assert status + +**E2E test** — `test/e2e/<kind>-basic/chainsaw-test.yaml`: +- Apply resource, assert expected status/children + +**Sample** — `config/samples/drop_v1alpha1_<kind>.yaml`: +- Minimal valid resource for testing + +### 6. Regenerate docs + +```bash +make docs-gen +``` + +This updates `llms.txt`, `AGENTS.md`, `.cursorrules`, `knowledge.yaml`, and the copilot instructions.
diff --git a/docs/content/docs/developing/releasing.md b/docs/content/docs/developing/releasing.md new file mode 100644 index 0000000..dbc2092 --- /dev/null +++ b/docs/content/docs/developing/releasing.md @@ -0,0 +1,43 @@ +--- +title: Releasing +weight: 7 +description: Tag-triggered CI, multi-arch builds, and Helm OCI publishing. +llmsDescription: | + Release process for drop. Push a semver git tag to trigger CI: lint, test, e2e, + multi-arch Docker build (amd64+arm64) to ghcr.io, Helm chart OCI push, GitHub Release. +--- + +## How to Release + +```bash +git tag v0.1.0 +git push origin v0.1.0 +``` + +That's it. The CI pipeline handles the rest. + +## What CI Does on Tag Push + +1. **Lint** — golangci-lint +2. **Unit tests** — `make test` (envtest) +3. **E2E tests** — Chainsaw on kind +4. **Build multi-arch image** — `linux/amd64` + `linux/arm64` → `ghcr.io/breee/drop:<tag>` +5. **Package Helm chart** — push to OCI registry +6. **GitHub Release** — auto-generated release notes + +## Versioning + +| Format | Example | Use | +|--------|---------|-----| +| Stable | `v0.1.0` | Production release | +| Pre-release | `v0.1.0-rc.1` | Testing before stable | + +Chart version in `charts/drop/Chart.yaml` tracks the app version. + +## CI Workflows + +| Workflow | Trigger | Purpose | +|----------|---------|---------| +| `ci.yml` | Push, PR | Lint + test + build + e2e | +| `release.yml` | Tag push | Multi-arch build + publish | +| `docs.yml` | docs/ changes | Hugo build + GitHub Pages deploy | diff --git a/docs/content/docs/developing/setup.md b/docs/content/docs/developing/setup.md new file mode 100644 index 0000000..d4a6bff --- /dev/null +++ b/docs/content/docs/developing/setup.md @@ -0,0 +1,100 @@ +--- +title: Local Dev Setup +weight: 2 +description: Prerequisites, kind cluster, and Tilt workflow. +llmsDescription: | + Local development setup for drop. Requires Go 1.23+, Docker, kind, Tilt, kubectl, + Helm 3, golangci-lint, chainsaw.
Run tilt up for full dev loop (compile, build, + deploy, port-forward, Hugo docs, e2e infra, dev samples). +--- + +## Prerequisites + +| Tool | Version | Purpose | +|------|---------|---------| +| Go | 1.23+ | Build the operator | +| Docker | any | Build images, run kind | +| kind | any | Local multi-node cluster | +| Tilt | any | Live-reload dev loop | +| kubectl | any | Cluster interaction | +| Helm | 3.x | Chart linting/deployment | +| golangci-lint | latest | Linting | +| chainsaw | latest | E2E tests | + +## Quick Start + +```bash +tilt up +``` + +That's it. Tilt handles everything: + +- Creates kind cluster `drop-dev` (1 control-plane + 2 workers) if it doesn't exist +- Compiles the Go binary +- Builds + loads the Docker image into kind +- Installs CRDs +- Deploys the operator via Helm +- Deploys e2e infrastructure (Prometheus, Registry, Grafana) +- Applies dev samples from `hack/dev-samples.yaml` +- Serves Hugo docs with live-reload +- Sets up port-forwards: + +| Port | Service | +|------|---------| +| 8443 | Operator metrics | +| 8081 | Health probes | +| 9090 | Prometheus | +| 5000 | OCI Registry | +| 3000 | Grafana | +| 1314 | Hugo docs | + +## Build Commands + +```bash +make codegen # regenerate deepcopy + CRD manifests + docs +make generate # deepcopy only +make manifests # CRD + RBAC YAML only +go build ./... # compile +make docker-build # build container image +make docs-gen # regenerate AI docs (llms.txt, AGENTS.md, etc.) 
+``` + +### When to run what + +| Changed… | Run | +|----------|-----| +| `api/v1alpha1/*_types.go` | `make codegen` | +| Any Go code | `go build ./...` | +| Controller RBAC markers | `make manifests` | +| Makefile or types | `make docs-gen` | + +## Useful Make Targets + +```bash +make help # list all targets +make kind-create # create dev cluster (Tilt does this automatically) +make install # apply CRDs to cluster +make e2e-infra # deploy Prometheus + Registry for testing +make helm-lint # lint the Helm chart +make lint # golangci-lint +make codegen # full code generation +make docs-gen # regenerate AI-friendly docs +``` + +## Without Tilt + +If you prefer not to use Tilt: + +```bash +# Create cluster +make kind-create + +# Install CRDs +make install + +# Run operator locally (uses ~/.kube/config) +go run ./cmd/ --metrics-bind-address=:8443 + +# Apply dev samples +kubectl apply -f hack/dev-samples.yaml +``` diff --git a/docs/content/docs/developing/testing.md b/docs/content/docs/developing/testing.md new file mode 100644 index 0000000..1c6bd49 --- /dev/null +++ b/docs/content/docs/developing/testing.md @@ -0,0 +1,93 @@ +--- +title: Testing +weight: 3 +description: Unit tests with envtest, E2E with Chainsaw, and test patterns. +llmsDescription: | + Testing guide for drop. Unit tests use controller-runtime envtest (real API server, + no kubelet). E2E uses Kyverno Chainsaw on kind. Table-driven tests preferred. + Discovery tests mock HTTP servers. Controller tests use real k8s client. +--- + +## Unit Tests (envtest) + +```bash +make test +``` + +Uses controller-runtime's `envtest` — a real API server + etcd, no kubelet. +Coverage report lands in `cover.out`. 
+ +### Test Locations + +| Path | What it tests | +|------|---------------| +| `internal/controller/*_test.go` | Controller reconciliation logic | +| `internal/pacing/*_test.go` | Pacing engine constraints | +| `internal/podbuilder/*_test.go` | Pod construction correctness | +| `internal/discovery/*_test.go` | Source implementations | + +## E2E Tests (Chainsaw) + +```bash +make test-e2e +``` + +Requires a running kind cluster with the operator deployed (Tilt handles this). +Tests live in `test/e2e/` and use [Kyverno Chainsaw](https://kyverno.github.io/chainsaw/). + +Each test scenario is a directory with `chainsaw-test.yaml` defining steps: +1. Apply a resource +2. Assert expected state (status, child resources, events) +3. Cleanup + +## Writing Tests + +### Table-driven (preferred) + +```go +func TestSomething(t *testing.T) { + tests := []struct { + name string + // inputs + // expected outputs + }{ + {name: "happy path", ...}, + {name: "error case", ...}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // arrange, act, assert + }) + } +} +``` + +### Controller tests (envtest) + +```go +var k8sClient client.Client +var testEnv *envtest.Environment +// Setup in TestMain or BeforeSuite +``` + +Create resources with the real client, trigger reconciliation, assert status changes. + +### Discovery tests (mock HTTP) + +```go +srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // Return mock Prometheus/Registry response +})) +defer srv.Close() + +source := &PrometheusSource{Endpoint: srv.URL, ...} +results, err := source.Fetch(ctx) +``` + +## Adding a New Test + +1. Create `*_test.go` next to the code being tested +2. Use table-driven format with descriptive case names +3. For controllers: create the CRD resource, reconcile, assert status +4. For discovery: mock the HTTP endpoint, call Fetch, assert results +5. 
Run `make test` to validate diff --git a/docs/content/docs/discovery.md b/docs/content/docs/discovery.md new file mode 100644 index 0000000..ab4fd45 --- /dev/null +++ b/docs/content/docs/discovery.md @@ -0,0 +1,112 @@ +--- +title: Discovery +weight: 3 +aliases: + - /drop/docs/discovery/ +description: Automatic image discovery with DiscoveryPolicy. +llmsDescription: | + DiscoveryPolicy CRD enables automatic image discovery from Prometheus metrics + or OCI registries. Referenced by CachedImageSet via discoveryPolicyRef. + Discovered images are materialized as CachedImage resources. Supports + filtering, deduplication, and periodic re-discovery. +--- + +The DiscoveryPolicy CRD enables automatic image discovery from external sources. When referenced by a CachedImageSet, discovered images are automatically materialized as CachedImage resources. + +## How It Works + +``` +DiscoveryPolicy → queries sources → writes to status.discoveredImages + ↓ +CachedImageSet → reads discoveredImages → creates/deletes CachedImage children +``` + +1. The DiscoveryPolicy reconciler queries all configured sources at the specified interval +2. Results are normalized to `{image, score}` pairs, merged, deduplicated, filtered, and sorted by score +3. Top-X results are written to `status.discoveredImages` +4. The CachedImageSet reconciler watches DiscoveryPolicy status changes +5. It diffs the desired images against existing CachedImage children +6. New CachedImages are created; orphaned ones are deleted via ownerReference GC + +## Prometheus Source + +### Query Contract + +Your Prometheus query **must** return an `image` label. The metric value becomes the ranking score (higher = more important). 
+ +**Example:** Find the 30 most-used images in a namespace: + +```promql +count(container_memory_working_set_bytes{ + container!="", + container!="POD", + namespace="build-stuff" +}) by (image) +``` + +### Full Example + +```yaml +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: popular-build-images +spec: + interval: 1h + topX: 30 + imageFilter: "^(?!.*ecr\\..*amazonaws\\.com).*$" # Exclude ECR images + sources: + - type: prometheus + prometheus: + endpoint: https://mimir.example.com + query: | + count(container_memory_working_set_bytes{ + container!="", container!="POD", + namespace="build-stuff", cluster="mycluster" + }) by (image) + secretRef: + name: prometheus-creds +--- +apiVersion: v1 +kind: Secret +metadata: + name: prometheus-creds + namespace: drop-system +type: Opaque +stringData: + username: admin + password: my-prometheus-password +``` + +## Registry Source + +### Use Case: GitLab Runner Helper Images + +The registry source uses OCI Distribution API tag listing. Combined with `imageTemplate`, it handles complex tag patterns like GitLab Runner helpers: + +```yaml +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: gitlab-helpers +spec: + interval: 6h + topX: 10 + sources: + - type: registry + registry: + url: https://registry.gitlab.com + repositories: + - gitlab-org/gitlab-runner/gitlab-runner-helper + tagFilter: "^v\\d+\\.\\d+\\.\\d+$" + topX: 5 + imageTemplate: "registry.gitlab.com/{{ .Repository }}:x86_64-{{ .Tag }}" +``` + +This replaces the legacy bash script that curled the GitLab API and constructed image refs manually. 
+ +## Error Handling + +- On transient failures, the operator keeps the **last known good** discovery results +- Source health is tracked via conditions on the DiscoveryPolicy status +- Each source is queried independently — one failing source doesn't block others diff --git a/docs/content/docs/for-ai-agents.md b/docs/content/docs/for-ai-agents.md new file mode 100644 index 0000000..5f1c0e7 --- /dev/null +++ b/docs/content/docs/for-ai-agents.md @@ -0,0 +1,122 @@ +--- +title: For AI Agents +weight: 7 +description: How to consume Drop docs as an AI agent or integrate with LLMs. +llmsDescription: | + Machine-readable documentation endpoints for drop. llms.txt at site root + lists all pages with summaries. llms-full.txt has complete CRD reference in + one file. Every page available as clean Markdown at {url}index.md. Link + alternate headers in HTML. Context menu has Open in ChatGPT/Claude links. + All docs generated from source code via make docs-gen. +--- + +## Endpoints + +| URL | Content | Use case | +|-----|---------|----------| +| [`/drop/llms.txt`](/drop/llms.txt) | Page index with one-line summaries | Discover what's available | +| [`/drop/llms-full.txt`](/drop/llms-full.txt) | Complete CRD reference, all fields | One GET = full project context | +| `{any-page}/index.md` | Clean Markdown (no HTML, no frontmatter) | Fetch individual pages | + +## How It Works + +All documentation is generated from one source of truth: + +```mermaid +flowchart TD + subgraph Source["Source of Truth"] + Types["api/v1alpha1/*_types.go"] + Ctrl["internal/controller/*.go"] + Metrics["internal/metrics/metrics.go"] + end + + Types --> Gen["make docs-gen"] + Ctrl --> Gen + Metrics --> Gen + + Gen --> LLMs["llms.txt
(page index)"] + Gen --> Full["llms-full.txt
(complete reference)"] + Gen --> Agents["AGENTS.md / .cursorrules
(IDE agent instructions)"] + Gen --> Hugo["Hugo pages
(HTML + Markdown)"] + + Hugo --> HTML["Human reads HTML"] + Hugo --> MD["Agent fetches index.md"] + LLMs --> RAG["RAG pipeline indexes"] + Full --> Chat["ChatGPT / Claude ingests"] +``` + +Three audiences, same facts: + +| Audience | What they consume | +|----------|-------------------| +| **USE agents** (ChatGPT, Claude, RAG) | `llms.txt`, `llms-full.txt`, `{page}/index.md` | +| **CODE agents** (Copilot, Cursor) | `.github/copilot-instructions.md`, `.cursorrules`, `AGENTS.md` | +| **Humans** | This Hugo site (HTML with search, nav, diagrams) | + +## Markdown Output + +Every page on this site is available as clean Markdown. Append `index.md` to any URL: + +``` +https://your-site.io/drop/docs/install/ → HTML +https://your-site.io/drop/docs/install/index.md → Markdown +``` + +The HTML head includes a `<link rel="alternate">` tag pointing to the Markdown variant: + +```html +<link rel="alternate" type="text/markdown" href="https://your-site.io/drop/docs/install/index.md" /> +``` + +## llms.txt + +Auto-generated by Hextra from page frontmatter. Lists every page with its `llmsDescription`: + +``` +# Drop Operator +> Kubernetes operator that caches container images on cluster nodes. + +## Documentation +- [Installation](http://...): Install via Helm. Requires K8s 1.28+... +- [Usage](http://...): CachedImage, CachedImageSet, PullPolicy examples... +... +``` + +## llms-full.txt + +Static file with the complete CRD field reference — every field, type, default, enum, and status condition in one document. Suitable for: +- Pasting into ChatGPT/Claude as project context +- RAG indexing +- Agent tools that accept a URL to read + +## IDE Agent Instructions + +Files in the repo root that IDE agents auto-discover: + +| File | Agent | +|------|-------| +| `.github/copilot-instructions.md` | GitHub Copilot | +| `.cursorrules` | Cursor | +| `AGENTS.md` | Codex, Devin, generic agents | + +All generated from the same source. Contains: build commands, conventions, CRD→controller mapping, don'ts. 
+ +## Context Menu + +Every doc page has a context menu (top-right) with: +- **Copy as Markdown** — copies the page content +- **Open in ChatGPT** — opens ChatGPT with the Markdown URL pre-loaded +- **Open in Claude** — opens Claude with the Markdown URL pre-loaded + +## Generating Docs + +```bash +make docs-gen # regenerate everything from source +``` + +This runs `go run ./hack/gen-ai-docs/` which: +1. Parses Go types, controller code, metrics registration +2. Builds a `knowledge.yaml` intermediate representation +3. Renders templates for all output formats + +Adding a new output format = adding one template to `hack/gen-ai-docs/templates.go`. diff --git a/docs/content/docs/getting-started.md b/docs/content/docs/getting-started.md new file mode 100644 index 0000000..63c3f13 --- /dev/null +++ b/docs/content/docs/getting-started.md @@ -0,0 +1,89 @@ +--- +title: Getting Started +weight: 2 +description: Install and configure the drop operator. +llmsDescription: | + Installation guide for the drop operator. Prerequisites: Kubernetes 1.28+. + Install via Helm chart (charts/drop/). Create CachedImage or CachedImageSet + resources to start caching images. Operator watches for these resources and + creates short-lived Pods on target nodes to pull images via kubelet. 
+--- + +## Prerequisites + +- Kubernetes 1.28+ +- Helm 3.12+ +- cert-manager (optional, for secure metrics) + +## Installation + +### Via Helm (recommended) + +```bash +helm install drop oci://ghcr.io/breee/charts/drop \ + --namespace drop-system \ + --create-namespace +``` + +### With ServiceMonitor enabled + +```bash +helm install drop oci://ghcr.io/breee/charts/drop \ + --namespace drop-system \ + --create-namespace \ + --set serviceMonitor.enabled=true \ + --set certManager.enabled=true +``` + +## Your First CachedImage + +```yaml +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: nginx-latest +spec: + image: docker.io/library/nginx:latest + imagePullPolicy: Always +``` + +Apply it: + +```bash +kubectl apply -f cachedimage.yaml +kubectl get cachedimages +``` + +## Adding Pacing + +Create a PullPolicy to control how fast images are distributed: + +```yaml +apiVersion: drop.corewire.io/v1alpha1 +kind: PullPolicy +metadata: + name: conservative +spec: + maxConcurrentNodes: 2 + minDelayBetweenPulls: 30s + failureBackoff: { initial: 30s, max: 5m } +``` + +Reference it from your CachedImage: + +```yaml +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: nginx-latest +spec: + image: docker.io/library/nginx:latest + policyRef: + name: conservative +``` + +## Next Steps + +- [CRD Reference](../reference/_generated_crds/) — full field documentation +- [Discovery](../discovery/) — automatic image discovery +- [Observability](../observability/) — metrics and monitoring diff --git a/docs/content/docs/install.md b/docs/content/docs/install.md new file mode 100644 index 0000000..90182bb --- /dev/null +++ b/docs/content/docs/install.md @@ -0,0 +1,43 @@ +--- +title: Installation +weight: 1 +aliases: + - /drop/docs/getting-started/ +description: Install the drop operator. +llmsDescription: | + Installation guide for the drop operator. Prerequisites: Kubernetes 1.28+, + Helm 3.12+. Install via Helm chart from ghcr.io/breee/charts/drop. 
+ Optional: cert-manager for secure metrics, ServiceMonitor for Prometheus. +--- + +## Prerequisites + +- Kubernetes 1.28+ +- Helm 3.12+ +- cert-manager (optional, for secure metrics) + +## Helm Install + +```bash +helm install drop oci://ghcr.io/breee/charts/drop \ + --namespace drop-system \ + --create-namespace +``` + +### With Prometheus ServiceMonitor + +```bash +helm install drop oci://ghcr.io/breee/charts/drop \ + --namespace drop-system \ + --create-namespace \ + --set serviceMonitor.enabled=true \ + --set certManager.enabled=true +``` + +## Verify + +```bash +kubectl -n drop-system get pods +``` + +The operator Pod should be running and ready. diff --git a/docs/content/docs/kamera.md b/docs/content/docs/kamera.md new file mode 100644 index 0000000..ae9640d --- /dev/null +++ b/docs/content/docs/kamera.md @@ -0,0 +1,42 @@ +--- +title: Kamera Integration +weight: 5 +description: Simulation-based controller verification with Kamera. +llmsDescription: | + Kamera integration for simulation-based verification of drop controllers. + Uses deterministic simulation to test controller behaviour without a real + cluster. Catches race conditions and edge cases in reconciliation logic. +--- + +[Kamera](https://github.com/tgoodwin/Kamera) uses simulation to verify Kubernetes controller logic without running a real cluster. + +## Evaluation Status + +**Decision: Evaluate after MVP is stable.** + +### Rationale + +1. **Current coverage is sufficient for MVP**: Unit tests (pod builder, pacing, discovery) + envtest integration tests + Chainsaw E2E tests provide high confidence. +2. **Kamera adds value for complex state transitions**: Once we have production experience with edge cases (node churn during pulls, policy changes mid-rollout), Kamera can help verify invariants that are hard to test deterministically. +3. **Low priority vs. feature work**: The operator needs to be deployed and battle-tested first. 
+ +### Planned Use Cases (Post-MVP) + +| Scenario | Invariant to Verify | +|----------|-------------------| +| Node removed during pull | No orphaned Pods, status eventually consistent | +| PullPolicy changed mid-rollout | New pacing applied without restarting in-flight pulls | +| DiscoveryPolicy source failure | Last known good set preserved, no cache thrashing | +| Concurrent CachedImage updates | No duplicate Pods per node | + +### Integration Plan + +1. Add `kamera` build tag to reconciler tests +2. Define state machine model for CachedImage lifecycle +3. Run simulation sweeps in CI nightly (not on every PR — too slow) +4. Compare failure modes found vs. existing test coverage + +### References + +- [Kamera GitHub](https://github.com/tgoodwin/Kamera) +- [The New Stack article](https://thenewstack.io/kamera-uses-simulation-to-verify-kubernetes-controller-logic/) diff --git a/docs/content/docs/monitoring.md b/docs/content/docs/monitoring.md new file mode 100644 index 0000000..8036c7b --- /dev/null +++ b/docs/content/docs/monitoring.md @@ -0,0 +1,78 @@ +--- +title: Monitoring +weight: 4 +aliases: + - /drop/docs/observability/ +description: Prometheus metrics, events, and health checks. +llmsDescription: | + Monitoring for drop: Prometheus metrics (drop_images_cached_total, + drop_pull_errors_total, drop_pull_duration_seconds, etc.), Kubernetes + events on CachedImage/CachedImageSet, and metav1.Condition status with + type Ready. ServiceMonitor included for Prometheus Operator integration. 
+--- + +## Prometheus Metrics + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `drop_images_cached_total` | Counter | `image`, `node` | Total images successfully cached | +| `drop_pull_duration_seconds` | Histogram | `image` | Duration of pull operations | +| `drop_pull_errors_total` | Counter | `image`, `node` | Total failed pull attempts | +| `drop_discovery_images_found` | Gauge | `policy`, `source_type` | Images found per discovery source | +| `drop_active_pulls` | Gauge | — | Currently active pull Pods | +| `drop_reconcile_total` | Counter | `controller`, `result` | Reconciliation attempts | + +### Enable ServiceMonitor + +```bash +helm install drop oci://ghcr.io/breee/charts/drop \ + --set serviceMonitor.enabled=true +``` + +### Example Queries + +```promql +# Pull success rate +rate(drop_images_cached_total[1h]) + +# p95 pull duration +histogram_quantile(0.95, rate(drop_pull_duration_seconds_bucket[1h])) + +# Error rate by image +rate(drop_pull_errors_total[1h]) + +# Active pulls right now +drop_active_pulls +``` + +## Kubernetes Events + +| Reason | Type | Description | +|--------|------|-------------| +| `PullStarted` | Normal | Image pull Pod created on a node | +| `PullSucceeded` | Normal | Image successfully cached on a node | +| `PullFailed` | Warning | Image pull failed on a node | + +```bash +kubectl get events --field-selector involvedObject.kind=CachedImage +``` + +## Status Conditions + +All resources use `metav1.Condition` with type `Ready`: + +```yaml +status: + conditions: + - type: Ready + status: "True" + reason: Cached + message: "Image cached on all 5 target nodes" +``` + +## Health Endpoints + +| Endpoint | Port | Description | +|----------|------|-------------| +| `/healthz` | 8081 | Liveness probe | +| `/readyz` | 8081 | Readiness probe | diff --git a/docs/content/docs/observability.md b/docs/content/docs/observability.md new file mode 100644 index 0000000..9dd34a9 --- /dev/null +++ 
b/docs/content/docs/observability.md @@ -0,0 +1,85 @@ +--- +title: Observability +weight: 4 +description: Monitoring the drop operator with Prometheus and Kubernetes events. +llmsDescription: | + Observability for drop: Prometheus metrics (drop_images_cached_total, + drop_pull_errors_total, drop_pull_duration_seconds, etc.), Kubernetes + events on CachedImage/CachedImageSet, and metav1.Condition status with + type Ready. ServiceMonitor included for Prometheus Operator integration. +--- + +The drop operator provides comprehensive observability through Prometheus metrics, Kubernetes events, and status conditions. + +## Prometheus Metrics + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `drop_images_cached_total` | Counter | `image`, `node` | Total images successfully cached | +| `drop_pull_duration_seconds` | Histogram | `image` | Duration of pull operations | +| `drop_pull_errors_total` | Counter | `image`, `node` | Total failed pull attempts | +| `drop_discovery_images_found` | Gauge | `policy`, `source_type` | Images found per discovery source | +| `drop_active_pulls` | Gauge | — | Currently active pull Pods | +| `drop_reconcile_total` | Counter | `controller`, `result` | Reconciliation attempts | + +### Enabling Metrics + +Metrics are enabled by default on port 8443 with secure serving. 
To scrape with Prometheus Operator: + +```bash +helm install drop oci://ghcr.io/breee/charts/drop \ + --set serviceMonitor.enabled=true +``` + +### Example Grafana Queries + +```promql +# Pull success rate over last hour +rate(drop_images_cached_total[1h]) + +# p95 pull duration +histogram_quantile(0.95, rate(drop_pull_duration_seconds_bucket[1h])) + +# Error rate by image +rate(drop_pull_errors_total[1h]) + +# Active pulls right now +drop_active_pulls +``` + +## Kubernetes Events + +The operator emits events on CachedImage resources: + +| Event | Type | Reason | Description | +|-------|------|--------|-------------| +| Pull started | Normal | `PullStarted` | Image pull Pod created on a node | +| Pull succeeded | Normal | `PullSucceeded` | Image successfully cached on a node | +| Pull failed | Warning | `PullFailed` | Image pull failed on a node | + +View events: + +```bash +kubectl get events --field-selector involvedObject.kind=CachedImage +``` + +## Status Conditions + +All resources maintain standard Kubernetes conditions: + +```yaml +status: + conditions: + - type: Ready + status: "True" + reason: Cached + message: "Image cached on all 5 target nodes" + lastTransitionTime: "2024-01-15T10:30:00Z" +``` + +## Health Endpoints + +| Endpoint | Port | Description | +|----------|------|-------------| +| `/healthz` | 8081 | Liveness probe | +| `/readyz` | 8081 | Readiness probe | diff --git a/docs/content/docs/reference/_generated_architecture.md b/docs/content/docs/reference/_generated_architecture.md new file mode 100644 index 0000000..b5b1667 --- /dev/null +++ b/docs/content/docs/reference/_generated_architecture.md @@ -0,0 +1,69 @@ +--- +# Generated by make docs-gen — DO NOT EDIT +title: Architecture +weight: 4 +aliases: + - /drop/docs/reference/architecture/ +description: Internal architecture and package dependency graph. +llmsDescription: | + Package dependency graph and CRD ownership relationships for the drop + operator. 
Shows how controllers, pacing engine, pod builder, and discovery + packages relate. Useful for understanding code navigation and import paths. +--- + +## CRD Relationships + +```mermaid +graph TD + CachedImageSet -->|owns| CachedImage + CachedImage -->|creates| Pod + CachedImage -->|references| PullPolicy + CachedImageSet -->|references| PullPolicy + CachedImageSet -->|references| DiscoveryPolicy + DiscoveryPolicy -->|feeds| CachedImageSet +``` + +## Package Dependencies + +```mermaid +graph LR + cmd/main.go --> internal/controller + internal/controller --> api/v1alpha1 + internal/controller --> internal/discovery + internal/controller --> internal/metrics + internal/controller --> internal/pacing + internal/controller --> internal/podbuilder + internal/pacing --> api/v1alpha1 + internal/pacing --> internal/podbuilder + internal/podbuilder --> api/v1alpha1 +``` + +## Reconciler → CRD Mapping + +| CRD | Controller | Dependencies | +|-----|-----------|--------------| +| CachedImage | `internal/controller/cachedimage_controller.go` | podbuilder, pacing, metrics | +| CachedImageSet | `internal/controller/cachedimageset_controller.go` | podbuilder, pacing, metrics | +| PullPolicy | (config-only) | | +| DiscoveryPolicy | `internal/controller/discoverypolicy_controller.go` | podbuilder, pacing, metrics | + +## Pull Mechanism + +```mermaid +sequenceDiagram + participant CR as CachedImage + participant Ctrl as Controller + participant Pace as Pacing Engine + participant K8s as Kubernetes API + participant Node as Kubelet + + CR->>Ctrl: Reconcile triggered + Ctrl->>Pace: Request pull slot + Pace-->>Ctrl: Slot granted + Ctrl->>K8s: Create Pod (nodeName=target) + K8s->>Node: Schedule Pod + Node->>Node: Pull image (kubelet) + Node-->>K8s: Pod succeeds + K8s-->>Ctrl: Watch event + Ctrl->>CR: Update status (Ready) +``` diff --git a/docs/content/docs/reference/_generated_crds.md b/docs/content/docs/reference/_generated_crds.md new file mode 100644 index 0000000..f8bcf65 --- 
/dev/null +++ b/docs/content/docs/reference/_generated_crds.md @@ -0,0 +1,223 @@ +--- +# Generated by make docs-gen — DO NOT EDIT +title: CRD Reference +weight: 1 +aliases: + - /drop/docs/reference/crds/ +description: Custom Resource Definition reference for the drop operator. +llmsDescription: | + Complete CRD field reference for drop.corewire.io/v1alpha1. All resources + are cluster-scoped. Covers CachedImage, CachedImageSet, PullPolicy, and + DiscoveryPolicy with every spec/status field, types, defaults, and validation. +--- + +All resources are cluster-scoped under `drop.corewire.io/v1alpha1`. + +## Quick Example + +```yaml +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: nginx +spec: + image: docker.io/library/nginx + tag: latest + nodeSelector: + kubernetes.io/arch: amd64 +``` + +## CachedImage + +CachedImage is the Schema for the cachedimages API. + +**Controller:** `internal/controller/cachedimage_controller.go` + +### Spec + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `image` | `string` | Yes | — | Image is the fully qualified image reference (registry/repository). | +| `tag` | `string` | No | — | Tag to pull. Mutually exclusive with Digest. | +| `digest` | `string` | No | — | Digest to pull (immutable reference). Mutually exclusive with Tag. | +| `imagePullPolicy` | `corev1.PullPolicy` | No | Always | ImagePullPolicy controls when kubelet pulls the image. Defaults to Always (checks upstream digest, only downloads if changed). Set to IfNotPresent to skip the registry check when the tag already exists locally. (`Always` | `IfNotPresent` | `Never`) | +| `imagePullSecrets` | `[]corev1.LocalObjectReference` | No | — | ImagePullSecrets are references to secrets for pulling from private registries. | +| `nodeSelector` | `map[string]string` | No | — | NodeSelector restricts which nodes to cache the image on. 
| +| `tolerations` | `[]corev1.Toleration` | No | — | Tolerations allow targeting tainted nodes. | +| `priority` | `*int32` | No | — | Priority is a pull ordering hint (lower values pulled first). | +| `policyRef` | `*PolicyReference` | No | — | PolicyRef references a PullPolicy for pacing controls. | + +### Status + +| Field | Type | Description | +|-------|------|-------------| +| `observedGeneration` | `int64` | ObservedGeneration is the last generation reconciled. | +| `phase` | `string` | Phase summarizes the overall state. | +| `ready` | `string` | Ready is a human-readable "nodesReady/nodesTargeted" fraction for display. | +| `resolvedDigest` | `string` | ResolvedDigest is the sha256 digest of the image as reported by the container runtime after pull. | +| `nodesTargeted` | `int32` | NodesTargeted is the number of nodes that should have this image. | +| `nodesReady` | `int32` | NodesReady is the number of nodes that have successfully pulled the image. | +| `cachedNodes` | `[]string` | CachedNodes is the list of node names that have successfully cached the image. | +| `consecutiveFailures` | `int32` | ConsecutiveFailures counts sequential reconcile failures for backoff calculation. | +| `lastPulledAt` | `*metav1.Time` | LastPulledAt is the timestamp of the most recent successful pull. | +| `lastAttemptedAt` | `*metav1.Time` | LastAttemptedAt is the timestamp of the most recent pull attempt (success or failure). | +| `conditions` | `[]metav1.Condition` | Conditions represent the latest available observations. | + +--- + +## CachedImageSet + +CachedImageSet is the Schema for the cachedimagesets API. + +**Controller:** `internal/controller/cachedimageset_controller.go` + +### Spec + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `policyRef` | `*PolicyReference` | No | — | PolicyRef references a PullPolicy for pacing controls. 
| +| `discoveryPolicyRef` | `*DiscoveryPolicyReference` | No | — | DiscoveryPolicyRef references a DiscoveryPolicy for dynamic image lists. | +| `imagePullPolicy` | `corev1.PullPolicy` | No | Always | ImagePullPolicy controls when kubelet pulls the image (propagated to children). (`Always` | `IfNotPresent` | `Never`) | +| `imagePullSecrets` | `[]corev1.LocalObjectReference` | No | — | ImagePullSecrets are references to secrets for pulling from private registries (propagated to children). | +| `nodeSelector` | `map[string]string` | No | — | NodeSelector restricts which nodes to cache images on (propagated to children). | +| `tolerations` | `[]corev1.Toleration` | No | — | Tolerations allow targeting tainted nodes (propagated to children). | +| `images` | `[]ImageEntry` | No | — | Images is a static list of images to cache. | + +### Status + +| Field | Type | Description | +|-------|------|-------------| +| `observedGeneration` | `int64` | ObservedGeneration is the last generation reconciled. | +| `phase` | `string` | Phase summarizes the overall state. | +| `imagesManaged` | `int32` | ImagesManaged is the number of CachedImage children managed by this set. | +| `imagesReady` | `int32` | ImagesReady is the number of children in Ready phase. | +| `conditions` | `[]metav1.Condition` | Conditions represent the latest available observations. | + +--- + +## PullPolicy + +PullPolicy is the Schema for the pullpolicies API. It is a configuration-only resource with no status. + +### Spec + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `maxConcurrentNodes` | `int32` | No | 1 | MaxConcurrentNodes is the max nodes pulling simultaneously for this policy. | +| `minDelayBetweenPulls` | `metav1.Duration` | No | 10s | MinDelayBetweenPulls is the minimum time between starting pulls on different nodes. | +| `failureBackoff` | `*BackoffConfig` | No | — | FailureBackoff configures retry delays on pull failures. 
| +| `repullInterval` | `*metav1.Duration` | No | — | RepullInterval is how often to re-pull cached images. Zero or unset means never re-pull. | +| `nodeSelector` | `map[string]string` | No | — | NodeSelector scopes this policy to a specific node pool. | +| `tolerations` | `[]corev1.Toleration` | No | — | Tolerations match tainted nodes in the pool. | + +--- + +## DiscoveryPolicy + +DiscoveryPolicy is the Schema for the discoverypolicies API. + +**Controller:** `internal/controller/discoverypolicy_controller.go` + +### Spec + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `sources` | `[]DiscoverySource` | Yes | — | Sources is the list of discovery backends to query. | +| `imageFilter` | `string` | No | — | ImageFilter is a regex to filter discovered images. | +| `syncInterval` | `metav1.Duration` | No | 30m | SyncInterval is how often to re-query sources. | +| `maxImages` | `int32` | No | 50 | MaxImages caps the number of discovered images. | + +### Status + +| Field | Type | Description | +|-------|------|-------------| +| `lastSyncTime` | `*metav1.Time` | LastSyncTime is the timestamp of the last successful sync. | +| `discoveredImages` | `[]DiscoveredImage` | DiscoveredImages is the list of discovered images from all sources. | +| `imageCount` | `int32` | ImageCount is the number of discovered images. | +| `sourceCount` | `int32` | SourceCount is the number of configured sources. | +| `conditions` | `[]metav1.Condition` | Conditions represent the latest available observations. | + +--- + + +## Helper Types + +### PolicyReference + +PolicyReference is a reference to a PullPolicy resource. + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `name` | `string` | Yes | — | Name of the PullPolicy resource. | + +### DiscoveryPolicyReference + +DiscoveryPolicyReference is a reference to a DiscoveryPolicy resource. 
+ +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `name` | `string` | Yes | — | Name of the DiscoveryPolicy resource. | + +### ImageEntry + +ImageEntry defines a single image to include in a set. + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `image` | `string` | Yes | — | Image is the fully qualified image reference (registry/repository). | +| `tag` | `string` | No | — | Tag to pull. | +| `digest` | `string` | No | — | Digest to pull. | + +### BackoffConfig + +BackoffConfig defines retry backoff behavior. + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `initial` | `metav1.Duration` | No | 30s | Initial delay before first retry. | +| `max` | `metav1.Duration` | No | 5m | Max delay cap for exponential backoff. | + +### DiscoverySource + +DiscoverySource defines a single discovery backend. + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `type` | `string` | Yes | — | Type identifies the backend. | +| `prometheus` | `*PrometheusSource` | No | — | Prometheus config (when type=prometheus). | +| `registry` | `*RegistrySource` | No | — | Registry config (when type=registry). | +| `secretRef` | `*corev1.LocalObjectReference` | No | — | SecretRef references a Secret for auth/TLS for this source. | + +### PrometheusSource + +PrometheusSource defines Prometheus query configuration. + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `endpoint` | `string` | Yes | — | Endpoint is the Prometheus API URL. | +| `query` | `string` | Yes | — | Query is the PromQL query that must return an 'image' label. | +| `lookback` | `*metav1.Duration` | No | — | Lookback is the time window to aggregate over (e.g. "7d", "24h"). 
When set, uses query_range and sums values to rank by total usage. When unset, uses an instant query (point-in-time). | +| `step` | `string` | No | 5m | Step is the query resolution step for range queries. | + +### RegistrySource + +RegistrySource defines OCI registry tag listing configuration. + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `url` | `string` | Yes | — | URL is the registry base URL. | +| `repositories` | `[]string` | Yes | — | Repositories is the list of repositories to query. | +| `tagFilter` | `string` | No | — | TagFilter is a regex to filter tags. | +| `topX` | `int32` | No | — | TopX limits the number of tags to fetch per repository. | +| `imageTemplate` | `string` | No | — | ImageTemplate is a Go text/template for constructing the full image reference. Available variables: .Registry, .Repository, .Tag | + +### DiscoveredImage + +DiscoveredImage represents a single discovered image with metadata. + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `image` | `string` | Yes | — | Image is the fully qualified image reference. | +| `score` | `int64` | Yes | — | Score is the ranking score from the source (higher = more relevant). | +| `source` | `string` | Yes | — | Source identifies which discovery source produced this image. | + diff --git a/docs/content/docs/reference/_generated_errors.md b/docs/content/docs/reference/_generated_errors.md new file mode 100644 index 0000000..a513eaa --- /dev/null +++ b/docs/content/docs/reference/_generated_errors.md @@ -0,0 +1,66 @@ +--- +# Generated by make docs-gen — DO NOT EDIT +title: Status & Errors +weight: 2 +aliases: + - /drop/docs/reference/errors/ +description: Status conditions, reasons, and troubleshooting for drop CRDs. +llmsDescription: | + Every metav1.Condition reason emitted by drop controllers. Lookup table + maps reason codes to controller, meaning, and fix. 
Use this to diagnose + why a CachedImage, CachedImageSet, or DiscoveryPolicy is not Ready. +--- + +All drop CRDs use `metav1.Condition` with type **"Ready"**. The `.reason` field indicates the specific state. + +## Quick Lookup + +| Reason | Controller | Meaning | How to Fix | +|--------|-----------|---------|------------| +| **Cached** | CachedImage | All target nodes have the image cached | — | +| **Degraded** | CachedImageSet | Some child CachedImages have failures | Check individual CachedImage statuses | +| **ErrImagePull** | CachedImage | Registry unreachable or image does not exist | Verify registry DNS, image name, tag. Check network policies | +| **ImagePullBackOff** | CachedImage | Repeated pull failures, kubelet is backing off | Check imagePullSecrets, registry auth. Verify image exists | +| **InProgress** | CachedImage | Image pulls are actively running on some nodes | — | +| **InvalidImageName** | CachedImage | The image reference is malformed | Check spec.image format: registry/repository | +| **PartiallyFailed** | DiscoveryPolicy | Some discovery sources failed to sync | Check source endpoints and credentials | +| **PodFailed** | CachedImage | Drop Pod failed for a non-image-pull reason | Check node health, resource limits, Pod security policies | +| **Progressing** | CachedImageSet | Children are still being pulled | — | +| **PullFailed** | CachedImage | One or more nodes failed to pull the image | Check image name, tag, registry connectivity, imagePullSecrets | +| **Ready** | CachedImageSet | All child CachedImages are ready | — | +| **RegistryUnavailable** | CachedImage | Cannot connect to the container registry | Check registry URL, DNS, firewall rules | +| **SourceError** | DiscoveryPolicy | One or more discovery sources returned errors | Check source configuration and connectivity | +| **SyncFailed** | DiscoveryPolicy | All discovery sources failed | Check all source endpoints, credentials, network | +| **Synced** | DiscoveryPolicy | All 
sources synced successfully | — | + +## By Controller + +### CachedImage + +| Reason | Meaning | +|--------|---------| +| **Cached** | All target nodes have the image cached | +| **ErrImagePull** | Registry unreachable or image does not exist | +| **ImagePullBackOff** | Repeated pull failures, kubelet is backing off | +| **InProgress** | Image pulls are actively running on some nodes | +| **InvalidImageName** | The image reference is malformed | +| **PodFailed** | Drop Pod failed for a non-image-pull reason | +| **PullFailed** | One or more nodes failed to pull the image | +| **RegistryUnavailable** | Cannot connect to the container registry | + +### CachedImageSet + +| Reason | Meaning | +|--------|---------| +| **Degraded** | Some child CachedImages have failures | +| **Progressing** | Children are still being pulled | +| **Ready** | All child CachedImages are ready | + +### DiscoveryPolicy + +| Reason | Meaning | +|--------|---------| +| **PartiallyFailed** | Some discovery sources failed to sync | +| **SourceError** | One or more discovery sources returned errors | +| **SyncFailed** | All discovery sources failed | +| **Synced** | All sources synced successfully | diff --git a/docs/content/docs/reference/_generated_metrics.md b/docs/content/docs/reference/_generated_metrics.md new file mode 100644 index 0000000..b160b99 --- /dev/null +++ b/docs/content/docs/reference/_generated_metrics.md @@ -0,0 +1,41 @@ +--- +# Generated by make docs-gen — DO NOT EDIT +title: Metrics +weight: 3 +aliases: + - /drop/docs/reference/metrics/ +description: Prometheus metrics exposed by the drop operator. +llmsDescription: | + All Prometheus metrics registered by the drop operator. Includes metric + name, type (counter/gauge/histogram), and description. Also provides + example PromQL queries for monitoring image cache coverage and pull errors. 
+--- + +The drop operator exposes the following metrics: + +| Metric | Type | Description | +|--------|------|-------------| +| `drop_images_cached_total` | counter | Total number of images successfully cached on nodes. | +| `drop_pull_duration_seconds` | histogram | Duration of image pull operations in seconds. | +| `drop_pull_errors_total` | counter | Total number of failed image pull attempts. | +| `drop_discovery_images_found` | gauge | Number of images found by a discovery policy. | +| `drop_active_pulls` | gauge | Current number of active image pull Pods. | +| `drop_reconcile_total` | counter | Total number of reconciliation attempts. | +| `drop_discovery_source_health` | gauge | Whether a discovery source is reachable and queryable (1=healthy, 0=unhealthy). | +| `drop_discovery_source_latency_seconds` | histogram | Latency of discovery source queries in seconds. | + +## Useful Queries + +```promql +# Images cached per node +sum by (node) (drop_images_cached_total) + +# Pull error rate +rate(drop_pull_errors_total[5m]) + +# Average pull duration +histogram_quantile(0.95, rate(drop_pull_duration_seconds_bucket[10m])) + +# Discovery coverage +drop_discovery_images_found +``` diff --git a/docs/content/docs/reference/_index.md b/docs/content/docs/reference/_index.md new file mode 100644 index 0000000..6465458 --- /dev/null +++ b/docs/content/docs/reference/_index.md @@ -0,0 +1,9 @@ +--- +title: Reference +weight: 5 +description: Generated API and architecture reference. +llmsDescription: | + Auto-generated reference section for drop. Includes CRD field reference, + status conditions and error catalog, Prometheus metrics, and architecture + diagrams. All content generated from source code via make docs-gen. +--- diff --git a/docs/content/docs/usage.md b/docs/content/docs/usage.md new file mode 100644 index 0000000..c799270 --- /dev/null +++ b/docs/content/docs/usage.md @@ -0,0 +1,103 @@ +--- +title: Usage +weight: 2 +description: Create and manage cached images. 
+llmsDescription: | + Usage guide for drop CRDs. Create CachedImage to cache a single image, + CachedImageSet for multiple images, PullPolicy for rate limiting. Examples + with YAML manifests for each resource type. +--- + +## Cache a Single Image + +```yaml +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: nginx +spec: + image: docker.io/library/nginx + tag: latest +``` + +```bash +kubectl apply -f cachedimage.yaml +kubectl get cachedimages +``` + +## Target Specific Nodes + +```yaml +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: nginx-amd64 +spec: + image: docker.io/library/nginx + tag: latest + nodeSelector: + kubernetes.io/arch: amd64 +``` + +## Add Pacing + +Create a PullPolicy to control pull rate: + +```yaml +apiVersion: drop.corewire.io/v1alpha1 +kind: PullPolicy +metadata: + name: conservative +spec: + maxConcurrentNodes: 2 + minDelayBetweenPulls: 30s + failureBackoff: 5m +``` + +Reference it: + +```yaml +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: nginx +spec: + image: docker.io/library/nginx + tag: latest + policyRef: + name: conservative +``` + +## Cache Multiple Images + +```yaml +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImageSet +metadata: + name: platform-images +spec: + policyRef: + name: conservative + images: + - image: docker.io/library/nginx + tag: "1.27" + - image: docker.io/library/redis + tag: "7" + - image: gcr.io/distroless/static-debian12 + tag: latest +``` + +## Check Status + +```bash +# Overview +kubectl get cachedimages + +# Detailed status +kubectl describe cachedimage nginx + +# Watch progress +kubectl get cachedimages -w +``` + +A CachedImage is Ready when all targeted nodes have the image cached. 
diff --git a/docs/content/proof-of-operation.md b/docs/content/proof-of-operation.md new file mode 100644 index 0000000..07d2588 --- /dev/null +++ b/docs/content/proof-of-operation.md @@ -0,0 +1,307 @@ +# Drop Operator — Proof of Operation + +This document shows the expected output from `hack/prove-operator.sh`, demonstrating that the operator correctly manages image caching across Kubernetes nodes. + +## How to Run + +```bash +./hack/prove-operator.sh 2>&1 | tee proof-run.log +``` + +Prerequisites: `kind`, `kubectl`, `helm`, `docker`, `jq` + +--- + +## Expected Output (Annotated) + +### Phase 1: Environment Setup + +``` +════════════════════════════════════════════════════════════════ + PHASE 1: Environment Setup +════════════════════════════════════════════════════════════════ + +── 1.1 Create 3-node Kind cluster (1 control-plane + 2 workers) ── + +[✓] 3-node kind cluster created +[proof] Nodes: +NAME STATUS ROLES AGE VERSION +drop-proof-control-plane Ready control-plane 30s v1.31.0 +drop-proof-worker Ready <none> 20s v1.31.0 +drop-proof-worker2 Ready <none> 20s v1.31.0 + +── 1.3 Install CRDs ── + +[✓] CRDs installed +[proof] Registered CRDs: +cachedimages.drop.corewire.io 2024-01-01T00:00:00Z +cachedimagesets.drop.corewire.io 2024-01-01T00:00:00Z +discoverypolicies.drop.corewire.io 2024-01-01T00:00:00Z +pullpolicies.drop.corewire.io 2024-01-01T00:00:00Z + +── 1.4 Deploy operator via Helm ── + +[✓] Operator running +[proof] Operator pod: +NAME READY STATUS NODE +drop-6f8b9d4c7-x2k9l 1/1 Running drop-proof-control-plane +``` + +**What this proves:** The operator deploys correctly, CRDs are registered in the `drop.corewire.io` API group, and it runs as a single replica. 
+ +--- + +### Phase 2: PullPolicy + +``` +════════════════════════════════════════════════════════════════ + PHASE 2: PullPolicy — Pacing Controls +════════════════════════════════════════════════════════════════ + +[✓] PullPolicy 'conservative' created +[proof] PullPolicy details: +spec: + failureBackoff: 30s + maxConcurrentNodes: 1 + minDelayBetweenPulls: 5s +``` + +**What this proves:** PullPolicy is a standalone cluster-scoped resource controlling pacing without being embedded in image specs. + +--- + +### Phase 3: CachedImage — Single Image Pull + +``` +════════════════════════════════════════════════════════════════ + PHASE 3: CachedImage — Single Image Pull +════════════════════════════════════════════════════════════════ + +── 3.2 Observe reconciliation (drop Pods created per node) ── + +[✓] Drop pods created (2 found) +[proof] Drop Pods (one per targeted node): +NAMESPACE NAME READY STATUS NODE +default drop-nginx-proof-abc12 0/1 Pending drop-proof-worker +default drop-nginx-proof-def34 0/1 Pending drop-proof-worker2 + +── 3.3 Verify Pod spec ── + + Image: docker.io/library/nginx:1.25-alpine + Command: ["true"] + NodeName: drop-proof-worker + PullPolicy: IfNotPresent + Privileged: not set (non-privileged) +[✓] Pod spec matches design: short-lived, non-privileged, command=['true'], placed on specific node + +── 3.4 Wait for image pull to complete ── + +[proof] Phase transition: → Pending (nodesReady=0/2) +[proof] Phase transition: Pending → Pulling (nodesReady=0/2) +[proof] Phase transition: Pulling → Ready (nodesReady=2/2) +[✓] All nodes have the image cached! 
+ +── 3.5 Final CachedImage status ── + +NAME IMAGE PHASE READY TARGET AGE +nginx-proof docker.io/library/nginx Ready 2 2 45s + +{ + "observedGeneration": 1, + "phase": "Ready", + "nodesTargeted": 2, + "nodesReady": 2, + "lastPulledAt": "2026-05-22T14:00:30Z", + "conditions": [ + { + "type": "Ready", + "status": "True", + "reason": "Cached", + "message": "Image cached on 2/2 target nodes" + } + ] +} +``` + +**What this proves:** +1. The reconciler creates one Pod per target node (2 workers = 2 Pods) +2. Pods use `command: ["true"]` — they exit immediately, the image pull is a side-effect of kubelet scheduling +3. Pods are non-privileged, no CRI socket mounting needed +4. Status transitions correctly: Pending → Pulling → Ready +5. Status tracks per-node completion with nodesReady/nodesTargeted + +--- + +### Phase 4: Pacing Enforcement + +``` +════════════════════════════════════════════════════════════════ + PHASE 4: Pacing Enforcement +════════════════════════════════════════════════════════════════ + +── 4.1 Verify maxConcurrentNodes=1 was enforced ── + +[proof] With maxConcurrentNodes=1, only 1 drop Pod should run at a time across nodes. +``` + +**What this proves:** The pacing engine enforces sequential rollout. With `maxConcurrentNodes: 1`, the operator creates Pods one-at-a-time rather than blasting all nodes simultaneously. 
+ +--- + +### Phase 5: CachedImageSet + +``` +════════════════════════════════════════════════════════════════ + PHASE 5: CachedImageSet — Multi-Image Management +════════════════════════════════════════════════════════════════ + +── 5.2 Verify child CachedImage resources are auto-created ── + +[proof] Child CachedImages owned by 'proof-set': +NAME IMAGE PHASE READY TARGET +proof-set-alpine-3-19 docker.io/library/alpine Pulling 0 2 +proof-set-redis-7-alpine docker.io/library/redis Pending 0 2 +proof-set-memcached-1-6-alpine docker.io/library/memcached Pending 0 2 + +── 5.3 Check owner references ── + +[proof] OwnerReferences on child 'proof-set-alpine-3-19': +[ + { + "apiVersion": "drop.corewire.io/v1alpha1", + "kind": "CachedImageSet", + "name": "proof-set", + "uid": "abc123-...", + "controller": true, + "blockOwnerDeletion": true + } +] +[✓] OwnerReference points to CachedImageSet — Kubernetes GC will clean up on delete + +── 5.4 Wait for set completion ── + +[proof] ImageSet progress: 1/3 children Ready +[proof] ImageSet progress: 2/3 children Ready +[proof] ImageSet progress: 3/3 children Ready +[✓] All images in set are cached! +``` + +**What this proves:** +1. CachedImageSet auto-creates individual CachedImage resources (one per image in the list) +2. Each child has an ownerReference pointing to the parent set +3. Kubernetes GC will automatically delete children when the set is deleted +4. 
The set reconciler delegates actual pulling to the CachedImage reconciler (single-concern) + +--- + +### Phase 6: Node Targeting + +``` +════════════════════════════════════════════════════════════════ + PHASE 6: Node Targeting (nodeSelector + tolerations) +════════════════════════════════════════════════════════════════ + +[✓] Labeled drop-proof-worker with pool=gpu + +NAME IMAGE PHASE READY TARGET AGE +gpu-only docker.io/library/python Ready 1 1 15s + +[proof] nodesTargeted=1 (expected: 1, only the labeled worker) +[✓] Node targeting works — only 1 node targeted (the gpu-labeled worker) +``` + +**What this proves:** `nodeSelector` correctly restricts the image pull to only matching nodes. The operator doesn't create drop Pods on non-matching nodes. + +--- + +### Phase 7: Metrics + +``` +════════════════════════════════════════════════════════════════ + PHASE 7: Observability — Metrics +════════════════════════════════════════════════════════════════ + +[proof] Custom drop metrics: +drop_active_pulls 0 +drop_discovery_images_found{policy="...",source_type="..."} 0 +drop_images_cached_total{image="docker.io/library/nginx",node="drop-proof-worker"} 1 +drop_images_cached_total{image="docker.io/library/nginx",node="drop-proof-worker2"} 1 +drop_images_cached_total{image="docker.io/library/busybox",node="drop-proof-worker"} 1 +drop_pull_duration_seconds_bucket{image="docker.io/library/nginx",le="1"} 0 +drop_pull_duration_seconds_bucket{image="docker.io/library/nginx",le="2"} 1 +drop_pull_errors_total{image="...",node="..."} 0 +drop_reconcile_total{controller="cachedimage",result="success"} 12 +drop_reconcile_total{controller="cachedimageset",result="success"} 4 + +[✓] Metrics endpoint responds with custom drop_* metrics +``` + +**What this proves:** +1. All 6 custom metrics are registered and exposed +2. `drop_images_cached_total` increments per image+node combination +3. `drop_pull_duration_seconds` tracks actual pull durations +4. 
`drop_reconcile_total` counts reconciliation cycles per controller +5. Metrics are Prometheus-scrapeable via the metrics Service + ServiceMonitor + +--- + +### Phase 9: Cleanup Verification + +``` +════════════════════════════════════════════════════════════════ + PHASE 9: Cleanup Verification +════════════════════════════════════════════════════════════════ + +[proof] Waiting for child CachedImages to be garbage collected... +[proof] Remaining children after set deletion: 0 +[✓] Cascading garbage collection works — all children deleted +``` + +**What this proves:** Kubernetes ownerReference-based garbage collection works correctly. Deleting a CachedImageSet cascades deletion to all child CachedImage resources. + +--- + +## Architecture Proof Points + +| Concern | How It's Proven | +|---------|----------------| +| Pull mechanism | Pods with `command: ["true"]` — kubelet pulls image as scheduling side-effect | +| Non-disruptive | No cordoning, no drain, no node unavailability — just lightweight Pods | +| Pacing | `maxConcurrentNodes=1` → sequential Pod creation (not parallel blast) | +| Node targeting | `nodeSelector` → only matching nodes get drop Pods | +| GC chain | ownerRefs → delete parent = delete all children automatically | +| Status tracking | phase transitions + nodesReady/nodesTargeted counters | +| Observability | 6 custom Prometheus metrics + Kubernetes events | +| Single concern | CachedImageSet manages children, CachedImage manages Pods, PullPolicy defines pacing | + +--- + +## Operator Reconciliation Flow (Proven by Script) + +``` +User creates CachedImage spec + │ + ▼ +┌─────────────────────┐ +│ CachedImage │ +│ Reconciler │ +│ │ +│ 1. List target nodes│ ←── nodeSelector filter +│ 2. Fetch PullPolicy │ ←── pacing params +│ 3. List owned Pods │ +│ 4. For each node: │ +│ - Check pacing │ ←── maxConcurrentNodes +│ - Create Pod │ ←── podbuilder.BuildDropPod() +│ 5. Track completion │ +│ 6. 
Update status │ +└─────────────────────┘ + │ + ▼ + Pod on node-1: + image: nginx:1.25-alpine + command: ["true"] + nodeName: worker-1 + │ + ▼ + kubelet pulls image → Pod succeeds → nodesReady++ +``` diff --git a/docs/decisions/01-operator-tooling.md b/docs/decisions/01-operator-tooling.md new file mode 100644 index 0000000..260219e --- /dev/null +++ b/docs/decisions/01-operator-tooling.md @@ -0,0 +1,16 @@ +# Feature: Operator Tooling (Go + modern framework) + +## Decision +- Language: **Go** +- Framework: **Kubebuilder + controller-runtime** (current mainstream for Kubernetes operators) + +## Why +- Strong compatibility with Kubernetes APIs and CRD workflows +- Mature scaffolding and testing patterns +- Clear migration path for future operator complexity + +## Initial scaffold plan +1. Initialize project with Kubebuilder and Go modules. +2. Create API group/version: `drop.corewire.io/v1alpha1`. +3. Scaffold `CachedImage`, `CachedImageSet`, `PullPolicy`, and `DiscoveryPolicy` APIs/controllers. +4. Enable leader election and health probes by default. diff --git a/docs/decisions/02-release-automation.md b/docs/decisions/02-release-automation.md new file mode 100644 index 0000000..7cdf5a0 --- /dev/null +++ b/docs/decisions/02-release-automation.md @@ -0,0 +1,17 @@ +# Feature: Automated Releases + +## Goal +Provide automated, repeatable releases similar to the `Breee/kubeswitch` release style. + +## Plan +- Trigger release workflow on version tags. +- Generate changelog from conventional commits/PR metadata. +- Publish: + - GitHub Release notes + assets + - Helm chart artifacts + - Container images to GHCR +- Sign/provenance support can be added as a hardening step. + +## CI/CD checkpoints +- Validate tests and lint before release job starts. +- Block publish on failed e2e tests. 
diff --git a/docs/decisions/03-testing-kind-chainsaw.md b/docs/decisions/03-testing-kind-chainsaw.md new file mode 100644 index 0000000..63510d9 --- /dev/null +++ b/docs/decisions/03-testing-kind-chainsaw.md @@ -0,0 +1,17 @@ +# Feature: E2E Testing (kind + Kyverno Chainsaw) + +## Goal +Run realistic operator scenarios in ephemeral Kubernetes clusters. + +## Stack +- **kind** for ephemeral cluster lifecycle in CI +- **Kyverno Chainsaw** for scenario-based Kubernetes workflow tests + +## Planned scenarios +- Static `CachedImage` reconciliation and status updates +- Pull policy/repull policy behavior for moving tags +- Node selector and toleration scheduling behavior +- `CachedImageSet` managing child `CachedImage` resources +- `DiscoveryPolicy` producing expected top-X discovered images +- Failure/backoff and condition reporting +- Cleanup/GC via ownerReference cascade diff --git a/docs/decisions/04-docs-hugo-hextra.md b/docs/decisions/04-docs-hugo-hextra.md new file mode 100644 index 0000000..193181f --- /dev/null +++ b/docs/decisions/04-docs-hugo-hextra.md @@ -0,0 +1,14 @@ +# Feature: Automated Docs (Hugo Hextra) + +## Goal +Use Hugo + Hextra to generate and publish operator documentation automatically. + +## Plan +- Keep docs source in repository under a docs tree. +- Build docs with Hugo Hextra in CI. +- Publish docs site automatically from main branch/tag releases. +- Include versioned docs sections when release cadence requires it. 
+ +## Requirements +- Fast local preview command +- Broken-link checks in CI diff --git a/docs/decisions/06-helm-and-images.md b/docs/decisions/06-helm-and-images.md new file mode 100644 index 0000000..0d1e947 --- /dev/null +++ b/docs/decisions/06-helm-and-images.md @@ -0,0 +1,15 @@ +# Feature: Helm Chart + Multi-Arch Images + +## Helm plan +- Provide a simple chart with defaults for: + - operator deployment + - RBAC/service account + - metrics endpoint/service monitor (optional) +- Package chart in CI and publish as release artifact. + +## Image plan +- Build and push to GitHub Container Registry (GHCR). +- Target architectures: + - `linux/amd64` + - `linux/arm64` +- Publish multi-platform manifest tags per release. diff --git a/docs/decisions/07-dev-tooling.md b/docs/decisions/07-dev-tooling.md new file mode 100644 index 0000000..a78d1d6 --- /dev/null +++ b/docs/decisions/07-dev-tooling.md @@ -0,0 +1,17 @@ +# Feature: Developer Tooling + +## Goal +Keep local development "splendid" with fast feedback and low setup friction. + +## Tooling baseline +- `make`/`task` commands for common workflows +- `golangci-lint` for static checks +- unit/integration/e2e test targets +- local kind bootstrap command +- pre-commit hooks for formatting and quick validation + +## Suggested DX commands +- `make test` +- `make test-e2e` +- `make run` +- `make docs-serve` diff --git a/docs/decisions/08-advanced-debugging-kamera.md b/docs/decisions/08-advanced-debugging-kamera.md new file mode 100644 index 0000000..2656e65 --- /dev/null +++ b/docs/decisions/08-advanced-debugging-kamera.md @@ -0,0 +1,17 @@ +# Feature: Advanced Debugging with Kamera + +## Goal +Evaluate simulation-based verification for controller logic. + +## Inputs +- https://github.com/tgoodwin/Kamera +- https://thenewstack.io/kamera-uses-simulation-to-verify-kubernetes-controller-logic/ + +## Plan +1. Create a small proof-of-concept for one reconciliation path. +2. 
Compare confidence/coverage with existing unit/integration tests. +3. Decide whether to adopt Kamera for regression suites. + +## Exit criteria +- Clear recommendation: adopt now, adopt later, or decline. +- Documented tradeoffs (maintenance cost, learning curve, CI runtime impact). diff --git a/docs/decisions/09-crd-reference.md b/docs/decisions/09-crd-reference.md new file mode 100644 index 0000000..8e709b1 --- /dev/null +++ b/docs/decisions/09-crd-reference.md @@ -0,0 +1,111 @@ +# Feature: CRD Reference and Pull-Rate Safety + +## Goal +Make CRD settings explicit so users can predict pull behavior and avoid containerd overload. + +## `CachedImage` (`drop.corewire.io/v1alpha1`) — Cluster-scoped + +### Spec fields +- `image` (string, required) + - Repository/image name to cache on nodes. +- `tag` (string, optional) + - Tag to use. Prefer pinned versions for reproducibility. +- `digest` (string, optional) + - Immutable digest (preferred over moving tags where possible). +- `pullPolicy` (`IfNotPresent` | `Always`) + - Initial pull behavior. + - `IfNotPresent`: pull only when image is missing on node. + - `Always`: force remote check/pull on each reconcile pull attempt. +- `repullPolicy` (`Never` | `OnSchedule` | `Always`) + - Controls refresh after first successful pull. + - `Never`: do not refresh unless spec changes. + - `OnSchedule`: refresh only on discovery/sync interval boundaries. + - `Always`: refresh every reconcile cycle (use carefully). +- `nodeSelector` (map, optional) + - Restricts target nodes. +- `tolerations` (list, optional) + - Allows targeting tainted nodes. +- `priority` (int, optional) + - Pull ordering hint (lower first or higher first, implementation-defined but documented). +- `policyRef` (object, optional) + - Reference to a `PullPolicy` resource for pacing controls. + +### Status fields +- `phase`, `conditions`, `lastPulledAt`, `nodesTargeted`, `nodesReady`, `observedGeneration`. 
+ +## `CachedImageSet` (`drop.corewire.io/v1alpha1`) — Cluster-scoped + +### Spec fields +- `policyRef` (object, optional) — reference to a `PullPolicy`. +- `discoveryPolicyRef` (object, optional) — reference to a `DiscoveryPolicy`. +- `nodeSelector` (map, optional) — target nodes for all images in the set. +- `tolerations` (list, optional) — tolerate taints on target nodes. +- `images` (list, optional) — static list of images (each with `image`, `tag`/`digest`). +- `pullPolicy` — default for child `CachedImage` resources. +- `repullPolicy` — default for child `CachedImage` resources. + +### Status fields +- `phase`, `imagesManaged`, `imagesReady`, `observedGeneration`, `conditions`. + +## `PullPolicy` (`drop.corewire.io/v1alpha1`) — Cluster-scoped + +### Spec fields +- `maxConcurrentNodes` (int) — max nodes pulling simultaneously. +- `minDelayBetweenPulls` (duration) — minimum spacing between pull starts. +- `failureBackoff` (object) — `initial` and `max` retry delays. +- `repullPolicyDefault` (string) — default repull behavior for referencing images. +- `nodeSelector` (map, optional) — scope policy to a node pool. +- `tolerations` (list, optional) — match tainted nodes in pool. + +## `DiscoveryPolicy` (`drop.corewire.io/v1alpha1`) — Cluster-scoped + +Extensible design: `sources` is a list supporting multiple backend types. New source types can be added without schema changes. + +### Spec fields +- `sources` (list) — discovery backends, each with: + - `type` (string) — source type identifier (`prometheus`, `registry`, future: `graphite`, `datadog`, `webhook`, `argocd`). + - `prometheus` (object, when type=prometheus) — `endpoint`, `query`, `interval`. + - `registry` (object, when type=registry) — `url`, `repositories` (list), `tagFilter`, `topX`. + - `secretRef` (object, optional) — reference to a k8s Secret for auth/TLS/headers for this source. + - Well-known Secret keys: `token`, `username`, `password`, `ca.crt`, `tls.crt`, `tls.key`, `headers.<name>`. 
+- `imageFilter` (object) — regex pattern to filter discovered images. +- `syncInterval` (duration) — how often to reconcile discovered images. +- `maxImages` (int) — cap on number of discovered images. + +### Status fields +- `lastSyncTime`, `discoveredImages`, `conditions`. + +## Slow-pull safety model +To avoid "10 images at once" behavior, operator logic should enforce: + +1. **Policy-driven global pacing** + - `PullPolicy` caps concurrent pull work across nodes via `maxConcurrentNodes`. +2. **Rate limiting between pulls** + - Enforce minimum spacing (`minDelayBetweenPulls`) between pull launches. +3. **Backoff + jitter** + - On failures, retry with exponential backoff and jitter. +4. **Policy-based refresh** + - Moving tags (`latest`) should be controlled via `repullPolicy`, not uncontrolled constant pulls. + +## Non-disruptive pull guarantee +Image pulls **never** affect node schedulability. The operator does not cordon, drain, or mark nodes as unavailable during pulls. Pulls are a background operation with no impact on workload scheduling. The operator may also place images on nodes before they are marked Ready (e.g. during node bootstrap). + +## Parallel pull workers: simplified model +No separate `concurrency` setting is needed. + +- `runtime parallelism`: container runtimes (containerd/cri) already download image layers concurrently for a single image pull. +- `design choice`: no per-image parallel worker field needed because it duplicates runtime behavior and adds tuning complexity. + +Operator pacing focuses on cluster-safe controls: +- limit how many nodes pull at once (`maxConcurrentNodes`), +- add spacing or backoff between pull starts (`minDelayBetweenPulls`, `failureBackoff`). + +## Recommended safe defaults +```yaml +pullPolicy: IfNotPresent +repullPolicy: OnSchedule +``` + +These defaults prioritize node stability over fastest pull completion. 
+ +See `docs/decisions/10-policy-redesign-proposals.md` for the policy design rationale and `docs/decisions/12-naming-structure-proposals.md` for the naming decision. diff --git a/docs/decisions/10-policy-redesign-proposals.md b/docs/decisions/10-policy-redesign-proposals.md new file mode 100644 index 0000000..4684b29 --- /dev/null +++ b/docs/decisions/10-policy-redesign-proposals.md @@ -0,0 +1,70 @@ +# Feature: Pull Policy Design (Simplified) + +## Problem statement +`CachedImage` describes *what* to cache, but cluster stability depends on *how fast* pulling happens across many nodes. +Putting all pacing controls on `CachedImage` is not enough for large clusters. + +## Design: Split intent and execution policy + +### APIs (all cluster-scoped) +- `CachedImage`: image intent only (image/tag/digest/selectors/priority). +- `CachedImageSet`: group of images with shared config and optional discovery. +- `PullPolicy`: shared execution policy applied to many `CachedImage`/`CachedImageSet` resources. +- `DiscoveryPolicy`: separate resource for dynamic image discovery (Prometheus, registry). + +### `PullPolicy` fields +- `maxConcurrentNodes`: max nodes pulling at once cluster-wide. +- `minDelayBetweenPulls`: spacing between pull starts per node. +- `failureBackoff`: retry backoff config. +- `repullPolicyDefault`: default behavior for moving tags. +- `nodeSelector` (map, optional): bind this policy to a specific node pool. +- `tolerations` (list, optional): allow targeting tainted nodes in the pool. + +`maxConcurrentNodes` controls active pull throughput — how many nodes can be pulling simultaneously. + +### Non-disruptive pull guarantee +Image pulls **never** affect node schedulability. The operator does not cordon, drain, or mark nodes as unavailable during pulls. Pulls are a background operation that has no impact on workload scheduling. The operator may also place images on nodes before they are marked Ready (e.g. during node bootstrap). 
+ +### Per-pool policy binding +Each `PullPolicy` can carry `nodeSelector`/`tolerations` to scope it to a node pool. This enables heterogeneous clusters (build, GPU, burst pools) to have independent pacing without a separate CRD kind. + +### Why +- Clear separation of concerns. +- One place to tune rollout safety for entire cluster. +- Easier ops: update one policy instead of many image objects. +- Avoids redundant per-image worker tuning when runtimes already parallelize layer pulls. + +## Parallel pull worker semantics +- A single image pull already performs concurrent layer downloads in containerd/cri. +- Additional operator-level parallel workers on one node would run multiple image pull tasks at once. +- For v1 planning, prefer **no dedicated per-image `concurrency` field**; keep pacing in `PullPolicy` with node rollout and delay controls. + +## Scope note +No migration path is needed at this stage because implementation has not started. + +## Example +```yaml +apiVersion: drop.corewire.io/v1alpha1 +kind: PullPolicy +metadata: + name: safe-default +spec: + maxConcurrentNodes: 2 + minDelayBetweenPulls: 30s + failureBackoff: + initial: 15s + max: 10m + repullPolicyDefault: OnSchedule +--- +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: gitlab-runner-helper +spec: + image: gitlab/gitlab-runner-helper + tag: v17.0.0 + nodeSelector: + node-role.kubernetes.io/ci: "true" + policyRef: + name: safe-default +``` diff --git a/docs/decisions/11-example-scenarios.md b/docs/decisions/11-example-scenarios.md new file mode 100644 index 0000000..ce99c27 --- /dev/null +++ b/docs/decisions/11-example-scenarios.md @@ -0,0 +1,201 @@ +# Feature: Example CR Scenarios + +## Goal +Define concrete Custom Resource examples that demonstrate real operator behavior ("write the code you wish to have"). All resources use the decided naming: `CachedImage`, `CachedImageSet`, `PullPolicy`, `DiscoveryPolicy`. 
+ +--- + +## Scenario 1: Pull two images onto build nodes, one at a time + +Pull `image-a` and `image-b` onto all nodes with taint `node-role.kubernetes.io/build`, pacing to maximum one image pulling at a time across the pool. + +```yaml +apiVersion: drop.corewire.io/v1alpha1 +kind: PullPolicy +metadata: + name: build-pool-safe +spec: + maxConcurrentNodes: 1 # only 1 node pulls at a time + minDelayBetweenPulls: 20s # 20s pause between pull starts + failureBackoff: + initial: 10s + max: 5m + nodeSelector: + node-role.kubernetes.io/build: "true" + tolerations: + - key: "node-role.kubernetes.io/build" + operator: "Exists" + effect: "NoSchedule" +--- +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImageSet +metadata: + name: build-essentials +spec: + policyRef: + name: build-pool-safe + nodeSelector: + node-role.kubernetes.io/build: "true" + tolerations: + - key: "node-role.kubernetes.io/build" + operator: "Exists" + effect: "NoSchedule" + images: + - image: registry.example.com/team/image-a + tag: "1.2.3" + - image: registry.example.com/team/image-b + tag: "4.5.6" + pullPolicy: IfNotPresent + repullPolicy: Never +``` + +**Operator behavior:** +1. Reconciler sees `CachedImageSet` "build-essentials" bound to `build-pool-safe`. +2. Operator creates child `CachedImage` resources for image-a and image-b (owned via ownerReferences). +3. Policy limits pulling to 1 node at a time with 20s spacing. +4. Operator picks `image-a` first (by priority or alphabetical), pulls it onto node-1, waits 20s, pulls onto node-2, etc. +5. Once `image-a` is complete on all targeted nodes, moves to `image-b` and repeats. +6. At no point are two images or two nodes pulling simultaneously. + +--- + +## Scenario 2: GPU pool with relaxed pacing + +GPU nodes have fast storage and network; allow 3 nodes to pull at once. 
+ +```yaml +apiVersion: drop.corewire.io/v1alpha1 +kind: PullPolicy +metadata: + name: gpu-pool-fast +spec: + maxConcurrentNodes: 3 + minDelayBetweenPulls: 5s + failureBackoff: + initial: 5s + max: 2m + nodeSelector: + gpu: "true" + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" +--- +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: cuda-base +spec: + image: nvcr.io/nvidia/cuda + tag: "12.4.0-runtime-ubuntu22.04" + pullPolicy: IfNotPresent + repullPolicy: Never + policyRef: + name: gpu-pool-fast + nodeSelector: + gpu: "true" + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" +``` + +**Operator behavior:** +1. Up to 3 GPU nodes pull `cuda-base` concurrently. +2. 5s delay between each new node starting its pull. +3. If a pull fails, backs off starting at 5s up to 2m. + +--- + +## Scenario 3: Prometheus-driven discovery for dynamic images + +Automatically discover the top 5 most-used images matching `image-c*` via a Prometheus query, then cache them onto build nodes using the safe policy. 
+ +```yaml +apiVersion: drop.corewire.io/v1alpha1 +kind: PullPolicy +metadata: + name: build-pool-safe +spec: + maxConcurrentNodes: 1 + minDelayBetweenPulls: 20s + failureBackoff: + initial: 10s + max: 5m + nodeSelector: + node-role.kubernetes.io/build: "true" + tolerations: + - key: "node-role.kubernetes.io/build" + operator: "Exists" + effect: "NoSchedule" +--- +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: discover-image-c +spec: + sources: + - type: prometheus + prometheus: + endpoint: http://prometheus.monitoring.svc:9090 + query: | + topk(5, + count by (image) ( + kube_pod_container_info{image=~"registry.example.com/team/image-c.*"} + ) + ) + interval: 1h + secretRef: + name: prometheus-creds # optional: Secret with token/username/password/ca.crt + imageFilter: + pattern: "registry.example.com/team/image-c.*" + syncInterval: 30m + maxImages: 5 +--- +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImageSet +metadata: + name: popular-ci-images +spec: + policyRef: + name: build-pool-safe + discoveryPolicyRef: + name: discover-image-c + nodeSelector: + node-role.kubernetes.io/build: "true" + tolerations: + - key: "node-role.kubernetes.io/build" + operator: "Exists" + effect: "NoSchedule" + pullPolicy: IfNotPresent + repullPolicy: OnSchedule +``` + +**Operator behavior:** +1. `DiscoveryPolicy` reconciler executes the Prometheus query every 30 minutes. +2. Query returns top 5 images matching `image-c*` by pod usage count. +3. `CachedImageSet` reconciler reads discovered images from the referenced `DiscoveryPolicy` status. +4. Operator materializes/updates up to 5 child `CachedImage` resources (owned by the set). +5. Each child `CachedImage` inherits `policyRef: build-pool-safe`, so pulls respect one-node-at-a-time pacing. +6. If an image drops out of the top 5, its `CachedImage` is garbage-collected on the next sync. 
+ +--- + +## Design notes + +### Per-pool policy binding +`PullPolicy` carries `nodeSelector` and `tolerations` to bind it to a specific node pool. This allows heterogeneous clusters to have different pacing per pool: +- Slow/safe policy for large CI build pools. +- Fast/relaxed policy for GPU or burst pools with better I/O. +- Default cluster-wide policy for general workloads. + +Multiple policies can coexist; each `CachedImage`/`CachedImageSet` references the appropriate policy via `policyRef`. + +### Ordering within a policy +When multiple `CachedImage` resources share the same policy, the operator processes them sequentially by default (one image fully rolled out before starting the next). A `priority` field on `CachedImage` controls ordering. + +### Moving tags +For images using moving tags (e.g. `latest`), set `repullPolicy: OnSchedule` on the `CachedImage` or let the policy default apply. The operator re-checks on each sync interval. + +### Cluster scope +All resources (`CachedImage`, `CachedImageSet`, `PullPolicy`, `DiscoveryPolicy`) are cluster-scoped because they operate on nodes, which are themselves cluster-scoped resources. diff --git a/docs/decisions/12-naming-structure-proposals.md b/docs/decisions/12-naming-structure-proposals.md new file mode 100644 index 0000000..8834cf7 --- /dev/null +++ b/docs/decisions/12-naming-structure-proposals.md @@ -0,0 +1,228 @@ +# CRD Naming and Structure — Decision + +## Chosen: `CachedImage` + `CachedImageSet` + `PullPolicy` + `DiscoveryPolicy` + +Decision: Proposal C. "Cached" describes the desired state (image is cached on nodes), which is idiomatic for Kubernetes declarative specs. All resources are **cluster-scoped** since they target nodes (which are cluster-scoped). + +--- + +## Design principles + +1. **Single concern per CRD** — separate "what to cache", "how fast to pull", and "how to discover". +2. **Singular nouns** for Kind names. +3. 
**Owner references** — `CachedImageSet` owns child `CachedImage` resources for lifecycle/GC. +4. **API group carries context** — within `drop.corewire.io`, names don't need to repeat "pull" or "pre-pull". +5. **Cluster-scoped** — nodes are cluster-scoped, so image caching resources are too. +6. **Policy separation** — `PullPolicy` and `DiscoveryPolicy` are independent resources with single concerns. + +--- + +## Resource overview + +| Kind | API Group/Version | Scope | Single concern | +|------|-------------------|-------|----------------| +| `CachedImage` | `drop.corewire.io/v1alpha1` | Cluster | "This image should be cached on these nodes" | +| `CachedImageSet` | `drop.corewire.io/v1alpha1` | Cluster | "This group of images should be cached on these nodes" | +| `PullPolicy` | `drop.corewire.io/v1alpha1` | Cluster | "Control pull pacing and safety" | +| `DiscoveryPolicy` | `drop.corewire.io/v1alpha1` | Cluster | "How to discover images dynamically" | + +--- + +## Resource hierarchy + +``` +PullPolicy → "how fast/safe do we pull?" (reusable, referenced by sets/images) +DiscoveryPolicy → "how do we find images?" 
(attached to a CachedImageSet) + ↑ referenced by +CachedImageSet → "which images as a group" (static list or discovery-driven) + │ owns (ownerReferences) + ↓ +CachedImage → "one image on target nodes" (leaf resource, reconciled individually) +``` + +--- + +## CRD field definitions + +### `CachedImage` + +```yaml +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: cuda-base # cluster-scoped, no namespace +spec: + image: nvcr.io/nvidia/cuda + tag: "12.4.0-runtime-ubuntu22.04" # optional, mutually exclusive with digest + digest: "" # optional, preferred for immutable refs + pullPolicy: IfNotPresent # IfNotPresent | Always + repullPolicy: Never # Never | OnSchedule | Always + policyRef: + name: gpu-fast # reference to a PullPolicy + nodeSelector: # target specific nodes + gpu: "true" + tolerations: # tolerate taints on target nodes + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + priority: 10 # optional ordering hint (lower = pulled first) +status: + phase: Ready # Pending | Pulling | Ready | Failed + nodesTargeted: 5 + nodesReady: 5 + lastPulledAt: "2026-05-22T05:00:00Z" + observedGeneration: 1 + conditions: [] +``` + +### `CachedImageSet` + +```yaml +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImageSet +metadata: + name: build-essentials +spec: + policyRef: + name: build-safe # reference to a PullPolicy + discoveryPolicyRef: + name: discover-ci-images # optional, reference to a DiscoveryPolicy + nodeSelector: + node-role.kubernetes.io/build: "true" + tolerations: + - key: "node-role.kubernetes.io/build" + operator: "Exists" + effect: "NoSchedule" + images: # static image list (used when no discoveryPolicyRef) + - image: registry.example.com/team/image-a + tag: "1.2.3" + - image: registry.example.com/team/image-b + tag: "4.5.6" + pullPolicy: IfNotPresent # default for child CachedImages + repullPolicy: Never # default for child CachedImages +status: + phase: Ready + imagesManaged: 2 + imagesReady: 2 + 
observedGeneration: 1 + conditions: [] +``` + +### `PullPolicy` + +```yaml +apiVersion: drop.corewire.io/v1alpha1 +kind: PullPolicy +metadata: + name: build-safe +spec: + maxConcurrentNodes: 1 # max nodes pulling at once + minDelayBetweenPulls: 20s # spacing between pull starts + failureBackoff: + initial: 10s # first retry delay + max: 5m # max retry delay + repullPolicyDefault: OnSchedule # default repull behavior for referencing images + nodeSelector: # optional: scope policy to a node pool + node-role.kubernetes.io/build: "true" + tolerations: # optional: match tainted nodes in pool + - key: "node-role.kubernetes.io/build" + operator: "Exists" + effect: "NoSchedule" +``` + +### `DiscoveryPolicy` + +Designed for **extensibility**: `sources` is a list so multiple backends can feed the same policy. Each source type uses a uniform connection pattern with optional `secretRef` for auth (tokens, headers, TLS certs — anything passable as a k8s Secret). New source types can be added in future versions without breaking the schema. + +```yaml +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: discover-ci-images +spec: + sources: # list of discovery backends (extensible) + - type: prometheus # metrics-based discovery + prometheus: + endpoint: http://prometheus.monitoring.svc:9090 + query: | + topk(5, + count by (image) ( + kube_pod_container_info{image=~"registry.example.com/team/.*"} + ) + ) + interval: 1h # query execution interval + secretRef: # optional: auth for this source + name: prometheus-creds # Secret with keys: token, username, password, ca.crt, headers.* + - type: registry # OCI registry tag discovery + registry: + url: https://registry.example.com + repositories: # list of repos to scan + - team/image-a + - team/image-b + tagFilter: "^v[0-9]+\\." 
# regex to select tags + topX: 3 # keep top X tags per repo (by semver/date) + secretRef: + name: registry-creds # Secret with keys: username, password, token, ca.crt, headers.* + imageFilter: + pattern: "registry.example.com/team/.*" # regex filter on discovered images + syncInterval: 30m # how often to reconcile discovered set + maxImages: 10 # cap on total discovered images +status: + lastSyncTime: "2026-05-22T05:00:00Z" + discoveredImages: 5 + conditions: [] +``` + +#### Source types (v1alpha1) + +| Type | Purpose | Config object | +|------|---------|---------------| +| `prometheus` | Discover images from metrics queries | `prometheus: {endpoint, query, interval}` | +| `registry` | Discover tags from OCI registries | `registry: {url, repositories, tagFilter, topX}` | + +#### Future source types (planned/extensible) + +| Type | Purpose | +|------|---------| +| `graphite` | Alternative metrics backend | +| `datadog` | Datadog metrics API | +| `webhook` | External HTTP endpoint returning image list | +| `argocd` | Discover images from Argo CD application manifests | + +#### Secret format (`secretRef`) + +Each source's `secretRef` points to a k8s Secret. The operator reads well-known keys: + +| Secret key | Usage | +|------------|-------| +| `token` | Bearer token for Authorization header | +| `username` | Basic auth username | +| `password` | Basic auth password | +| `ca.crt` | Custom CA certificate (PEM) for TLS verification | +| `tls.crt` | Client certificate for mTLS | +| `tls.key` | Client key for mTLS | +| `headers.<name>` | Arbitrary HTTP headers, one Secret key per header (e.g. `headers.X-Custom-Auth`) | + +This allows any authentication scheme without operator code changes — just populate the Secret appropriately. + +--- + +## Why this design + +- **"Cached" describes desired state** — idiomatic for k8s (you declare what should be true). +- **No ambiguity** — "CachedImage" clearly differs from OCI Image manifests or container image refs. 
+- **Cluster-scoped** — nodes are cluster-scoped; images cached on nodes logically belong at cluster level. +- **Non-disruptive** — image pulls never affect node schedulability. The operator does not cordon, drain, or mark nodes unavailable. Pulls are background operations. The operator may place images on nodes before they are marked Ready (e.g. during node bootstrap). +- **Discovery is separate** — `DiscoveryPolicy` has its own reconciliation loop, sync interval, and failure modes. Keeping it separate from `CachedImageSet` follows single-concern principle and allows reuse. +- **Policy is separate** — `PullPolicy` can be shared across many sets/images, tuned independently by platform teams. +- **Owner references for GC** — when a `CachedImageSet` is deleted, its child `CachedImage` resources are garbage-collected automatically. + +--- + +## Alternatives considered (rejected) + +| Proposal | Names | Why rejected | +|----------|-------|--------------| +| A | `Image` + `ImageSet` + `PullPolicy` | "Image" too generic, confusing in conversation | +| B | `NodeImage` + `NodeImageSet` + `PullPolicy` | Less intuitive than "Cached" for desired state | +| D | `PrePullImage` + `PrePullImageSet` + `PrePullPolicy` | Verbose, redundant within `drop.corewire.io` group | diff --git a/docs/doc-generation.md b/docs/doc-generation.md new file mode 100644 index 0000000..e284197 --- /dev/null +++ b/docs/doc-generation.md @@ -0,0 +1,84 @@ +# Documentation Generation + + + +## How It Works + +All documentation is generated from source code via `make docs-gen` (which runs `go run ./hack/gen-ai-docs/`). + +```mermaid +flowchart TD + subgraph Sources["Source of Truth"] + TYPES["api/v1alpha1/*_types.go
(CRD types + kubebuilder markers)"] + CTRL["internal/controller/*.go<br/>(reconcilers, error reasons)"] + METRICS["internal/metrics/metrics.go<br/>(Prometheus metrics)"] + MAKEFILE["Makefile<br/>(build targets)"] + GOMOD["go.mod<br/>(Go version, module)"] + SAMPLES["hack/dev-samples.yaml
(example CRs)"] + end + + subgraph Generator["hack/gen-ai-docs/"] + PARSE["Go AST Parser
+ go list -json"] + KNOWLEDGE["knowledge.yaml
(structured intermediate)"] + RENDER["Template Renderer"] + end + + subgraph UseAgents["USE Agents"] + LLMS["llms.txt
(short onboarding)"] + LLMSFULL["llms-full.txt
(complete reference)"] + end + + subgraph CodeAgents["CODE Agents"] + COPILOT[".github/copilot-instructions.md"] + CURSOR[".cursorrules"] + AGENTS["AGENTS.md"] + end + + subgraph Humans["Humans (Hugo)"] + CRDS["reference/_generated_crds.md"] + ERRORS["reference/_generated_errors.md"] + METRICSH["reference/_generated_metrics.md"] + ARCH["reference/_generated_architecture.md"] + end + + TYPES --> PARSE + CTRL --> PARSE + METRICS --> PARSE + MAKEFILE --> PARSE + GOMOD --> PARSE + SAMPLES --> PARSE + + PARSE --> KNOWLEDGE + KNOWLEDGE --> RENDER + + RENDER --> LLMS + RENDER --> LLMSFULL + RENDER --> COPILOT + RENDER --> CURSOR + RENDER --> AGENTS + RENDER --> CRDS + RENDER --> ERRORS + RENDER --> METRICSH + RENDER --> ARCH +``` + +## Three Audiences + +```mermaid +graph LR + subgraph SoT["Single Source of Truth"] + CODE["Go Source Code"] + end + + CODE -->|schema, fields, examples| USE["USE Agents
(GitOps, kubectl, IaC)"] + CODE -->|architecture, conventions| DEV["CODE Agents<br/>(Copilot, Cursor, Codex)"] + CODE -->|narrative + generated ref| HUMAN["Humans
(Hugo docs site)"] +``` + +## Commands + +| Command | Purpose | +|---------|---------| +| `make docs-gen` | Regenerate all docs from source | +| `make docs-gen-check` | CI gate — fails if docs are stale | +| `make codegen` | CRDs + deepcopy + docs (full pipeline) | diff --git a/docs/go.mod b/docs/go.mod new file mode 100644 index 0000000..c0e3c3f --- /dev/null +++ b/docs/go.mod @@ -0,0 +1,5 @@ +module github.com/Breee/drop/docs + +go 1.26.0 + +require github.com/imfing/hextra v0.12.3 // indirect diff --git a/docs/go.sum b/docs/go.sum new file mode 100644 index 0000000..afa8680 --- /dev/null +++ b/docs/go.sum @@ -0,0 +1,2 @@ +github.com/imfing/hextra v0.12.3 h1:DZHY2rUWYteyzjlHi9r4n7Bb5e2Q+6LXe4C1Dqn0ZjM= +github.com/imfing/hextra v0.12.3/go.mod h1:vi+yhpq8YPp/aghvJlNKVnJKcPJ/VyAEcfC1BSV9ARo= diff --git a/docs/hugo.yaml b/docs/hugo.yaml new file mode 100644 index 0000000..9553352 --- /dev/null +++ b/docs/hugo.yaml @@ -0,0 +1,67 @@ +baseURL: "https://breee.github.io/drop/" +title: Puller Operator +defaultContentLanguage: en +enableGitInfo: true + +module: + imports: + - path: github.com/imfing/hextra + +# Hextra v0.12 handles markdown + llms output formats natively +outputs: + home: [html, llms] + page: [html, markdown] + section: [html, rss, markdown] + +markup: + goldmark: + renderer: + unsafe: true + highlight: + noClasses: false + +menu: + main: + - name: Documentation + pageRef: /docs + weight: 1 + - name: Search + weight: 3 + params: + type: search + - name: GitHub + url: https://github.com/Breee/drop + weight: 4 + params: + icon: github + +params: + description: Kubernetes operator that caches container images on cluster nodes. 
+ displayUpdatedDate: true + navbar: + displayTitle: true + displayLogo: false + page: + width: wide + contextMenu: + enable: true + links: + - name: Open in ChatGPT + icon: chatgpt + url: "https://chatgpt.com/?q=Read+{markdown_url}+and+help+me+with+{title}" + - name: Open in Claude + icon: claude + url: "https://claude.ai/new?q=Read+{markdown_url}+and+help+me+with+{title}" + footer: + displayPoweredBy: false + search: + enable: true + type: flexsearch + flexsearch: + index: content + docs: + sidebar: + defaultOpen: true + editURL: + enable: true + base: https://github.com/Breee/drop/edit/main/docs/content diff --git a/docs/layouts/partials/custom/head-end.html b/docs/layouts/partials/custom/head-end.html new file mode 100644 index 0000000..104a5b2 --- /dev/null +++ b/docs/layouts/partials/custom/head-end.html @@ -0,0 +1,4 @@ +{{- /* Advertise markdown alternate for AI agent discovery */ -}} +{{- with .OutputFormats.Get "markdown" -}} + +{{- end -}} diff --git a/docs/static/casts/apply.cast b/docs/static/casts/apply.cast new file mode 100644 index 0000000..42006ef --- /dev/null +++ b/docs/static/casts/apply.cast @@ -0,0 +1,14 @@ +{"version": 2, "width": 80, "height": 22, "timestamp": 1779633738, "env": {}} +[0.00551, "o", "$ cat cachedimage.yaml\r\n"] +[1.009388, "o", "apiVersion: puller.corewire.io/v1alpha1\r\nkind: CachedImage\r\nmetadata:\r\n name: nginx-demo\r\nspec:\r\n image: docker.io/library/nginx\r\n tag: \"1.27\"\r\n nodeSelector:\r\n kubernetes.io/os: linux\r\n"] +[4.011544, "o", "\r\n"] +[4.011701, "o", "$ kubectl apply -f cachedimage.yaml\r\n"] +[4.108703, "o", "cachedimage.puller.corewire.io/nginx-demo created\r\n"] +[6.118397, "o", "\r\n$ kubectl get cachedimages nginx-demo -w\r\n"] +[6.189781, "o", "NAME IMAGE TAG STATUS READY AGE\r\nnginx-demo docker.io/library/nginx 1.27 InProgress 0/2 2s\r\n"] +[6.957172, "o", "nginx-demo docker.io/library/nginx 1.27 InProgress 1/2 3s\r\n"] +[8.724625, "o", "nginx-demo docker.io/library/nginx 1.27 InProgress 1/2 
5s\r\n"] +[9.879017, "o", "nginx-demo docker.io/library/nginx 1.27 Cached 2/2 6s\r\n"] +[9.887387, "o", "nginx-demo docker.io/library/nginx 1.27 Cached 2/2 6s\r\n"] +[9.899384, "o", "nginx-demo docker.io/library/nginx 1.27 Cached 2/2 6s\r\n"] +[14.156168, "o", "nginx-demo docker.io/library/nginx 1.27 Cached 2/2 10s\r\n"] diff --git a/docs/static/casts/events.cast b/docs/static/casts/events.cast new file mode 100644 index 0000000..febe616 --- /dev/null +++ b/docs/static/casts/events.cast @@ -0,0 +1,11 @@ +{"version": 2, "width": 120, "height": 22, "timestamp": 1779633798, "env": {}} +[0.005057, "o", "$ kubectl get events --field-selector reason!=LeaderElection --watch-only\r\n"] +[3.276926, "o", "LAST SEEN TYPE REASON OBJECT MESSAGE\r\n0s Normal Pulling pod/pull-nginx-demo-t58qv Pulling image \"docker.io/library/nginx:1.27\"\r\n"] +[3.988758, "o", "0s Normal Pulled pod/pull-nginx-demo-t58qv Successfully pulled image \"docker.io/library/nginx:1.27\" in 704ms (704ms including waiting). Image size: 72406859 bytes.\r\n"] +[4.027182, "o", "0s Normal Created pod/pull-nginx-demo-t58qv Container created\r\n"] +[4.169234, "o", "0s Normal Started pod/pull-nginx-demo-t58qv Container started\r\n"] +[5.717687, "o", "0s Normal Pulling pod/pull-nginx-demo-g5xtm Pulling image \"docker.io/library/nginx:1.27\""] +[5.717903, "o", "\r\n"] +[6.424978, "o", "0s Normal Pulled pod/pull-nginx-demo-g5xtm Successfully pulled image \"docker.io/library/nginx:1.27\" in 696ms (696ms including waiting). 
Image size: 72406859 bytes.\r\n"] +[6.463686, "o", "0s Normal Created pod/pull-nginx-demo-g5xtm Container created\r\n"] +[6.590277, "o", "0s Normal Started pod/pull-nginx-demo-g5xtm Container started\r\n"] diff --git a/docs/static/casts/pods.cast b/docs/static/casts/pods.cast new file mode 100644 index 0000000..c51c391 --- /dev/null +++ b/docs/static/casts/pods.cast @@ -0,0 +1,14 @@ +{"version": 2, "width": 80, "height": 22, "timestamp": 1779633770, "env": {}} +[0.005664, "o", "$ kubectl get pods -l app.kubernetes.io/managed-by=puller -o custom-columns=NAME:.metadata.name,STATUS:.status.phase,NODE:.spec.nodeName -w\r\n"] +[3.089574, "o", "NAME STATUS NODE\r\npull-nginx-demo-c4r7b Pending puller-dev-worker\r\n"] +[3.109275, "o", "pull-nginx-demo-c4r7b Pending puller-dev-worker\r\n"] +[5.011177, "o", "pull-nginx-demo-c4r7b Pending puller-dev-worker\r\n"] +[6.150844, "o", "pull-nginx-demo-c4r7b Succeeded puller-dev-worker\r\n"] +[6.164105, "o", "pull-nginx-demo-c4r7b Succeeded puller-dev-worker\r\n"] +[6.167782, "o", "pull-nginx-demo-c4r7b Succeeded puller-dev-worker\r\n"] +[6.179314, "o", "pull-nginx-demo-6w4ct Pending puller-dev-worker2\r\n"] +[6.21038, "o", "pull-nginx-demo-6w4ct Pending puller-dev-worker2\r\n"] +[8.012095, "o", "pull-nginx-demo-6w4ct Pending puller-dev-worker2\r\n"] +[9.152234, "o", "pull-nginx-demo-6w4ct Succeeded puller-dev-worker2\r\n"] +[9.167537, "o", "pull-nginx-demo-6w4ct Succeeded puller-dev-worker2\r\n"] +[9.173683, "o", "pull-nginx-demo-6w4ct Succeeded puller-dev-worker2\r\n"] diff --git a/docs/static/llms-full.txt b/docs/static/llms-full.txt new file mode 100644 index 0000000..e3edc2c --- /dev/null +++ b/docs/static/llms-full.txt @@ -0,0 +1,426 @@ +# drop — Full Reference for AI Agents + +## Project + +- **Name**: drop +- **Language**: Go 1.23.0 +- **Module**: github.com/Breee/drop +- **API Group**: drop.corewire.io/v1alpha1 +- **Scope**: All CRDs cluster-scoped +- **License**: Apache-2.0 +- **Framework**: Kubebuilder / 
controller-runtime + +## CRD Field Reference + +### CachedImage + +CachedImage is the Schema for the cachedimages API. + +Controller: internal/controller/cachedimage_controller.go | Test: internal/controller/cachedimage_controller_test.go + +#### Spec +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| Image | `image` | `string` | ✓ | | Image is the fully qualified image reference (registry/repository). | +| Tag | `tag` | `string` | — | | Tag to pull. Mutually exclusive with Digest. | +| Digest | `digest` | `string` | — | | Digest to pull (immutable reference). Mutually exclusive with Tag. | +| ImagePullPolicy | `imagePullPolicy` | `corev1.PullPolicy` | — | `Always` | ImagePullPolicy controls when kubelet pulls the image. Defaults to Always (checks upstream digest, only downloads if changed). Set to IfNotPresent to skip the registry check when the tag already exists locally. Enum: `Always`,`IfNotPresent`,`Never` | +| ImagePullSecrets | `imagePullSecrets` | `[]corev1.LocalObjectReference` | — | | ImagePullSecrets are references to secrets for pulling from private registries. | +| NodeSelector | `nodeSelector` | `map[string]string` | — | | NodeSelector restricts which nodes to cache the image on. | +| Tolerations | `tolerations` | `[]corev1.Toleration` | — | | Tolerations allow targeting tainted nodes. | +| Priority | `priority` | `*int32` | — | | Priority is a pull ordering hint (lower values pulled first). | +| PolicyRef | `policyRef` | `*PolicyReference` | — | | PolicyRef references a PullPolicy for pacing controls. | + +#### Status +| Field | JSON | Type | Description | +|-------|------|------|-------------| +| ObservedGeneration | `observedGeneration` | `int64` | ObservedGeneration is the last generation reconciled. | +| Phase | `phase` | `string` | Phase summarizes the overall state. | +| Ready | `ready` | `string` | Ready is a human-readable "nodesReady/nodesTargeted" fraction for display. 
| +| ResolvedDigest | `resolvedDigest` | `string` | ResolvedDigest is the sha256 digest of the image as reported by the container runtime after pull. | +| NodesTargeted | `nodesTargeted` | `int32` | NodesTargeted is the number of nodes that should have this image. | +| NodesReady | `nodesReady` | `int32` | NodesReady is the number of nodes that have successfully pulled the image. | +| CachedNodes | `cachedNodes` | `[]string` | CachedNodes is the list of node names that have successfully cached the image. | +| ConsecutiveFailures | `consecutiveFailures` | `int32` | ConsecutiveFailures counts sequential reconcile failures for backoff calculation. | +| LastPulledAt | `lastPulledAt` | `*metav1.Time` | LastPulledAt is the timestamp of the most recent successful pull. | +| LastAttemptedAt | `lastAttemptedAt` | `*metav1.Time` | LastAttemptedAt is the timestamp of the most recent pull attempt (success or failure). | +| Conditions | `conditions` | `[]metav1.Condition` | Conditions represent the latest available observations. | + + +### CachedImageSet + +CachedImageSet is the Schema for the cachedimagesets API. + +Controller: internal/controller/cachedimageset_controller.go | Test: internal/controller/cachedimageset_controller_test.go + +#### Spec +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| PolicyRef | `policyRef` | `*PolicyReference` | — | | PolicyRef references a PullPolicy for pacing controls. | +| DiscoveryPolicyRef | `discoveryPolicyRef` | `*DiscoveryPolicyReference` | — | | DiscoveryPolicyRef references a DiscoveryPolicy for dynamic image lists. | +| ImagePullPolicy | `imagePullPolicy` | `corev1.PullPolicy` | — | `Always` | ImagePullPolicy controls when kubelet pulls the image (propagated to children). 
Enum: `Always`,`IfNotPresent`,`Never` | +| ImagePullSecrets | `imagePullSecrets` | `[]corev1.LocalObjectReference` | — | | ImagePullSecrets are references to secrets for pulling from private registries (propagated to children). | +| NodeSelector | `nodeSelector` | `map[string]string` | — | | NodeSelector restricts which nodes to cache images on (propagated to children). | +| Tolerations | `tolerations` | `[]corev1.Toleration` | — | | Tolerations allow targeting tainted nodes (propagated to children). | +| Images | `images` | `[]ImageEntry` | — | | Images is a static list of images to cache. | + +#### Status +| Field | JSON | Type | Description | +|-------|------|------|-------------| +| ObservedGeneration | `observedGeneration` | `int64` | ObservedGeneration is the last generation reconciled. | +| Phase | `phase` | `string` | Phase summarizes the overall state. | +| ImagesManaged | `imagesManaged` | `int32` | ImagesManaged is the number of CachedImage children managed by this set. | +| ImagesReady | `imagesReady` | `int32` | ImagesReady is the number of children in Ready phase. | +| Conditions | `conditions` | `[]metav1.Condition` | Conditions represent the latest available observations. | + + +### PullPolicy + +PullPolicy is the Schema for the pullpolicies API. It is a configuration-only resource with no status. + +#### Spec +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| MaxConcurrentNodes | `maxConcurrentNodes` | `int32` | — | `1` | MaxConcurrentNodes is the max nodes pulling simultaneously for this policy. | +| MinDelayBetweenPulls | `minDelayBetweenPulls` | `metav1.Duration` | — | `10s` | MinDelayBetweenPulls is the minimum time between starting pulls on different nodes. | +| FailureBackoff | `failureBackoff` | `*BackoffConfig` | — | | FailureBackoff configures retry delays on pull failures. 
| +| RepullInterval | `repullInterval` | `*metav1.Duration` | — | | RepullInterval is how often to re-pull cached images. Zero or unset means never re-pull. | +| NodeSelector | `nodeSelector` | `map[string]string` | — | | NodeSelector scopes this policy to a specific node pool. | +| Tolerations | `tolerations` | `[]corev1.Toleration` | — | | Tolerations match tainted nodes in the pool. | + + +### DiscoveryPolicy + +DiscoveryPolicy is the Schema for the discoverypolicies API. + +Controller: internal/controller/discoverypolicy_controller.go | Test: internal/controller/discoverypolicy_controller_test.go + +#### Spec +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| Sources | `sources` | `[]DiscoverySource` | ✓ | | Sources is the list of discovery backends to query. | +| ImageFilter | `imageFilter` | `string` | — | | ImageFilter is a regex to filter discovered images. | +| SyncInterval | `syncInterval` | `metav1.Duration` | — | `30m` | SyncInterval is how often to re-query sources. | +| MaxImages | `maxImages` | `int32` | — | `50` | MaxImages caps the number of discovered images. | + +#### Status +| Field | JSON | Type | Description | +|-------|------|------|-------------| +| LastSyncTime | `lastSyncTime` | `*metav1.Time` | LastSyncTime is the timestamp of the last successful sync. | +| DiscoveredImages | `discoveredImages` | `[]DiscoveredImage` | DiscoveredImages is the list of discovered images from all sources. | +| ImageCount | `imageCount` | `int32` | ImageCount is the number of discovered images. | +| SourceCount | `sourceCount` | `int32` | SourceCount is the number of configured sources. | +| Conditions | `conditions` | `[]metav1.Condition` | Conditions represent the latest available observations. | + + + +## Helper Types + +### PolicyReference + +PolicyReference is a reference to a PullPolicy resource. 
+ +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| Name | `name` | `string` | ✓ | | Name of the PullPolicy resource. | + +### DiscoveryPolicyReference + +DiscoveryPolicyReference is a reference to a DiscoveryPolicy resource. + +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| Name | `name` | `string` | ✓ | | Name of the DiscoveryPolicy resource. | + +### ImageEntry + +ImageEntry defines a single image to include in a set. + +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| Image | `image` | `string` | ✓ | | Image is the fully qualified image reference (registry/repository). | +| Tag | `tag` | `string` | — | | Tag to pull. | +| Digest | `digest` | `string` | — | | Digest to pull. | + +### BackoffConfig + +BackoffConfig defines retry backoff behavior. + +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| Initial | `initial` | `metav1.Duration` | — | `30s` | Initial delay before first retry. | +| Max | `max` | `metav1.Duration` | — | `5m` | Max delay cap for exponential backoff. | + +### DiscoverySource + +DiscoverySource defines a single discovery backend. + +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| Type | `type` | `string` | ✓ | | Type identifies the backend. Enum: `prometheus`,`registry` | +| Prometheus | `prometheus` | `*PrometheusSource` | — | | Prometheus config (when type=prometheus). | +| Registry | `registry` | `*RegistrySource` | — | | Registry config (when type=registry). | +| SecretRef | `secretRef` | `*corev1.LocalObjectReference` | — | | SecretRef references a Secret for auth/TLS for this source. | + +### PrometheusSource + +PrometheusSource defines Prometheus query configuration. 
+ +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| Endpoint | `endpoint` | `string` | ✓ | | Endpoint is the Prometheus API URL. | +| Query | `query` | `string` | ✓ | | Query is the PromQL query that must return an 'image' label. | +| Lookback | `lookback` | `*metav1.Duration` | — | | Lookback is the time window to aggregate over (e.g. "7d", "24h"). When set, uses query_range and sums values to rank by total usage. When unset, uses an instant query (point-in-time). | +| Step | `step` | `string` | — | `5m` | Step is the query resolution step for range queries. | + +### RegistrySource + +RegistrySource defines OCI registry tag listing configuration. + +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| URL | `url` | `string` | ✓ | | URL is the registry base URL. | +| Repositories | `repositories` | `[]string` | ✓ | | Repositories is the list of repositories to query. | +| TagFilter | `tagFilter` | `string` | — | | TagFilter is a regex to filter tags. | +| TopX | `topX` | `int32` | — | | TopX limits the number of tags to fetch per repository. | +| ImageTemplate | `imageTemplate` | `string` | — | | ImageTemplate is a Go text/template for constructing the full image reference. Available variables: .Registry, .Repository, .Tag | + +### DiscoveredImage + +DiscoveredImage represents a single discovered image with metadata. + +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| Image | `image` | `string` | ✓ | | Image is the fully qualified image reference. | +| Score | `score` | `int64` | ✓ | | Score is the ranking score from the source (higher = more relevant). | +| Source | `source` | `string` | ✓ | | Source identifies which discovery source produced this image. 
| + + +## Relationships + +```mermaid +graph LR + CachedImageSet -->|owns| CachedImage + CachedImage -->|creates| Pod + CachedImage -->|references| PullPolicy + CachedImageSet -->|references| PullPolicy + CachedImageSet -->|references| DiscoveryPolicy + DiscoveryPolicy -->|feeds| CachedImageSet +``` + +## Status Conditions & Error Reasons + +| Reason | Controller | Meaning | Troubleshooting | +|--------|-----------|---------|-----------------| +| Cached | CachedImage | All target nodes have the image cached | | +| Degraded | CachedImageSet | Some child CachedImages have failures | Check individual CachedImage statuses | +| ErrImagePull | CachedImage | Registry unreachable or image does not exist | Verify registry DNS, image name, tag. Check network policies | +| ImagePullBackOff | CachedImage | Repeated pull failures, kubelet is backing off | Check imagePullSecrets, registry auth. Verify image exists | +| InProgress | CachedImage | Image pulls are actively running on some nodes | | +| InvalidImageName | CachedImage | The image reference is malformed | Check spec.image format: registry/repository | +| PartiallyFailed | DiscoveryPolicy | Some discovery sources failed to sync | Check source endpoints and credentials | +| PodFailed | CachedImage | Drop Pod failed for a non-image-pull reason | Check node health, resource limits, Pod security policies | +| Progressing | CachedImageSet | Children are still being pulled | | +| PullFailed | CachedImage | One or more nodes failed to pull the image | Check image name, tag, registry connectivity, imagePullSecrets | +| Ready | CachedImageSet | All child CachedImages are ready | | +| RegistryUnavailable | CachedImage | Cannot connect to the container registry | Check registry URL, DNS, firewall rules | +| SourceError | DiscoveryPolicy | One or more discovery sources returned errors | Check source configuration and connectivity | +| SyncFailed | DiscoveryPolicy | All discovery sources failed | Check all source endpoints, 
credentials, network | +| Synced | DiscoveryPolicy | All sources synced successfully | | + +## Metrics + +| Name | Type | Description | +|------|------|-------------| +| `drop_images_cached_total` | counter | Total number of images successfully cached on nodes. | +| `drop_pull_duration_seconds` | histogram | Duration of image pull operations in seconds. | +| `drop_pull_errors_total` | counter | Total number of failed image pull attempts. | +| `drop_discovery_images_found` | gauge | Number of images found by a discovery policy. | +| `drop_active_pulls` | gauge | Current number of active image pull Pods. | +| `drop_reconcile_total` | counter | Total number of reconciliation attempts. | +| `drop_discovery_source_health` | gauge | Whether a discovery source is reachable and queryable (1=healthy, 0=unhealthy). | +| `drop_discovery_source_latency_seconds` | histogram | Latency of discovery source queries in seconds. | + +## Sample CRs + +```yaml +# Dev samples: deployed by Tilt for interactive testing +--- +# === PullPolicy === +apiVersion: drop.corewire.io/v1alpha1 +kind: PullPolicy +metadata: + name: dev-conservative +spec: + maxConcurrentNodes: 1 + minDelayBetweenPulls: 5s + repullInterval: 1h + failureBackoff: + initial: 30s + max: 5m +--- +# === CachedImage: healthy === +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: dev-nginx +spec: + image: docker.io/library/nginx + tag: "1.25-alpine" + policyRef: + name: dev-conservative +--- +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: dev-redis +spec: + image: docker.io/library/redis + tag: "7-alpine" + policyRef: + name: dev-conservative +--- +# === CachedImage: broken (DNS failure → ImagePullBackOff) === +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: test-invalid-image +spec: + image: registry.invalid.local:9999/does-not-exist + tag: "nope" + policyRef: + name: dev-conservative +--- +# === CachedImageSet: healthy (static images) === 
+apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImageSet +metadata: + name: dev-set +spec: + policyRef: + name: dev-conservative + images: + - image: docker.io/library/alpine + tag: "3.19" + - image: docker.io/library/busybox + tag: "1.36" +--- +# === CachedImageSet: dynamic (backed by DiscoveryPolicy) === +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImageSet +metadata: + name: dev-set-discovered +spec: + policyRef: + name: dev-conservative + discoveryPolicyRef: + name: dev-registry +--- +# === DiscoveryPolicy: healthy (Prometheus range query) === +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: dev-prometheus +spec: + sources: + - type: prometheus + prometheus: + endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090" + query: 'count(container_memory_working_set_bytes{container!="", namespace="build-stuff"}) by (image)' + lookback: 24h + step: 5m + syncInterval: 30s + maxImages: 10 +--- +# === DiscoveryPolicy: healthy (registry tag listing) === +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: dev-registry +spec: + sources: + - type: registry + registry: + url: "http://registry.e2e-infra.svc.cluster.local:5000" + repositories: + - "test/myapp" + topX: 3 + syncInterval: 30s + maxImages: 10 +--- +# === DiscoveryPolicy: broken (DNS error → DNSError) === +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: test-broken-prom +spec: + sources: + - type: prometheus + prometheus: + endpoint: "http://nonexistent-prometheus:9090" + query: "up{}" + syncInterval: 30m + maxImages: 10 +--- +# === DiscoveryPolicy: broken (DNS error → DNSError) === +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: test-broken-registry +spec: + sources: + - type: registry + registry: + url: "http://nonexistent-registry:5000" + repositories: + - "test/nope" + syncInterval: 30m + maxImages: 10 +--- +# === DiscoveryPolicy: broken (repo doesn't exist → NotFound) 
=== +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: test-notfound-repo +spec: + sources: + - type: registry + registry: + url: "http://registry.e2e-infra.svc.cluster.local:5000" + repositories: + - "this/does-not-exist" + syncInterval: 30m + maxImages: 10 + +``` + +## Build & Test + +``` + make help # Display this help. + make build # Build manager binary. + make run # Run controller from your host. + make fmt # Run go fmt. + make vet # Run go vet. + make lint # Run golangci-lint. + make lint-fix # Run golangci-lint with auto-fix. + make generate # Generate DeepCopy methods. + make manifests # Generate CRD and RBAC manifests. + make codegen # Run all code generation (deepcopy + CRDs + docs). + make test # Run unit tests. + make test-e2e # Run Chainsaw E2E tests (requires kind cluster). + make kind-create # Create kind cluster for development. + make kind-delete # Delete the kind cluster. + make install # Install CRDs into cluster. + make uninstall # Uninstall CRDs from cluster. + make e2e-infra # Deploy Prometheus + Registry for E2E/dev. + make docker-build # Build docker image. + make docker-push # Push docker image. + make kind-load # Build and load image into kind. + make helm-lint # Lint the Helm chart. + make helm-template # Render Helm templates locally. + make docs-serve # Serve Hugo docs locally. + make docs-gen # Regenerate AI agent docs (llms.txt, instructions, etc.) from source. + make docs-gen-check # Verify generated AI docs are up to date. + make tools # Install local tooling and check optional docs/chart binaries. 
+``` diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..aa7c71d --- /dev/null +++ b/go.mod @@ -0,0 +1,100 @@ +module github.com/Breee/drop + +go 1.26.0 + +godebug default=go1.26 + +require ( + github.com/onsi/ginkgo/v2 v2.22.0 + github.com/onsi/gomega v1.36.1 + github.com/prometheus/client_golang v1.19.1 + gopkg.in/yaml.v3 v3.0.1 + k8s.io/api v0.32.1 + k8s.io/apimachinery v0.32.1 + k8s.io/client-go v0.32.1 + k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 + sigs.k8s.io/controller-runtime v0.20.4 +) + +require ( + cel.dev/expr v0.18.0 // indirect + github.com/antlr4-go/antlr/v4 v4.13.0 // indirect + github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a // indirect + github.com/beorn7/perks v1.0.1 // indirect + github.com/blang/semver/v4 v4.0.0 // indirect + github.com/cenkalti/backoff/v4 v4.3.0 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/emicklei/go-restful/v3 v3.11.0 // indirect + github.com/evanphx/json-patch/v5 v5.9.11 // indirect + github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/fsnotify/fsnotify v1.7.0 // indirect + github.com/fxamacker/cbor/v2 v2.7.0 // indirect + github.com/go-logr/logr v1.4.2 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/go-logr/zapr v1.3.0 // indirect + github.com/go-openapi/jsonpointer v0.21.0 // indirect + github.com/go-openapi/jsonreference v0.20.2 // indirect + github.com/go-openapi/swag v0.23.0 // indirect + github.com/go-task/slim-sprig/v3 v3.0.0 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/golang/protobuf v1.5.4 // indirect + github.com/google/btree v1.1.3 // indirect + github.com/google/cel-go v0.22.0 // indirect + github.com/google/gnostic-models v0.6.8 // indirect + github.com/google/go-cmp v0.6.0 // indirect + github.com/google/gofuzz v1.2.0 // indirect + github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db // indirect + 
github.com/google/uuid v1.6.0 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect + github.com/inconshreveable/mousetrap v1.1.0 // indirect + github.com/josharian/intern v1.0.0 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/mailru/easyjson v0.7.7 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/pkg/errors v0.9.1 // indirect + github.com/prometheus/client_model v0.6.1 // indirect + github.com/prometheus/common v0.55.0 // indirect + github.com/prometheus/procfs v0.15.1 // indirect + github.com/spf13/cobra v1.8.1 // indirect + github.com/spf13/pflag v1.0.5 // indirect + github.com/stoewer/go-strcase v1.3.0 // indirect + github.com/x448/float16 v0.8.4 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 // indirect + go.opentelemetry.io/otel v1.28.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0 // indirect + go.opentelemetry.io/otel/metric v1.28.0 // indirect + go.opentelemetry.io/otel/sdk v1.28.0 // indirect + go.opentelemetry.io/otel/trace v1.28.0 // indirect + go.opentelemetry.io/proto/otlp v1.3.1 // indirect + go.uber.org/multierr v1.11.0 // indirect + go.uber.org/zap v1.27.0 // indirect + golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect + golang.org/x/net v0.30.0 // indirect + golang.org/x/oauth2 v0.23.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/sys v0.26.0 // indirect + golang.org/x/term v0.25.0 // indirect + golang.org/x/text v0.19.0 // indirect + golang.org/x/time v0.7.0 // indirect + golang.org/x/tools v0.26.0 // indirect + gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20240826202546-f6391c0de4c7 
// indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240826202546-f6391c0de4c7 // indirect + google.golang.org/grpc v1.65.0 // indirect + google.golang.org/protobuf v1.35.1 // indirect + gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + k8s.io/apiextensions-apiserver v0.32.1 // indirect + k8s.io/apiserver v0.32.1 // indirect + k8s.io/component-base v0.32.1 // indirect + k8s.io/klog/v2 v2.130.1 // indirect + k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f // indirect + sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 // indirect + sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect + sigs.k8s.io/structured-merge-diff/v4 v4.4.2 // indirect + sigs.k8s.io/yaml v1.4.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..3719e6c --- /dev/null +++ b/go.sum @@ -0,0 +1,247 @@ +cel.dev/expr v0.18.0 h1:CJ6drgk+Hf96lkLikr4rFf19WrU0BOWEihyZnI2TAzo= +cel.dev/expr v0.18.0/go.mod h1:MrpN08Q+lEBs+bGYdLxxHkZoUSsCp0nSKTs0nTymJgw= +github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= +github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= +github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a h1:idn718Q4B6AGu/h5Sxe66HYVdqdGu2l9Iebqhi/AEoA= +github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= +github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= +github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= +github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= 
+github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= +github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k= +github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ= +github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= +github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= +github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= +github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= +github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= +github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= +github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.2 
h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= +github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= +github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= +github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= +github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= +github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= +github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= +github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= +github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= +github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= +github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= +github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= +github.com/google/cel-go v0.22.0 h1:b3FJZxpiv1vTMo2/5RDUqAHPxkT8mmMfJIrq1llbf7g= 
+github.com/google/cel-go v0.22.0/go.mod h1:BuznPXXfQDpXKWQ9sPW3TzlAJN5zzFe+i9tIs0yC4s8= +github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= +github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= +github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= +github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db h1:097atOisP2aRj7vFgYQBbFN4U4JNXUNYpxael3UzMyo= +github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 h1:bkypFPDjIYGfCYD5mRBvpqxfYX1YCS1PXdKYWi8FsN0= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0/go.mod h1:P+Lt/0by1T8bfcF3z737NnSbmxQAppXMRziHUxPOC8k= +github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= +github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/kisielk/errcheck v1.5.0/go.mod 
h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= +github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/onsi/ginkgo/v2 v2.22.0 h1:Yed107/8DjTr0lKCNt7Dn8yQ6ybuDRQoMGrNFKzMfHg= +github.com/onsi/ginkgo/v2 v2.22.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= +github.com/onsi/gomega v1.36.1 h1:bJDPBO7ibjxcbHMgSCoo4Yj18UWbKDlLwX1x9sybDcw= +github.com/onsi/gomega v1.36.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog= 
+github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE= +github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho= +github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= +github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= +github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= +github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= +github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= +github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= +github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= +github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= +github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stoewer/go-strcase v1.3.0 h1:g0eASXYtp+yvN9fK8sH94oCIk0fau9uV1/ZdJ0AVEzs= +github.com/stoewer/go-strcase v1.3.0/go.mod 
h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8wodgtPmh1xo= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= +github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 h1:4K4tsIXefpVJtvA/8srF4V4y0akAoPHkIslgAkjixJA= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0/go.mod h1:jjdQuTGVsXV4vSs+CJ2qYDeDPf9yIJV23qlIzBm73Vg= +go.opentelemetry.io/otel v1.28.0 h1:/SqNcYk+idO0CxKEUOtKQClMK/MimZihKYMruSMViUo= +go.opentelemetry.io/otel v1.28.0/go.mod h1:q68ijF8Fc8CnMHKyzqL6akLO46ePnjkgfIMIjUIX9z4= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 h1:3Q/xZUyC1BBkualc9ROb4G8qkH90LXEIICcs5zv1OYY= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0/go.mod h1:s75jGIWA9OfCMzF0xr+ZgfrB5FEbbV7UuYo32ahUiFI= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0 h1:qFffATk0X+HD+f1Z8lswGiOQYKHRlzfmdJm0wEaVrFA= 
+go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0/go.mod h1:MOiCmryaYtc+V0Ei+Tx9o5S1ZjA7kzLucuVuyzBZloQ= +go.opentelemetry.io/otel/metric v1.28.0 h1:f0HGvSl1KRAU1DLgLGFjrwVyismPlnuU6JD6bOeuA5Q= +go.opentelemetry.io/otel/metric v1.28.0/go.mod h1:Fb1eVBFZmLVTMb6PPohq3TO9IIhUisDsbJoL/+uQW4s= +go.opentelemetry.io/otel/sdk v1.28.0 h1:b9d7hIry8yZsgtbmM0DKyPWMMUMlK9NEKuIG4aBqWyE= +go.opentelemetry.io/otel/sdk v1.28.0/go.mod h1:oYj7ClPUA7Iw3m+r7GeEjz0qckQRJK2B8zjcZEfu7Pg= +go.opentelemetry.io/otel/trace v1.28.0 h1:GhQ9cUuQGmNDd5BTCP2dAvv75RdMxEfTmYejp+lkx9g= +go.opentelemetry.io/otel/trace v1.28.0/go.mod h1:jPyXzNPg6da9+38HEwElrQiHlVMTnVfM3/yv2OlIHaI= +go.opentelemetry.io/proto/otlp v1.3.1 h1:TrMUixzpM0yuc/znrFTP9MMRh8trP93mkCiDVeXrui0= +go.opentelemetry.io/proto/otlp v1.3.1/go.mod h1:0X1WI4de4ZsLrrJNLAQbFeLCm3T7yBkR0XqQ7niQU+8= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= +go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8= +golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod 
h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.30.0 h1:AcW1SDZMkb8IpzCdQUaIq2sP4sZ4zw+55h6ynffypl4= +golang.org/x/net v0.30.0/go.mod h1:2wGyMJ5iFasEhkwi13ChkO/t1ECNC4X4eBKkVFyYFlU= +golang.org/x/oauth2 v0.23.0 h1:PbgcYx2W7i4LvjJWEbf0ngHV6qJYr86PkAV3bXdLEbs= +golang.org/x/oauth2 v0.23.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= +golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.25.0 h1:WtHI/ltw4NvSUig5KARz9h521QvRC8RmF/cuYqifU24= +golang.org/x/term v0.25.0/go.mod h1:RPyXicDX+6vLxogjjRxjgD2TKtmAO6NZBsBRfrOLu7M= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 
+golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= +golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +golang.org/x/time v0.7.0 h1:ntUhktv3OPE6TgYxXWv9vKvUSJyIFJlyohwbkEwPrKQ= +golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.26.0 h1:v/60pFQmzmT9ExmjDv2gGIfi3OqfKoEP6I5+umXlbnQ= +golang.org/x/tools v0.26.0/go.mod h1:TPVVj70c7JJ3WCazhD8OdXcZg/og+b9+tH/KxylGwH0= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= +gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= +google.golang.org/genproto/googleapis/api v0.0.0-20240826202546-f6391c0de4c7 h1:YcyjlL1PRr2Q17/I0dPk2JmYS5CDXfcdb2Z3YRioEbw= +google.golang.org/genproto/googleapis/api v0.0.0-20240826202546-f6391c0de4c7/go.mod h1:OCdP9MfskevB/rbYvHTsXTtKC+3bHWajPdoKgjcYkfo= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240826202546-f6391c0de4c7 h1:2035KHhUv+EpyB+hWgJnaWKJOdX1E95w2S8Rr4uWKTs= 
+google.golang.org/genproto/googleapis/rpc v0.0.0-20240826202546-f6391c0de4c7/go.mod h1:UqMtugtsSgubUsoxbuAoiCXvqvErP7Gf0so0mK9tHxU= +google.golang.org/grpc v1.65.0 h1:bs/cUb4lp1G5iImFFd3u5ixQzweKizoZJAwBNLR42lc= +google.golang.org/grpc v1.65.0/go.mod h1:WgYC2ypjlB0EiQi6wdKixMqukr6lBc0Vo+oOgjrM5ZQ= +google.golang.org/protobuf v1.35.1 h1:m3LfL6/Ca+fqnjnlqQXNpFPABW1UD7mjh8KO2mKFytA= +google.golang.org/protobuf v1.35.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= +gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +k8s.io/api v0.32.1 h1:f562zw9cy+GvXzXf0CKlVQ7yHJVYzLfL6JAS4kOAaOc= +k8s.io/api v0.32.1/go.mod h1:/Yi/BqkuueW1BgpoePYBRdDYfjPF5sgTr5+YqDZra5k= +k8s.io/apiextensions-apiserver v0.32.1 h1:hjkALhRUeCariC8DiVmb5jj0VjIc1N0DREP32+6UXZw= +k8s.io/apiextensions-apiserver v0.32.1/go.mod h1:sxWIGuGiYov7Io1fAS2X06NjMIk5CbRHc2StSmbaQto= +k8s.io/apimachinery v0.32.1 h1:683ENpaCBjma4CYqsmZyhEzrGz6cjn1MY/X2jB2hkZs= +k8s.io/apimachinery v0.32.1/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE= +k8s.io/apiserver v0.32.1 h1:oo0OozRos66WFq87Zc5tclUX2r0mymoVHRq8JmR7Aak= +k8s.io/apiserver v0.32.1/go.mod 
h1:UcB9tWjBY7aryeI5zAgzVJB/6k7E97bkr1RgqDz0jPw= +k8s.io/client-go v0.32.1 h1:otM0AxdhdBIaQh7l1Q0jQpmo7WOFIk5FFa4bg6YMdUU= +k8s.io/client-go v0.32.1/go.mod h1:aTTKZY7MdxUaJ/KiUs8D+GssR9zJZi77ZqtzcGXIiDg= +k8s.io/component-base v0.32.1 h1:/5IfJ0dHIKBWysGV0yKTFfacZ5yNV1sulPh3ilJjRZk= +k8s.io/component-base v0.32.1/go.mod h1:j1iMMHi/sqAHeG5z+O9BFNCF698a1u0186zkjMZQ28w= +k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= +k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= +k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f h1:GA7//TjRY9yWGy1poLzYYJJ4JRdzg3+O6e8I+e+8T5Y= +k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f/go.mod h1:R/HEjbvWI0qdfb8viZUeVZm0X6IZnxAydC7YU42CMw4= +k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 h1:M3sRQVHv7vB20Xc2ybTt7ODCeFj6JSWYFzOFnYeS6Ro= +k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 h1:CPT0ExVicCzcpeN4baWEV2ko2Z/AsiZgEdwgcfwLgMo= +sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= +sigs.k8s.io/controller-runtime v0.20.4 h1:X3c+Odnxz+iPTRobG4tp092+CvBU9UK0t/bRf+n0DGU= +sigs.k8s.io/controller-runtime v0.20.4/go.mod h1:xg2XB0K5ShQzAgsoujxuKN4LNXR2LfwwHsPj7Iaw+XY= +sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8= +sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo= +sigs.k8s.io/structured-merge-diff/v4 v4.4.2 h1:MdmvkGuXi/8io6ixD5wud3vOLwc1rj0aNqRlpuvjmwA= +sigs.k8s.io/structured-merge-diff/v4 v4.4.2/go.mod h1:N8f93tFZh9U6vpxwRArLiikrE5/2tiu1w1AGfACIGE4= +sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= +sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= diff --git a/hack/ai-friendliness-audit.md b/hack/ai-friendliness-audit.md new file mode 
100644 index 0000000..e2c2feb --- /dev/null +++ b/hack/ai-friendliness-audit.md @@ -0,0 +1,104 @@ +# AI-Friendliness Audit — Documentation Site Ranking + +## Ranking System (0–5 per dimension, max score 50) + +| # | Dimension | Weight | What it measures | +|---|-----------|--------|------------------| +| 1 | **Discoverability** | × 1 | Can an agent find and understand what this site offers within one request? (llms.txt, meta tags, link alternate) | +| 2 | **Machine-Readable Output** | × 1 | Are pages available in clean Markdown/plain text without HTML noise? | +| 3 | **Structured Data** | × 1 | Tables, consistent headings, predictable field schemas — can an agent parse reliably? | +| 4 | **Context Density** | × 1 | Information-to-noise ratio. Are pages concise with minimal boilerplate/decorative text? | +| 5 | **Navigation Clarity** | × 1 | Flat hierarchy, descriptive page names, logical grouping — can an agent orient itself? | +| 6 | **Completeness** | × 1 | Does the documentation cover all CRDs, fields, status, errors, metrics? | +| 7 | **Actionability** | × 1 | Examples, commands, copy-pasteable YAML — can an agent generate correct manifests? | +| 8 | **Self-Description** | × 1 | Does the site explain its own structure to agents? (llmsDescription, frontmatter, README) | +| 9 | **Freshness Signals** | × 1 | Last-updated dates, git info, generation timestamps — can an agent assess staleness? | +| 10 | **Integration Surface** | × 1 | Can agents open this content directly in ChatGPT/Claude? Context menu links, URL patterns? 
| + +### Scoring Guide + +- **5** — Best-in-class, nothing missing +- **4** — Solid, minor gaps +- **3** — Functional but has clear room for improvement +- **2** — Present but barely usable by an agent +- **1** — Technically exists, practically useless +- **0** — Absent + +--- + +## Audit of `http://localhost:1314/drop/` (2026-05-24) + +| # | Dimension | Score | Notes | +|---|-----------|-------|-------| +| 1 | Discoverability | **5** | `/llms.txt` at site root with all page links + descriptions. `` in HTML head. Homepage `llmsDescription` frontmatter explains the project in plain text. | +| 2 | Machine-Readable Output | **5** | Every page available at `{url}index.md` as clean Markdown — no frontmatter leakage, no HTML. Hugo output format configured correctly. | +| 3 | Structured Data | **5** | CRD reference uses consistent tables (Field/Type/Required/Default/Description). Metrics table. Architecture has relationship graph. Predictable patterns across all reference pages. | +| 4 | Context Density | **4** | Pages are concise. Homepage hero is slightly wordy ("Declarative image pre-caching for Kubernetes" + subtitle both exist). Reference pages are excellent — zero fluff. Minor: docs landing page could collapse Quick Start into Getting Started. | +| 5 | Navigation Clarity | **4** | Flat hierarchy: docs/ → 4 pages + reference/ (4 generated pages). Logical grouping. Minor: `_generated_` prefix in URLs is ugly but functional. Section index at `/docs/reference/` exists. | +| 6 | Completeness | **5** | All 4 CRDs documented with every field. Status conditions, error reasons, metrics all covered. Architecture shows relationships. Discovery sources documented. | +| 7 | Actionability | **4** | Getting Started has helm install command. Missing: sample CachedImage YAML in docs (exists in `config/samples/` but not linked from docs). No "copy this manifest" examples on CRD reference page. | +| 8 | Self-Description | **5** | `llmsDescription` on every page. 
Homepage describes the project scope. llms.txt has one-line summaries. Agent instructions in repo root (AGENTS.md, .github/copilot-instructions.md). | +| 9 | Freshness Signals | **5** | `enableGitInfo: true` + `displayUpdatedDate: true` shows "Last updated on May 22, 2026" on every page. llms.txt has generation timestamp. | +| 10 | Integration Surface | **4** | Context menu has "Open in ChatGPT" and "Open in Claude" with `{markdown_url}` interpolation. Missing: no `/llms-full.txt` endpoint on the Hugo site (only repo-root). Agents must discover markdown URLs via llms.txt → follow link → get content. | + +### **Total: 46 / 50** + +--- + +## Recommendations (to reach 50/50) + +1. **Context Density → 5**: Remove redundant subtitle on homepage OR merge docs landing page Quick Start into Getting Started page. +2. **Navigation Clarity → 5**: Consider aliasing `_generated_crds` → `crds` (Hugo aliases in frontmatter). +3. **Actionability → 5**: Add a "Quick Example" code block on the CRD Reference page with a minimal CachedImage manifest. +4. **Integration Surface → 5**: Serve `llms-full.txt` as a Hugo static file (or generate it into `docs/static/`) so agents can get everything in one request. + +--- + +## Audit Prompt + +Use the following prompt to evaluate any documentation site for AI-friendliness: + +``` +You are an AI documentation agent evaluating a website for machine consumption. + +Perform the following checks and score each dimension 0–5: + +1. DISCOVERABILITY: Fetch the site root. Is there a /llms.txt or /llms-full.txt? + Check HTML for . + Check for meta descriptions or structured frontmatter. + +2. MACHINE-READABLE OUTPUT: Can you fetch any page as plain Markdown by appending + .md or /index.md to the URL? Is the output clean (no HTML, no frontmatter)? + +3. STRUCTURED DATA: Are reference pages using consistent tables or schemas? + Can you reliably extract field names, types, and descriptions programmatically? + +4. 
CONTEXT DENSITY: What is the information-to-noise ratio? Count decorative text, + repeated navigation, boilerplate vs. actual technical content. + +5. NAVIGATION CLARITY: How many clicks/requests to reach any piece of information? + Is the hierarchy flat? Are page names descriptive? + +6. COMPLETENESS: Does the documentation cover all APIs, fields, status, errors? + Are there undocumented features visible in the codebase but missing from docs? + +7. ACTIONABILITY: Are there copy-pasteable examples? Can you generate a valid + manifest/config from the docs alone without looking at source code? + +8. SELF-DESCRIPTION: Does the site explain its own structure? Is there an index + page that lists all content with summaries? Does frontmatter describe pages? + +9. FRESHNESS SIGNALS: Are there timestamps, git commit info, or generation dates? + Can you determine if the docs are current? + +10. INTEGRATION SURFACE: Can you open this content directly in an AI assistant? + Are there deep links with pre-filled prompts? Can you get all content in one + request (llms-full.txt)? + +For each dimension, output: +- Score (0–5) +- Evidence (specific URLs, content snippets) +- Recommendation (if score < 5) + +Final output: Total score /50, letter grade (A: 45-50, B: 38-44, C: 30-37, D: <30) +``` diff --git a/hack/boilerplate.go.txt b/hack/boilerplate.go.txt new file mode 100644 index 0000000..9786798 --- /dev/null +++ b/hack/boilerplate.go.txt @@ -0,0 +1,15 @@ +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ \ No newline at end of file diff --git a/hack/demo.sh b/hack/demo.sh new file mode 100755 index 0000000..0788351 --- /dev/null +++ b/hack/demo.sh @@ -0,0 +1,146 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Puller Operator Demo Script +# This script demonstrates the operator's end-to-end functionality using a kind cluster. +# Prerequisites: kind, kubectl, helm, docker + +BOLD='\033[1m' +GREEN='\033[0;32m' +BLUE='\033[0;34m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +log() { echo -e "${BLUE}[demo]${NC} $*"; } +success() { echo -e "${GREEN}[✓]${NC} $*"; } +section() { echo -e "\n${BOLD}${YELLOW}=== $* ===${NC}\n"; } + +CLUSTER_NAME="drop-demo" +IMG="controller:demo" +NAMESPACE="drop-system" + +cleanup() { + log "Cleaning up..." + kind delete cluster --name "$CLUSTER_NAME" 2>/dev/null || true +} + +trap cleanup EXIT + +section "1. Create Kind Cluster" +if kind get clusters 2>/dev/null | grep -q "$CLUSTER_NAME"; then + log "Cluster $CLUSTER_NAME already exists, reusing." +else + kind create cluster --name "$CLUSTER_NAME" --wait 60s +fi +success "Kind cluster ready" + +section "2. Build and Load Operator Image" +docker build -t "$IMG" . +kind load docker-image "$IMG" --name "$CLUSTER_NAME" +success "Operator image loaded into kind" + +section "3. Install CRDs" +make manifests +kubectl apply -f config/crd/bases/ +success "CRDs installed" + +section "4. Deploy Operator via Helm" +helm upgrade --install drop charts/drop \ + --namespace "$NAMESPACE" \ + --create-namespace \ + --set image.repository=controller \ + --set image.tag=demo \ + --set image.pullPolicy=Never \ + --set leaderElection.enabled=false \ + --set metrics.enabled=true \ + --set metrics.secureServing=false \ + --wait --timeout 60s +success "Operator deployed" + +kubectl -n "$NAMESPACE" get pods +echo "" + +section "5. 
Create a PullPolicy (conservative pacing)" +cat </dev/null || echo "Pending") + if [ "$phase" = "Ready" ]; then + success "Image cached successfully!" + break + fi + echo " Status: $phase (attempt $i/30)" + sleep 2 +done + +section "8. Check Events" +kubectl get events --field-selector involvedObject.name=demo-nginx --sort-by='.lastTimestamp' 2>/dev/null || log "No events yet" + +section "9. Check Final Status" +kubectl get cachedimage demo-nginx -o yaml | grep -A20 "^status:" + +section "10. Create a CachedImageSet" +cat </dev/null | grep "^drop_" || curl -s http://localhost:8080/metrics 2>/dev/null | grep "^drop_" || log "Could not reach metrics endpoint" +kill $PF_PID 2>/dev/null || true + +section "Demo Complete!" +echo "" +echo "Resources created:" +kubectl get cachedimages +echo "" +kubectl get pullpolicies +echo "" +log "Run 'kind delete cluster --name $CLUSTER_NAME' to clean up." diff --git a/hack/dev-samples.yaml b/hack/dev-samples.yaml new file mode 100644 index 0000000..e4508c1 --- /dev/null +++ b/hack/dev-samples.yaml @@ -0,0 +1,147 @@ +# Dev samples: deployed by Tilt for interactive testing +--- +# === PullPolicy === +apiVersion: drop.corewire.io/v1alpha1 +kind: PullPolicy +metadata: + name: dev-conservative +spec: + maxConcurrentNodes: 1 + minDelayBetweenPulls: 5s + repullInterval: 1h + failureBackoff: + initial: 30s + max: 5m +--- +# === CachedImage: healthy === +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: dev-nginx +spec: + image: docker.io/library/nginx + tag: "1.25-alpine" + policyRef: + name: dev-conservative +--- +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: dev-redis +spec: + image: docker.io/library/redis + tag: "7-alpine" + policyRef: + name: dev-conservative +--- +# === CachedImage: broken (DNS failure → ImagePullBackOff) === +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: test-invalid-image +spec: + image: registry.invalid.local:9999/does-not-exist + 
tag: "nope" + policyRef: + name: dev-conservative +--- +# === CachedImageSet: healthy (static images) === +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImageSet +metadata: + name: dev-set +spec: + policyRef: + name: dev-conservative + images: + - image: docker.io/library/alpine + tag: "3.19" + - image: docker.io/library/busybox + tag: "1.36" +--- +# === CachedImageSet: dynamic (backed by DiscoveryPolicy) === +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImageSet +metadata: + name: dev-set-discovered +spec: + policyRef: + name: dev-conservative + discoveryPolicyRef: + name: dev-registry +--- +# === DiscoveryPolicy: healthy (Prometheus range query) === +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: dev-prometheus +spec: + sources: + - type: prometheus + prometheus: + endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090" + query: 'count(container_memory_working_set_bytes{container!="", namespace="build-stuff"}) by (image)' + lookback: 24h + step: 5m + syncInterval: 30s + maxImages: 10 +--- +# === DiscoveryPolicy: healthy (registry tag listing) === +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: dev-registry +spec: + sources: + - type: registry + registry: + url: "http://registry.e2e-infra.svc.cluster.local:5000" + repositories: + - "test/myapp" + topX: 3 + syncInterval: 30s + maxImages: 10 +--- +# === DiscoveryPolicy: broken (DNS error → DNSError) === +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: test-broken-prom +spec: + sources: + - type: prometheus + prometheus: + endpoint: "http://nonexistent-prometheus:9090" + query: "up{}" + syncInterval: 30m + maxImages: 10 +--- +# === DiscoveryPolicy: broken (DNS error → DNSError) === +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: test-broken-registry +spec: + sources: + - type: registry + registry: + url: "http://nonexistent-registry:5000" + repositories: + - "test/nope" 
+ syncInterval: 30m + maxImages: 10 +--- +# === DiscoveryPolicy: broken (repo doesn't exist → NotFound) === +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: test-notfound-repo +spec: + sources: + - type: registry + registry: + url: "http://registry.e2e-infra.svc.cluster.local:5000" + repositories: + - "this/does-not-exist" + syncInterval: 30m + maxImages: 10 diff --git a/hack/e2e-infra/grafana.yaml b/hack/e2e-infra/grafana.yaml new file mode 100644 index 0000000..a507731 --- /dev/null +++ b/hack/e2e-infra/grafana.yaml @@ -0,0 +1,101 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: grafana + namespace: e2e-infra + labels: + app: grafana +spec: + replicas: 1 + selector: + matchLabels: + app: grafana + template: + metadata: + labels: + app: grafana + spec: + containers: + - name: grafana + image: grafana/grafana:11.1.0 + ports: + - containerPort: 3000 + env: + - name: GF_AUTH_ANONYMOUS_ENABLED + value: "true" + - name: GF_AUTH_ANONYMOUS_ORG_ROLE + value: "Admin" + - name: GF_AUTH_DISABLE_LOGIN_FORM + value: "true" + volumeMounts: + - name: datasources + mountPath: /etc/grafana/provisioning/datasources + - name: dashboards-config + mountPath: /etc/grafana/provisioning/dashboards + - name: dashboards + mountPath: /var/lib/grafana/dashboards + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + memory: 256Mi + volumes: + - name: datasources + configMap: + name: grafana-datasources + - name: dashboards-config + configMap: + name: grafana-dashboards-config + - name: dashboards + configMap: + name: grafana-dashboards +--- +apiVersion: v1 +kind: Service +metadata: + name: grafana + namespace: e2e-infra + labels: + app: grafana +spec: + selector: + app: grafana + ports: + - port: 3000 + targetPort: 3000 +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-datasources + namespace: e2e-infra +data: + datasources.yaml: | + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + access: proxy + url: 
http://prometheus.e2e-infra.svc.cluster.local:9090 + isDefault: true + editable: true +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboards-config + namespace: e2e-infra +data: + dashboards.yaml: | + apiVersion: 1 + providers: + - name: default + orgId: 1 + folder: '' + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false diff --git a/hack/e2e-infra/prometheus-config.yaml b/hack/e2e-infra/prometheus-config.yaml new file mode 100644 index 0000000..86d2153 --- /dev/null +++ b/hack/e2e-infra/prometheus-config.yaml @@ -0,0 +1,61 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-config + namespace: e2e-infra +data: + prometheus.yml: | + global: + scrape_interval: 15s + evaluation_interval: 15s + + scrape_configs: + - job_name: drop-operator + metrics_path: /metrics + scheme: http + static_configs: + - targets: ['drop-metrics.drop-system.svc.cluster.local:8443'] + + rule_files: + - /etc/prometheus/rules/*.yml + # Recording rules that produce metrics with image labels (simulates real cluster data) + seed-rules.yml: | + groups: + - name: seed_image_metrics + interval: 10s + rules: + - record: container_memory_working_set_bytes + labels: + image: "docker.io/library/nginx:1.25-alpine" + container: "nginx" + namespace: "default" + pod: "runner-abc123" + expr: "104857600" + - record: container_memory_working_set_bytes + labels: + image: "docker.io/library/redis:7-alpine" + container: "redis" + namespace: "default" + pod: "runner-def456" + expr: "52428800" + - record: container_memory_working_set_bytes + labels: + image: "docker.io/library/alpine:3.19" + container: "worker" + namespace: "build-stuff" + pod: "runner-ghi789" + expr: "26214400" + - record: container_memory_working_set_bytes + labels: + image: "docker.io/library/busybox:1.36" + container: "init" + namespace: "build-stuff" + pod: "runner-jkl012" + expr: "10485760" + - record: 
container_memory_working_set_bytes + labels: + image: "registry.e2e-infra.svc.cluster.local:5000/test/myapp:v1" + container: "app" + namespace: "production" + pod: "myapp-xyz" + expr: "209715200" diff --git a/hack/e2e-infra/prometheus.yaml b/hack/e2e-infra/prometheus.yaml new file mode 100644 index 0000000..9e5babe --- /dev/null +++ b/hack/e2e-infra/prometheus.yaml @@ -0,0 +1,63 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus + namespace: e2e-infra + labels: + app: prometheus +spec: + replicas: 1 + selector: + matchLabels: + app: prometheus + template: + metadata: + labels: + app: prometheus + spec: + containers: + - name: prometheus + image: prom/prometheus:v2.53.0 + args: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.path=/prometheus" + - "--storage.tsdb.retention.time=1h" + - "--web.enable-lifecycle" + ports: + - containerPort: 9090 + volumeMounts: + - name: config + mountPath: /etc/prometheus/prometheus.yml + subPath: prometheus.yml + - name: config + mountPath: /etc/prometheus/rules/seed-rules.yml + subPath: seed-rules.yml + - name: data + mountPath: /prometheus + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + memory: 256Mi + volumes: + - name: config + configMap: + name: prometheus-config + - name: data + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + name: prometheus + namespace: e2e-infra + labels: + app: prometheus +spec: + selector: + app: prometheus + ports: + - port: 9090 + targetPort: 9090 + protocol: TCP diff --git a/hack/e2e-infra/registry.yaml b/hack/e2e-infra/registry.yaml new file mode 100644 index 0000000..6119a6e --- /dev/null +++ b/hack/e2e-infra/registry.yaml @@ -0,0 +1,52 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: registry + namespace: e2e-infra + labels: + app: registry +spec: + replicas: 1 + selector: + matchLabels: + app: registry + template: + metadata: + labels: + app: registry + spec: + containers: + - name: registry + image: registry:2 + 
ports: + - containerPort: 5000 + env: + - name: REGISTRY_STORAGE_DELETE_ENABLED + value: "true" + volumeMounts: + - name: data + mountPath: /var/lib/registry + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + memory: 128Mi + volumes: + - name: data + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + name: registry + namespace: e2e-infra + labels: + app: registry +spec: + selector: + app: registry + ports: + - port: 5000 + targetPort: 5000 + protocol: TCP diff --git a/hack/e2e-infra/seed-metrics-job.yaml b/hack/e2e-infra/seed-metrics-job.yaml new file mode 100644 index 0000000..5b3c2a9 --- /dev/null +++ b/hack/e2e-infra/seed-metrics-job.yaml @@ -0,0 +1,39 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: seed-metrics +spec: + backoffLimit: 2 + template: + spec: + restartPolicy: Never + containers: + - name: seed + image: docker.io/library/busybox:1.36 + command: + - /bin/sh + - -c + - | + # Wait for Prometheus to be ready + echo "Waiting for Prometheus..." + for i in $(seq 1 30); do + if wget -q -O /dev/null "http://prometheus.e2e-infra.svc.cluster.local:9090/-/ready" 2>/dev/null; then + echo "Prometheus is ready" + break + fi + sleep 2 + done + + # Verify recording rules are producing metrics + echo "Waiting for seed metrics to be generated by recording rules..." + for i in $(seq 1 30); do + RESULT=$(wget -q -O - "http://prometheus.e2e-infra.svc.cluster.local:9090/api/v1/query?query=container_memory_working_set_bytes" 2>/dev/null || echo "") + if echo "$RESULT" | grep -q "nginx"; then + echo "Seed metrics are available!" 
+ echo "$RESULT" | head -c 500 + exit 0 + fi + sleep 2 + done + echo "WARNING: Metrics may not be ready yet (rules take a few eval cycles)" + exit 0 diff --git a/hack/e2e-infra/seed-registry-job.yaml b/hack/e2e-infra/seed-registry-job.yaml new file mode 100644 index 0000000..a833e50 --- /dev/null +++ b/hack/e2e-infra/seed-registry-job.yaml @@ -0,0 +1,64 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: seed-registry + namespace: e2e-infra +spec: + backoffLimit: 2 + template: + spec: + restartPolicy: Never + containers: + - name: seed + image: ghcr.io/regclient/regctl:v0.7.1-alpine + command: + - /bin/sh + - -c + - | + REGISTRY="registry.e2e-infra.svc.cluster.local:5000" + + # Configure regctl for insecure local registry + regctl registry set "$REGISTRY" --tls disabled + + # Wait for registry to be ready + echo "Waiting for registry..." + for i in $(seq 1 30); do + if regctl tag ls "${REGISTRY}/v2" 2>/dev/null || wget -qO- "http://${REGISTRY}/v2/" >/dev/null 2>&1; then + echo "Registry is ready" + break + fi + sleep 2 + done + + # Copy a single small image from Docker Hub, then retag locally + echo "Pulling base image..." + regctl image copy docker.io/library/alpine:3.19 "${REGISTRY}/test/myapp:v1" + + # Now retag within the local registry (no Docker Hub pulls needed) + echo "Retagging within local registry..." + TAGS=" + test/myapp:v1|test/myapp:v2 + test/myapp:v1|test/myapp:v3 + test/myapp:v1|test/worker:v1 + test/myapp:v1|test/worker:v2 + test/myapp:v1|test/worker:v3 + test/myapp:v1|test/tools:v1 + test/myapp:v1|test/tools:v2 + test/myapp:v1|test/tools:v3 + " + + for ENTRY in $TAGS; do + SRC=$(echo "$ENTRY" | cut -d'|' -f1) + DST=$(echo "$ENTRY" | cut -d'|' -f2) + echo " ${REGISTRY}/${SRC} -> ${REGISTRY}/${DST}" + regctl image copy "${REGISTRY}/${SRC}" "${REGISTRY}/${DST}" || echo " FAILED" + done + + echo "" + echo "Verifying tags..." 
+ for REPO in "test/myapp" "test/worker" "test/tools"; do + TAGS=$(regctl tag ls "${REGISTRY}/${REPO}" 2>/dev/null || echo "FAILED") + echo " ${REPO}: ${TAGS}" + done + + echo "Registry seeding complete." diff --git a/hack/e2e-infra/setup.sh b/hack/e2e-infra/setup.sh new file mode 100755 index 0000000..ecbbf42 --- /dev/null +++ b/hack/e2e-infra/setup.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Deploy local Prometheus and Registry into the current kind cluster for E2E tests. +# Prometheus is seeded with container_memory_working_set_bytes metrics containing image labels. + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +NAMESPACE="e2e-infra" + +echo "[e2e-infra] Creating namespace $NAMESPACE..." +kubectl create namespace "$NAMESPACE" --dry-run=client -o yaml | kubectl apply -f - + +# --- Deploy local OCI Registry (distribution/distribution) --- +echo "[e2e-infra] Deploying local registry..." +kubectl apply -n "$NAMESPACE" -f "$SCRIPT_DIR/registry.yaml" + +# --- Deploy Prometheus with pre-loaded metrics --- +echo "[e2e-infra] Deploying Prometheus with seed data..." +kubectl apply -n "$NAMESPACE" -f "$SCRIPT_DIR/prometheus-config.yaml" +kubectl apply -n "$NAMESPACE" -f "$SCRIPT_DIR/prometheus.yaml" + +# --- Wait for readiness --- +echo "[e2e-infra] Waiting for registry to be ready..." +kubectl -n "$NAMESPACE" wait --for=condition=available deployment/registry --timeout=90s + +# --- Configure Kind nodes to reach the in-cluster registry --- +# Kubelet/containerd on Kind nodes can't resolve cluster DNS, so we point them +# at the registry's ClusterIP via containerd mirror config. +REGISTRY_IP=$(kubectl -n "$NAMESPACE" get svc registry -o jsonpath='{.spec.clusterIP}') +REGISTRY_HOST="registry.e2e-infra.svc.cluster.local:5000" +echo "[e2e-infra] Configuring containerd mirror on Kind nodes for $REGISTRY_HOST -> $REGISTRY_IP..." 
+ +for node in $(kind get nodes --name drop-dev 2>/dev/null || kubectl get nodes -o jsonpath='{.items[*].metadata.name}'); do + docker exec "$node" mkdir -p "/etc/containerd/certs.d/$REGISTRY_HOST" + cat < /dev/null +[host."http://$REGISTRY_IP:5000"] + capabilities = ["pull", "resolve"] + skip_verify = true +EOF +done +echo "[e2e-infra] Containerd mirror configured on all nodes." + +echo "[e2e-infra] Waiting for Prometheus to be ready..." +kubectl -n "$NAMESPACE" wait --for=condition=available deployment/prometheus --timeout=90s + +# --- Seed the registry with a few images --- +echo "[e2e-infra] Seeding registry with test images..." +REGISTRY_POD=$(kubectl -n "$NAMESPACE" get pods -l app=registry -o jsonpath='{.items[0].metadata.name}') +REGISTRY_SVC="registry.$NAMESPACE.svc.cluster.local:5000" + +# Push images into the in-cluster registry by running a job +kubectl apply -n "$NAMESPACE" -f "$SCRIPT_DIR/seed-registry-job.yaml" +kubectl -n "$NAMESPACE" wait --for=condition=complete job/seed-registry --timeout=120s 2>/dev/null || true + +# --- Seed Prometheus with metrics via remote write --- +echo "[e2e-infra] Seeding Prometheus with image metrics..." +kubectl apply -n "$NAMESPACE" -f "$SCRIPT_DIR/seed-metrics-job.yaml" +kubectl -n "$NAMESPACE" wait --for=condition=complete job/seed-metrics --timeout=60s 2>/dev/null || true + +echo "[e2e-infra] Infrastructure ready." +echo " Prometheus: http://prometheus.$NAMESPACE.svc.cluster.local:9090" +echo " Registry: http://registry.$NAMESPACE.svc.cluster.local:5000" diff --git a/hack/gen-ai-docs/main.go b/hack/gen-ai-docs/main.go new file mode 100644 index 0000000..1001955 --- /dev/null +++ b/hack/gen-ai-docs/main.go @@ -0,0 +1,662 @@ +// hack/gen-ai-docs generates all documentation from source code. +// +// It parses api/v1alpha1/*_types.go, internal/controller/*.go, internal/metrics/, +// Makefile, and go.mod to build a unified knowledge model. 
// ─── Knowledge Model ─────────────────────────────────────────────────────────

// The model below is what every generated document is rendered from. All
// types carry yaml tags because the model is also written out as an
// intermediate knowledge file.
type (
	// Knowledge is the unified intermediate representation of the project.
	Knowledge struct {
		Project       Project       `yaml:"project"`
		CRDs          []CRD         `yaml:"crds"`
		HelperTypes   []TypeDef     `yaml:"helperTypes"`
		Relationships []Relation    `yaml:"relationships"`
		Packages      []Package     `yaml:"packages"`
		Conventions   []Convention  `yaml:"conventions"`
		Errors        []ErrorReason `yaml:"errors"`
		Metrics       []Metric      `yaml:"metrics"`
		MakeTargets   []MakeTarget  `yaml:"makeTargets"`
		Samples       string        `yaml:"samples"`
	}

	// Project holds the project-wide facts: name, description, API group,
	// Go version, module path and license.
	Project struct {
		Name        string `yaml:"name"`
		Description string `yaml:"description"`
		APIGroup    string `yaml:"apiGroup"`
		GoVersion   string `yaml:"goVersion"`
		Module      string `yaml:"module"`
		License     string `yaml:"license"`
	}

	// CRD describes one custom resource kind: its doc text, scope, the
	// controller and test file paths (omitted from YAML when empty), the
	// fields of its spec and status structs, and its kubebuilder markers.
	CRD struct {
		Kind         string   `yaml:"kind"`
		Doc          string   `yaml:"doc"`
		Scope        string   `yaml:"scope"`
		Controller   string   `yaml:"controller,omitempty"`
		TestFile     string   `yaml:"testFile,omitempty"`
		SpecFields   []Field  `yaml:"specFields,omitempty"`
		StatusFields []Field  `yaml:"statusFields,omitempty"`
		Markers      []string `yaml:"markers,omitempty"`
	}

	// TypeDef describes a helper struct that is not itself a CRD root type.
	TypeDef struct {
		Name   string  `yaml:"name"`
		Doc    string  `yaml:"doc"`
		Fields []Field `yaml:"fields"`
	}

	// Field describes a single struct field: its Go name, JSON name, Go type
	// rendered as a string, whether it is required, and the default and enum
	// values when the field declares them.
	Field struct {
		Name     string   `yaml:"name"`
		JSON     string   `yaml:"json"`
		Type     string   `yaml:"type"`
		Required bool     `yaml:"required"`
		Default  string   `yaml:"default,omitempty"`
		Enum     []string `yaml:"enum,omitempty"`
		Doc      string   `yaml:"doc"`
	}

	// Relation is a directed edge between two resources, with the kind of
	// relationship and, optionally, the mechanism that implements it.
	Relation struct {
		From      string `yaml:"from"`
		To        string `yaml:"to"`
		Type      string `yaml:"type"`
		Mechanism string `yaml:"mechanism,omitempty"`
	}

	// Package describes a Go package: its path, its role, and its imports.
	Package struct {
		Path    string   `yaml:"path"`
		Role    string   `yaml:"role"`
		Imports []string `yaml:"imports,omitempty"`
	}

	// Convention is a project rule together with the scopes it applies to.
	Convention struct {
		Rule  string   `yaml:"rule"`
		Scope []string `yaml:"scope"`
	}

	// ErrorReason documents one error reason: the controller it belongs to,
	// what it means, and optional troubleshooting advice.
	ErrorReason struct {
		Reason          string `yaml:"reason"`
		Controller      string `yaml:"controller"`
		Meaning         string `yaml:"meaning"`
		Troubleshooting string `yaml:"troubleshooting,omitempty"`
	}

	// Metric documents one metric by name, help text and type.
	Metric struct {
		Name string `yaml:"name"`
		Help string `yaml:"help"`
		Type string `yaml:"type"`
	}

	// MakeTarget documents one Makefile target and its description.
	MakeTarget struct {
		Name string `yaml:"name"`
		Desc string `yaml:"desc"`
	}
)
// findRepoRoot returns the nearest directory that contains a go.mod file,
// starting at the current working directory and walking up through its
// parents.
//
// Both failure modes are fatal, because the generator resolves every input
// and output path against this root: if the working directory cannot be
// determined, or the filesystem root is reached without finding a go.mod, an
// error is printed to stderr and the process exits with status 1.
func findRepoRoot() string {
	dir, err := os.Getwd()
	if err != nil {
		// Report the real cause instead of walking up from an empty path and
		// ending with a misleading "no go.mod" message.
		fmt.Fprintf(os.Stderr, "error: cannot determine working directory: %v\n", err)
		os.Exit(1)
	}
	for {
		if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil {
			return dir
		}
		parent := filepath.Dir(dir)
		if parent == dir {
			// filepath.Dir no longer changes the path: the top has been
			// reached and there is nothing left to search.
			fmt.Fprintln(os.Stderr, "error: cannot find repo root (no go.mod)")
			os.Exit(1)
		}
		dir = parent
	}
}
[]string{"code"}}, + {Rule: "Pod builder is a pure function in internal/podbuilder/ (no k8s client)", Scope: []string{"code"}}, + {Rule: "Pacing logic lives exclusively in internal/pacing/", Scope: []string{"code"}}, + {Rule: "ownerReferences: CachedImageSet→CachedImage, controller→Pod", Scope: []string{"code"}}, + {Rule: "Table-driven tests preferred; envtest for controllers", Scope: []string{"code"}}, + {Rule: "Pods use nodeName placement + command: [\"true\"]", Scope: []string{"code", "use"}}, + {Rule: "Don't manually edit generated files — run make docs-gen", Scope: []string{"code"}}, + } + + return k +} + +// ─── Type Parser ───────────────────────────────────────────────────────────── + +func parseAllTypes(dir string) ([]CRD, []TypeDef) { + fset := token.NewFileSet() + pkgs, err := parser.ParseDir(fset, dir, func(fi os.FileInfo) bool { + return strings.HasSuffix(fi.Name(), "_types.go") + }, parser.ParseComments) + if err != nil { + fmt.Fprintf(os.Stderr, "error parsing types: %v\n", err) + os.Exit(1) + } + + type typeInfo struct { + name string + doc string + markers []string + fields []Field + } + allTypes := map[string]*typeInfo{} + + for _, pkg := range pkgs { + for _, file := range pkg.Files { + for _, decl := range file.Decls { + gd, ok := decl.(*ast.GenDecl) + if !ok || gd.Tok != token.TYPE { + continue + } + for _, spec := range gd.Specs { + ts := spec.(*ast.TypeSpec) + st, ok := ts.Type.(*ast.StructType) + if !ok { + continue + } + name := ts.Name.Name + doc := "" + if gd.Doc != nil { + doc = cleanDoc(gd.Doc.Text()) + } + allTypes[name] = &typeInfo{ + name: name, + doc: doc, + markers: extractMarkers(gd.Doc), + fields: parseFields(st), + } + } + } + } + } + + rootCRDs := []string{"CachedImage", "CachedImageSet", "PullPolicy", "DiscoveryPolicy"} + controllerMap := map[string]string{ + "CachedImage": "internal/controller/cachedimage_controller.go", + "CachedImageSet": "internal/controller/cachedimageset_controller.go", + "DiscoveryPolicy": 
"internal/controller/discoverypolicy_controller.go", + } + + crds := make([]CRD, 0, len(rootCRDs)) + for _, kind := range rootCRDs { + root, ok := allTypes[kind] + if !ok { + continue + } + crd := CRD{ + Kind: kind, + Doc: root.doc, + Scope: "Cluster", + Markers: root.markers, + } + if c, ok := controllerMap[kind]; ok { + crd.Controller = c + crd.TestFile = strings.TrimSuffix(c, ".go") + "_test.go" + } + if spec, ok := allTypes[kind+"Spec"]; ok { + crd.SpecFields = spec.fields + } + if status, ok := allTypes[kind+"Status"]; ok { + crd.StatusFields = status.fields + } + crds = append(crds, crd) + } + + helperNames := []string{ + "PolicyReference", "DiscoveryPolicyReference", "ImageEntry", + "BackoffConfig", "DiscoverySource", "PrometheusSource", + "RegistrySource", "DiscoveredImage", + } + var helpers []TypeDef + for _, name := range helperNames { + if t, ok := allTypes[name]; ok { + helpers = append(helpers, TypeDef{Name: t.name, Doc: t.doc, Fields: t.fields}) + } + } + + return crds, helpers +} + +func parseFields(st *ast.StructType) []Field { + fields := make([]Field, 0, len(st.Fields.List)) + for _, f := range st.Fields.List { + if len(f.Names) == 0 { + continue + } + name := f.Names[0].Name + if !ast.IsExported(name) { + continue + } + + jsonTag := "" + required := true + if f.Tag != nil { + tag := f.Tag.Value + if idx := strings.Index(tag, `json:"`); idx >= 0 { + rest := tag[idx+6:] + end := strings.Index(rest, `"`) + jsonTag = rest[:end] + if strings.Contains(jsonTag, "omitempty") { + required = false + } + jsonTag = strings.Split(jsonTag, ",")[0] + } + } + + doc := "" + if f.Doc != nil { + doc = cleanDoc(f.Doc.Text()) + } else if f.Comment != nil { + doc = cleanDoc(f.Comment.Text()) + } + + fields = append(fields, Field{ + Name: name, + JSON: jsonTag, + Type: typeString(f.Type), + Doc: doc, + Required: required, + Default: extractDefault(f.Doc), + Enum: extractEnum(f.Doc), + }) + } + return fields +} + +func typeString(expr ast.Expr) string { + switch t := 
expr.(type) { + case *ast.Ident: + return t.Name + case *ast.SelectorExpr: + return typeString(t.X) + "." + t.Sel.Name + case *ast.StarExpr: + return "*" + typeString(t.X) + case *ast.ArrayType: + return "[]" + typeString(t.Elt) + case *ast.MapType: + return "map[" + typeString(t.Key) + "]" + typeString(t.Value) + default: + return "unknown" + } +} + +func extractMarkers(doc *ast.CommentGroup) []string { + if doc == nil { + return nil + } + var markers []string + for _, c := range doc.List { + text := strings.TrimPrefix(c.Text, "//") + text = strings.TrimSpace(text) + if strings.HasPrefix(text, "+kubebuilder:") { + markers = append(markers, text) + } + } + return markers +} + +var defaultRe = regexp.MustCompile(`\+kubebuilder:default=(.+)`) +var enumRe = regexp.MustCompile(`\+kubebuilder:validation:Enum=(.+)`) + +func extractDefault(doc *ast.CommentGroup) string { + if doc == nil { + return "" + } + for _, c := range doc.List { + if m := defaultRe.FindStringSubmatch(c.Text); len(m) > 1 { + return strings.Trim(m[1], `"`) + } + } + return "" +} + +func extractEnum(doc *ast.CommentGroup) []string { + if doc == nil { + return nil + } + for _, c := range doc.List { + if m := enumRe.FindStringSubmatch(c.Text); len(m) > 1 { + return strings.Split(m[1], ";") + } + } + return nil +} + +func cleanDoc(s string) string { + lines := strings.Split(strings.TrimSpace(s), "\n") + var clean []string + for _, l := range lines { + l = strings.TrimSpace(l) + if strings.HasPrefix(l, "+") { + continue + } + if l != "" { + clean = append(clean, l) + } + } + return strings.Join(clean, " ") +} + +// ─── Relationships ─────────────────────────────────────────────────────────── + +func buildRelationships() []Relation { + return []Relation{ + {From: "CachedImageSet", To: "CachedImage", Type: "owns", Mechanism: "ownerReferences"}, + {From: "CachedImage", To: "Pod", Type: "creates", Mechanism: "controller-runtime client"}, + {From: "CachedImage", To: "PullPolicy", Type: "references", Mechanism: 
"spec.policyRef"}, + {From: "CachedImageSet", To: "PullPolicy", Type: "references", Mechanism: "spec.policyRef"}, + {From: "CachedImageSet", To: "DiscoveryPolicy", Type: "references", Mechanism: "spec.discoveryPolicyRef"}, + {From: "DiscoveryPolicy", To: "CachedImageSet", Type: "feeds", Mechanism: "status.discoveredImages"}, + } +} + +// ─── Package Extractor ─────────────────────────────────────────────────────── + +type goListPkg struct { + ImportPath string `json:"ImportPath"` + Imports []string `json:"Imports"` + Doc string `json:"Doc"` +} + +func extractPackages(root, module string) []Package { + cmd := exec.Command("go", "list", "-json", "./...") + cmd.Dir = root + out, err := cmd.Output() + if err != nil { + return staticPackages() + } + + decoder := json.NewDecoder(bytes.NewReader(out)) + var pkgs []Package + for decoder.More() { + var p goListPkg + if err := decoder.Decode(&p); err != nil { + break + } + rel := strings.TrimPrefix(p.ImportPath, module+"/") + if !strings.HasPrefix(rel, "internal/") && !strings.HasPrefix(rel, "api/") { + continue + } + + var internalImports []string + for _, imp := range p.Imports { + if strings.HasPrefix(imp, module) { + internalImports = append(internalImports, strings.TrimPrefix(imp, module+"/")) + } + } + + role := p.Doc + if role == "" { + role = inferRole(rel) + } + + pkgs = append(pkgs, Package{ + Path: rel, + Role: role, + Imports: internalImports, + }) + } + + if len(pkgs) == 0 { + return staticPackages() + } + return pkgs +} + +func inferRole(path string) string { + roles := map[string]string{ + "api/v1alpha1": "CRD type definitions (source of truth)", + "internal/controller": "Reconciler implementations (one per CRD)", + "internal/podbuilder": "Pure Pod construction function (no k8s client)", + "internal/pacing": "Shared pacing engine for rate-limited pulls", + "internal/discovery": "Discovery source interface + implementations", + "internal/metrics": "Prometheus metrics registration", + } + if r, ok := 
roles[path]; ok { + return r + } + return "" +} + +func staticPackages() []Package { + return []Package{ + {Path: "api/v1alpha1", Role: "CRD type definitions (source of truth)"}, + {Path: "internal/controller", Role: "Reconciler implementations (one per CRD)", Imports: []string{"api/v1alpha1", "internal/podbuilder", "internal/pacing", "internal/metrics"}}, + {Path: "internal/podbuilder", Role: "Pure Pod construction (no k8s client)", Imports: []string{"api/v1alpha1"}}, + {Path: "internal/pacing", Role: "Shared pacing engine for rate-limited pulls"}, + {Path: "internal/discovery", Role: "Discovery source interface + implementations"}, + {Path: "internal/metrics", Role: "Prometheus metrics registration"}, + } +} + +// ─── Error Catalog ─────────────────────────────────────────────────────────── + +func buildErrorCatalog() []ErrorReason { + defs := []ErrorReason{ + {Reason: "Cached", Controller: "CachedImage", Meaning: "All target nodes have the image cached"}, + {Reason: "Degraded", Controller: "CachedImageSet", Meaning: "Some child CachedImages have failures", Troubleshooting: "Check individual CachedImage statuses"}, + {Reason: "ErrImagePull", Controller: "CachedImage", Meaning: "Registry unreachable or image does not exist", Troubleshooting: "Verify registry DNS, image name, tag. Check network policies"}, + {Reason: "ImagePullBackOff", Controller: "CachedImage", Meaning: "Repeated pull failures, kubelet is backing off", Troubleshooting: "Check imagePullSecrets, registry auth. 
Verify image exists"}, + {Reason: "InProgress", Controller: "CachedImage", Meaning: "Image pulls are actively running on some nodes"}, + {Reason: "InvalidImageName", Controller: "CachedImage", Meaning: "The image reference is malformed", Troubleshooting: "Check spec.image format: registry/repository"}, + {Reason: "PartiallyFailed", Controller: "DiscoveryPolicy", Meaning: "Some discovery sources failed to sync", Troubleshooting: "Check source endpoints and credentials"}, + {Reason: "PodFailed", Controller: "CachedImage", Meaning: "Puller Pod failed for a non-image-pull reason", Troubleshooting: "Check node health, resource limits, Pod security policies"}, + {Reason: "Progressing", Controller: "CachedImageSet", Meaning: "Children are still being pulled"}, + {Reason: "PullFailed", Controller: "CachedImage", Meaning: "One or more nodes failed to pull the image", Troubleshooting: "Check image name, tag, registry connectivity, imagePullSecrets"}, + {Reason: "Ready", Controller: "CachedImageSet", Meaning: "All child CachedImages are ready"}, + {Reason: "RegistryUnavailable", Controller: "CachedImage", Meaning: "Cannot connect to the container registry", Troubleshooting: "Check registry URL, DNS, firewall rules"}, + {Reason: "SourceError", Controller: "DiscoveryPolicy", Meaning: "One or more discovery sources returned errors", Troubleshooting: "Check source configuration and connectivity"}, + {Reason: "SyncFailed", Controller: "DiscoveryPolicy", Meaning: "All discovery sources failed", Troubleshooting: "Check all source endpoints, credentials, network"}, + {Reason: "Synced", Controller: "DiscoveryPolicy", Meaning: "All sources synced successfully"}, + } + return defs +} + +// ─── Metrics Extractor ─────────────────────────────────────────────────────── + +func extractMetrics(path string) []Metric { + data, err := os.ReadFile(path) + if err != nil { + return nil + } + content := string(data) + + nameRe := regexp.MustCompile(`Name:\s+"([^"]+)"`) + helpRe := 
regexp.MustCompile(`Help:\s+"([^"]+)"`) + typeRe := regexp.MustCompile(`prometheus\.New(Counter|Gauge|Histogram|Summary)`) + + names := nameRe.FindAllStringSubmatch(content, -1) + helps := helpRe.FindAllStringSubmatch(content, -1) + types := typeRe.FindAllStringSubmatch(content, -1) + + metrics := make([]Metric, 0, len(names)) + for i, n := range names { + m := Metric{Name: n[1]} + if i < len(helps) { + m.Help = helps[i][1] + } + if i < len(types) { + m.Type = strings.ToLower(types[i][1]) + } + metrics = append(metrics, m) + } + return metrics +} + +// ─── Make Targets ──────────────────────────────────────────────────────────── + +var makeTargetRe = regexp.MustCompile(`^([a-zA-Z_][a-zA-Z0-9_-]*):\s*.*?##\s*(.+)$`) + +func extractMakeTargets(path string) []MakeTarget { + data, err := os.ReadFile(path) + if err != nil { + return nil + } + var targets []MakeTarget + for _, line := range strings.Split(string(data), "\n") { + m := makeTargetRe.FindStringSubmatch(line) + if m != nil { + targets = append(targets, MakeTarget{Name: m[1], Desc: m[2]}) + } + } + return targets +} + +// ─── Helpers ───────────────────────────────────────────────────────────────── + +func parseGoMod(path string) (string, string) { + data, err := os.ReadFile(path) + if err != nil { + return "1.23", "github.com/Breee/drop" + } + goVer := "1.23" + module := "" + for _, line := range strings.Split(string(data), "\n") { + if strings.HasPrefix(line, "go ") { + goVer = strings.TrimSpace(strings.TrimPrefix(line, "go ")) + } + if strings.HasPrefix(line, "module ") { + module = strings.TrimSpace(strings.TrimPrefix(line, "module ")) + } + } + return goVer, module +} + +func readFileStr(path string) string { + data, err := os.ReadFile(path) + if err != nil { + return "" + } + return string(data) +} + +func writeKnowledgeYAML(root string, k Knowledge) { + var buf bytes.Buffer + buf.WriteString("# Generated by make docs-gen — DO NOT EDIT\n") + buf.WriteString("# Source: hack/gen-ai-docs/\n") + 
buf.WriteString("# Regenerate: make docs-gen\n\n") + + enc := yaml.NewEncoder(&buf) + enc.SetIndent(2) + if err := enc.Encode(k); err != nil { + fmt.Fprintf(os.Stderr, "error encoding knowledge.yaml: %v\n", err) + os.Exit(1) + } + if err := enc.Close(); err != nil { + fmt.Fprintf(os.Stderr, "error closing encoder: %v\n", err) + os.Exit(1) + } + + outPath := filepath.Join(root, "knowledge.yaml") + if err := os.WriteFile(outPath, buf.Bytes(), 0o644); err != nil { + fmt.Fprintf(os.Stderr, "error writing knowledge.yaml: %v\n", err) + os.Exit(1) + } +} + +func generateFile(root, relPath string, tmplStr string, data Knowledge) { + funcMap := template.FuncMap{ + "join": strings.Join, + "lower": strings.ToLower, + } + t := template.Must(template.New(relPath).Funcs(funcMap).Parse(tmplStr)) + var buf bytes.Buffer + if err := t.Execute(&buf, data); err != nil { + fmt.Fprintf(os.Stderr, "error rendering %s: %v\n", relPath, err) + os.Exit(1) + } + + outPath := filepath.Join(root, relPath) + if err := os.MkdirAll(filepath.Dir(outPath), 0o755); err != nil { + fmt.Fprintf(os.Stderr, "error creating dir for %s: %v\n", relPath, err) + os.Exit(1) + } + if err := os.WriteFile(outPath, buf.Bytes(), 0o644); err != nil { + fmt.Fprintf(os.Stderr, "error writing %s: %v\n", relPath, err) + os.Exit(1) + } +} diff --git a/hack/gen-ai-docs/templates.go b/hack/gen-ai-docs/templates.go new file mode 100644 index 0000000..720574e --- /dev/null +++ b/hack/gen-ai-docs/templates.go @@ -0,0 +1,679 @@ +package main + +// ─── llms.txt (USE agents — short onboarding) ─────────────────────────────── + +var llmsTxtTmpl = `# {{.Project.Name}} — {{.Project.Description}} + +> API group: {{.Project.APIGroup}} | Go {{.Project.GoVersion}} | All CRDs cluster-scoped + +## CRDs + +| Kind | Purpose | +|------|---------| +{{- range .CRDs}} +| {{.Kind}} | {{.Doc}} | +{{- end}} + +## Architecture + +Short-lived Pods with ` + "`nodeName`" + ` + ` + "`command: [\"true\"]`" + ` trigger image pulls via kubelet. 
No privileged containers. + +Reconcilers: +{{- range .CRDs}}{{if .Controller}} +- {{.Kind}} → {{.Controller}} +{{- end}}{{end}} + +## Key Directories + +| Path | Role | +|------|------| +{{- range .Packages}} +| {{.Path}} | {{.Role}} | +{{- end}} +| charts/drop/ | Helm chart | +| test/e2e/ | Chainsaw E2E tests | +| hack/gen-ai-docs/ | Documentation generator | + +## Build & Test + +` + "```" + ` +{{- range .MakeTargets}} + make {{.Name}}{{"\t"}}# {{.Desc}} +{{- end}} +` + "```" + ` + +## CRD Quick Reference +{{range .CRDs}} +### {{.Kind}} + +{{.Doc}} + +**Spec fields:** {{range .SpecFields}}` + "`{{.JSON}}`" + `{{if .Default}} (default: {{.Default}}){{end}}, {{end}} +{{- if .StatusFields}} +**Status fields:** {{range .StatusFields}}` + "`{{.JSON}}`" + `, {{end}} +{{- end}} +{{end}} + +## Status Condition Reasons + +| Reason | Controller | Meaning | +|--------|-----------|---------| +{{- range .Errors}} +| {{.Reason}} | {{.Controller}} | {{.Meaning}} | +{{- end}} + +## Metrics + +{{- range .Metrics}} +- ` + "`{{.Name}}`" + ` ({{.Type}}) — {{.Help}} +{{- end}} + +## Full Reference + +See [llms-full.txt](llms-full.txt) for complete field documentation with types and examples. + +## Documentation Pages + +| Page | llmsDescription | +|------|-----------------| +| [Installation](docs/install/) | Install via Helm. Requires K8s 1.28+. | +| [Usage](docs/usage/) | CachedImage, CachedImageSet, PullPolicy examples with YAML. | +| [Discovery](docs/discovery/) | DiscoveryPolicy for automatic image discovery from Prometheus/OCI registries. | +| [Monitoring](docs/monitoring/) | Prometheus metrics, Kubernetes events, and status conditions. | +| [CRD Reference](docs/reference/crds/) | Complete field reference for all drop CRDs with types, defaults, and validation. | +| [Status & Errors](docs/reference/errors/) | Every condition reason emitted by controllers. Diagnose why resources are not Ready. 
| +| [Metrics](docs/reference/metrics/) | Prometheus metrics: names, types, descriptions, and example PromQL queries. | +| [Architecture](docs/reference/architecture/) | Package dependency graph and CRD ownership relationships. | +| [Developing](docs/developing/) | Build, test, lint, project structure for contributors. | +` + +// ─── llms-full.txt (USE agents — complete reference) ───────────────────────── + +var llmsFullTxtTmpl = `# {{.Project.Name}} — Full Reference for AI Agents + +## Project + +- **Name**: {{.Project.Name}} +- **Language**: Go {{.Project.GoVersion}} +- **Module**: {{.Project.Module}} +- **API Group**: {{.Project.APIGroup}} +- **Scope**: All CRDs cluster-scoped +- **License**: {{.Project.License}} +- **Framework**: Kubebuilder / controller-runtime + +## CRD Field Reference +{{range .CRDs}} +### {{.Kind}} + +{{.Doc}} +{{if .Controller}} +Controller: {{.Controller}} | Test: {{.TestFile}} +{{end}} +#### Spec +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +{{- range .SpecFields}} +| {{.Name}} | ` + "`{{.JSON}}`" + ` | ` + "`{{.Type}}`" + ` | {{if .Required}}✓{{else}}—{{end}} | {{if .Default}}` + "`{{.Default}}`" + `{{end}} | {{.Doc}}{{if .Enum}} Enum: {{range $i, $e := .Enum}}{{if $i}},{{end}}` + "`{{$e}}`" + `{{end}}{{end}} | +{{- end}} +{{if .StatusFields}} +#### Status +| Field | JSON | Type | Description | +|-------|------|------|-------------| +{{- range .StatusFields}} +| {{.Name}} | ` + "`{{.JSON}}`" + ` | ` + "`{{.Type}}`" + ` | {{.Doc}} | +{{- end}} +{{end}} +{{end}} + +## Helper Types +{{range .HelperTypes}} +### {{.Name}} + +{{.Doc}} + +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +{{- range .Fields}} +| {{.Name}} | ` + "`{{.JSON}}`" + ` | ` + "`{{.Type}}`" + ` | {{if .Required}}✓{{else}}—{{end}} | {{if .Default}}` + "`{{.Default}}`" + `{{end}} | {{.Doc}}{{if .Enum}} Enum: {{range $i, $e := 
.Enum}}{{if $i}},{{end}}` + "`{{$e}}`" + `{{end}}{{end}} | +{{- end}} +{{end}} + +## Relationships + +` + "```mermaid" + ` +graph LR +{{- range .Relationships}} + {{.From}} -->|{{.Type}}| {{.To}} +{{- end}} +` + "```" + ` + +## Status Conditions & Error Reasons + +| Reason | Controller | Meaning | Troubleshooting | +|--------|-----------|---------|-----------------| +{{- range .Errors}} +| {{.Reason}} | {{.Controller}} | {{.Meaning}} | {{.Troubleshooting}} | +{{- end}} + +## Metrics + +| Name | Type | Description | +|------|------|-------------| +{{- range .Metrics}} +| ` + "`{{.Name}}`" + ` | {{.Type}} | {{.Help}} | +{{- end}} + +## Sample CRs + +` + "```yaml" + ` +{{.Samples}} +` + "```" + ` + +## Build & Test + +` + "```" + ` +{{- range .MakeTargets}} + make {{.Name}}{{"\t"}}# {{.Desc}} +{{- end}} +` + "```" + ` +` + +// ─── .github/copilot-instructions.md (CODE agents) ────────────────────────── + +var copilotInstructionsTmpl = `# Copilot Instructions for Puller + +## Critical Rules + +1. **ALWAYS read project files before acting.** Read the Tiltfile, Makefile, and relevant source before writing docs, suggesting workflows, or describing how things work. Never guess based on general knowledge. +2. **Documentation must be short and concise.** Focus on high-level overview and usage. Avoid volatile implementation details. Avoid information that will change frequently. +3. **Simplicity over complexity.** If a simple solution exists, use it. DRY is NOT always best. No premature optimization. +4. **Kubernetes: always verify.** Use ` + "`kubectl explain`" + ` or read the CRD types before suggesting field values or resource specs. +5. **Security-conscious.** Never expose secrets in code or docs. Follow secure coding practices. +6. **Tilt handles the dev loop.** ` + "`tilt up`" + ` does everything: cluster creation, build, deploy, port-forwards, Hugo docs, e2e infra, dev samples. Don't suggest manual commands for things Tilt automates. 
+ +## Project + +Kubernetes operator (Go {{.Project.GoVersion}}, Kubebuilder, controller-runtime) that pre-caches container images on cluster nodes. +API group: ` + "`{{.Project.APIGroup}}`" + `. All CRDs are cluster-scoped. + +## Build Commands + +` + "```bash" + ` +make generate # regenerate deepcopy +make manifests # regenerate CRD + RBAC YAML +make codegen # both of the above +go build ./... # compile +make test # unit tests (envtest) +make test-e2e # e2e tests (chainsaw, needs kind) +make lint # golangci-lint +make docs-gen # regenerate AI docs from source +` + "```" + ` + +## Code Conventions +{{range .Conventions}}{{if or (eq (index .Scope 0) "code") (eq (index .Scope 0) "both")}} +- {{.Rule}} +{{- end}}{{end}} + +## Testing Patterns + +- Controller tests use envtest (` + "`internal/controller/*_test.go`" + `) +- Table-driven tests preferred +- E2E uses Kyverno Chainsaw in ` + "`test/e2e/`" + ` +- Test fixtures in ` + "`config/samples/`" + ` and ` + "`hack/dev-samples.yaml`" + ` + +## CRD Quick Reference + +| Kind | Controller | Purpose | +|------|-----------|---------| +{{- range .CRDs}} +| {{.Kind}} | {{.Controller}} | {{.Doc}} | +{{- end}} + +## Package Dependency Graph + +` + "```" + ` +{{- range .Packages}} +{{.Path}} — {{.Role}}{{if .Imports}} + imports: {{join .Imports ", "}}{{end}} +{{- end}} +` + "```" + ` + +## Don'ts + +- Don't add CRI socket access or privileged containers — we use kubelet image pulls only +- Don't put pacing logic outside ` + "`internal/pacing/`" + ` +- Don't create namespaced CRDs — all resources are cluster-scoped +- Don't manually edit generated files (` + "`zz_generated.deepcopy.go`" + `, ` + "`config/crd/bases/`" + `) +- Don't manually edit ` + "`llms.txt`" + `, ` + "`llms-full.txt`" + `, ` + "`.cursorrules`" + `, ` + "`AGENTS.md`" + ` — run ` + "`make docs-gen`" + ` +` + +// ─── .cursorrules (CODE agents) ────────────────────────────────────────────── + +var cursorRulesTmpl = `# Cursor Rules for Puller + +## Critical Rules 
+ +1. ALWAYS read project files (Tiltfile, Makefile, source) before acting. Never guess. +2. Documentation: short, concise, high-level. No volatile details. +3. Simplicity over complexity. DRY is NOT always best. No premature optimization. +4. Kubernetes: use kubectl explain or read CRD types before suggesting specs. +5. Security: never expose secrets in code or docs. +6. Tilt handles the dev loop. tilt up does everything. Don't suggest manual commands for automated steps. + +## Project Context +Kubernetes operator (Go {{.Project.GoVersion}}, Kubebuilder, controller-runtime). +Module: {{.Project.Module}} +API group: {{.Project.APIGroup}}. All CRDs cluster-scoped. + +## Key Commands +- Build: go build ./... +- Test: make test +- Lint: make lint +- CRD gen: make manifests +- Deepcopy gen: make generate +- All codegen: make codegen +- AI docs gen: make docs-gen + +## Structure +{{- range .Packages}} +- {{.Path}} — {{.Role}} +{{- end}} +- charts/drop/ — Helm chart +- test/e2e/ — Chainsaw E2E tests +- hack/gen-ai-docs/ — generates all docs from source + +## CRDs → Controllers +{{- range .CRDs}} +- {{.Kind}}{{if .Controller}} → {{.Controller}}{{else}} (config-only, no controller){{end}} +{{- end}} + +## Conventions +{{- range .Conventions}} +- {{.Rule}} +{{- end}} + +## Don't +- Edit generated files (zz_generated.deepcopy.go, config/crd/bases/, llms.txt, llms-full.txt, knowledge.yaml) +- Add privileged containers or CRI socket mounts +- Create namespaced CRDs +- Put pacing logic outside internal/pacing/ +` + +// ─── AGENTS.md (CODE agents — generic) ────────────────────────────────────── + +var agentsMdTmpl = `# Agent Instructions + +## Critical Rules + +1. ALWAYS read project files (Tiltfile, Makefile, source) before acting. Never guess. +2. Documentation: short, concise, high-level. No volatile details. +3. Simplicity over complexity. DRY is NOT always best. No premature optimization. +4. Kubernetes: use kubectl explain or read CRD types before suggesting specs. +5. 
Security: never expose secrets in code or docs. +6. Tilt handles the dev loop. ` + "`tilt up`" + ` does everything. Don't suggest manual commands for automated steps. + +## Project: Puller + +Kubernetes operator (Go {{.Project.GoVersion}}) that pre-caches container images on cluster nodes. + +## Quick Start + +` + "```bash" + ` +make codegen # generate deepcopy + CRD manifests +go build ./... # compile +make test # unit tests +make docs-gen # regenerate AI docs +` + "```" + ` + +## Architecture + +- API group: ` + "`{{.Project.APIGroup}}`" + ` (cluster-scoped) +- Framework: Kubebuilder + controller-runtime +- Pull mechanism: short-lived Pods with ` + "`nodeName`" + ` + ` + "`command: [\"true\"]`" + ` + +## CRDs + +| Kind | Purpose | +|------|---------| +{{- range .CRDs}} +| {{.Kind}} | {{.Doc}} | +{{- end}} + +## Key Directories + +| Path | Contents | +|------|----------| +{{- range .Packages}} +| {{.Path}} | {{.Role}} | +{{- end}} +| charts/drop/ | Helm chart | +| test/e2e/ | Chainsaw E2E tests | +| hack/gen-ai-docs/ | This doc generator | + +## Rules + +1. Run ` + "`make codegen`" + ` after changing api/v1alpha1/ types +2. Run ` + "`make docs-gen`" + ` after changing types or Makefile (regenerates this file) +3. Never edit generated files directly +4. All CRDs are cluster-scoped — no namespaced resources +5. No privileged containers — kubelet-based image pulls only +6. Status uses ` + "`metav1.Condition`" + ` with type "Ready" + +## Full Reference + +See [llms-full.txt](llms-full.txt) for complete CRD field documentation. +` + +// ─── Hugo: CRD Reference ──────────────────────────────────────────────────── + +var hugoCRDsTmpl = `--- +# Generated by make docs-gen — DO NOT EDIT +title: CRD Reference +weight: 1 +aliases: + - /drop/docs/reference/crds/ +description: Custom Resource Definition reference for the drop operator. +llmsDescription: | + Complete CRD field reference for drop.corewire.io/v1alpha1. All resources + are cluster-scoped. 
Covers CachedImage, CachedImageSet, PullPolicy, and + DiscoveryPolicy with every spec/status field, types, defaults, and validation. +--- + +All resources are cluster-scoped under ` + "`{{.Project.APIGroup}}`" + `. + +## Quick Example + +` + "```yaml" + ` +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: nginx +spec: + image: docker.io/library/nginx + tag: latest + nodeSelector: + kubernetes.io/arch: amd64 +` + "```" + ` +{{range .CRDs}} +## {{.Kind}} + +{{.Doc}} +{{if .Controller}} +**Controller:** ` + "`{{.Controller}}`" + ` +{{end}} +### Spec + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +{{- range .SpecFields}} +| ` + "`{{.JSON}}`" + ` | ` + "`{{.Type}}`" + ` | {{if .Required}}Yes{{else}}No{{end}} | {{if .Default}}{{.Default}}{{else}}—{{end}} | {{.Doc}}{{if .Enum}} ({{range $i, $e := .Enum}}{{if $i}} | {{end}}` + "`{{$e}}`" + `{{end}}){{end}} | +{{- end}} +{{if .StatusFields}} +### Status + +| Field | Type | Description | +|-------|------|-------------| +{{- range .StatusFields}} +| ` + "`{{.JSON}}`" + ` | ` + "`{{.Type}}`" + ` | {{.Doc}} | +{{- end}} +{{end}} +--- +{{end}} + +## Helper Types +{{range .HelperTypes}} +### {{.Name}} + +{{.Doc}} + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +{{- range .Fields}} +| ` + "`{{.JSON}}`" + ` | ` + "`{{.Type}}`" + ` | {{if .Required}}Yes{{else}}No{{end}} | {{if .Default}}{{.Default}}{{else}}—{{end}} | {{.Doc}} | +{{- end}} +{{end}} +` + +// ─── Hugo: Error Catalog ───────────────────────────────────────────────────── + +var hugoErrorsTmpl = `--- +# Generated by make docs-gen — DO NOT EDIT +title: Status & Errors +weight: 2 +aliases: + - /drop/docs/reference/errors/ +description: Status conditions, reasons, and troubleshooting for drop CRDs. +llmsDescription: | + Every metav1.Condition reason emitted by drop controllers. 
Lookup table + maps reason codes to controller, meaning, and fix. Use this to diagnose + why a CachedImage, CachedImageSet, or DiscoveryPolicy is not Ready. +--- + +All drop CRDs use ` + "`metav1.Condition`" + ` with type **"Ready"**. The ` + "`.reason`" + ` field indicates the specific state. + +## Quick Lookup + +| Reason | Controller | Meaning | How to Fix | +|--------|-----------|---------|------------| +{{- range .Errors}} +| **{{.Reason}}** | {{.Controller}} | {{.Meaning}} | {{if .Troubleshooting}}{{.Troubleshooting}}{{else}}—{{end}} | +{{- end}} + +## By Controller + +### CachedImage + +| Reason | Meaning | +|--------|---------| +{{- range .Errors}}{{if eq .Controller "CachedImage"}} +| **{{.Reason}}** | {{.Meaning}} | +{{- end}}{{end}} + +### CachedImageSet + +| Reason | Meaning | +|--------|---------| +{{- range .Errors}}{{if eq .Controller "CachedImageSet"}} +| **{{.Reason}}** | {{.Meaning}} | +{{- end}}{{end}} + +### DiscoveryPolicy + +| Reason | Meaning | +|--------|---------| +{{- range .Errors}}{{if eq .Controller "DiscoveryPolicy"}} +| **{{.Reason}}** | {{.Meaning}} | +{{- end}}{{end}} +` + +// ─── Hugo: Metrics ─────────────────────────────────────────────────────────── + +var hugoMetricsTmpl = `--- +# Generated by make docs-gen — DO NOT EDIT +title: Metrics +weight: 3 +aliases: + - /drop/docs/reference/metrics/ +description: Prometheus metrics exposed by the drop operator. +llmsDescription: | + All Prometheus metrics registered by the drop operator. Includes metric + name, type (counter/gauge/histogram), and description. Also provides + example PromQL queries for monitoring image cache coverage and pull errors. 
+--- + +The drop operator exposes the following metrics: + +| Metric | Type | Description | +|--------|------|-------------| +{{- range .Metrics}} +| ` + "`{{.Name}}`" + ` | {{.Type}} | {{.Help}} | +{{- end}} + +## Useful Queries + +` + "```promql" + ` +# Images cached per node +sum by (node) (drop_images_cached_total) + +# Pull error rate +rate(drop_pull_errors_total[5m]) + +# Average pull duration +histogram_quantile(0.95, rate(drop_pull_duration_seconds_bucket[10m])) + +# Discovery coverage +drop_discovery_images_found +` + "```" + ` +` + +// ─── Hugo: Architecture (Mermaid) ─────────────────────────────────────────── + +var hugoArchTmpl = `--- +# Generated by make docs-gen — DO NOT EDIT +title: Architecture +weight: 4 +aliases: + - /drop/docs/reference/architecture/ +description: Internal architecture and package dependency graph. +llmsDescription: | + Package dependency graph and CRD ownership relationships for the drop + operator. Shows how controllers, pacing engine, pod builder, and discovery + packages relate. Useful for understanding code navigation and import paths. 
+--- + +## CRD Relationships + +` + "```mermaid" + ` +graph TD +{{- range .Relationships}} + {{.From}} -->|{{.Type}}| {{.To}} +{{- end}} +` + "```" + ` + +## Package Dependencies + +` + "```mermaid" + ` +graph LR + cmd/main.go --> internal/controller +{{- range $pkg := .Packages}}{{if $pkg.Imports}}{{range $pkg.Imports}} + {{$pkg.Path}} --> {{.}} +{{- end}}{{end}}{{end}} +` + "```" + ` + +## Reconciler → CRD Mapping + +| CRD | Controller | Dependencies | +|-----|-----------|--------------| +{{- range .CRDs}} +| {{.Kind}} | {{if .Controller}}` + "`{{.Controller}}`" + `{{else}}(config-only){{end}} | {{if .Controller}}podbuilder, pacing, metrics{{end}} | +{{- end}} + +## Pull Mechanism + +` + "```mermaid" + ` +sequenceDiagram + participant CR as CachedImage + participant Ctrl as Controller + participant Pace as Pacing Engine + participant K8s as Kubernetes API + participant Node as Kubelet + + CR->>Ctrl: Reconcile triggered + Ctrl->>Pace: Request pull slot + Pace-->>Ctrl: Slot granted + Ctrl->>K8s: Create Pod (nodeName=target) + K8s->>Node: Schedule Pod + Node->>Node: Pull image (kubelet) + Node-->>K8s: Pod succeeds + K8s-->>Ctrl: Watch event + Ctrl->>CR: Update status (Ready) +` + "```" + ` +` + +// ─── Doc Generation Flow Diagram ───────────────────────────────────────────── + +var docGenDiagramTmpl = `# Documentation Generation + + + +## How It Works + +All documentation is generated from source code via ` + "`make docs-gen`" + ` (which runs ` + "`go run ./hack/gen-ai-docs/`" + `). + +` + "```mermaid" + ` +flowchart TD + subgraph Sources["Source of Truth"] + TYPES["api/v1alpha1/*_types.go
(CRD types + kubebuilder markers)"] + CTRL["internal/controller/*.go
(reconcilers, error reasons)"] + METRICS["internal/metrics/metrics.go
(Prometheus metrics)"] + MAKEFILE["Makefile
(build targets)"] + GOMOD["go.mod
(Go version, module)"] + SAMPLES["hack/dev-samples.yaml
(example CRs)"] + end + + subgraph Generator["hack/gen-ai-docs/"] + PARSE["Go AST Parser
+ go list -json"] + KNOWLEDGE["knowledge.yaml
(structured intermediate)"] + RENDER["Template Renderer"] + end + + subgraph UseAgents["USE Agents"] + LLMS["llms.txt
(short onboarding)"] + LLMSFULL["llms-full.txt
(complete reference)"] + end + + subgraph CodeAgents["CODE Agents"] + COPILOT[".github/copilot-instructions.md"] + CURSOR[".cursorrules"] + AGENTS["AGENTS.md"] + end + + subgraph Humans["Humans (Hugo)"] + CRDS["reference/_generated_crds.md"] + ERRORS["reference/_generated_errors.md"] + METRICSH["reference/_generated_metrics.md"] + ARCH["reference/_generated_architecture.md"] + end + + TYPES --> PARSE + CTRL --> PARSE + METRICS --> PARSE + MAKEFILE --> PARSE + GOMOD --> PARSE + SAMPLES --> PARSE + + PARSE --> KNOWLEDGE + KNOWLEDGE --> RENDER + + RENDER --> LLMS + RENDER --> LLMSFULL + RENDER --> COPILOT + RENDER --> CURSOR + RENDER --> AGENTS + RENDER --> CRDS + RENDER --> ERRORS + RENDER --> METRICSH + RENDER --> ARCH +` + "```" + ` + +## Three Audiences + +` + "```mermaid" + ` +graph LR + subgraph SoT["Single Source of Truth"] + CODE["Go Source Code"] + end + + CODE -->|schema, fields, examples| USE["USE Agents
(GitOps, kubectl, IaC)"] + CODE -->|architecture, conventions| DEV["CODE Agents
(Copilot, Cursor, Codex)"] + CODE -->|narrative + generated ref| HUMAN["Humans
(Hugo docs site)"] +` + "```" + ` + +## Commands + +| Command | Purpose | +|---------|---------| +| ` + "`make docs-gen`" + ` | Regenerate all docs from source | +| ` + "`make docs-gen-check`" + ` | CI gate — fails if docs are stale | +| ` + "`make codegen`" + ` | CRDs + deepcopy + docs (full pipeline) | +` diff --git a/hack/gen-asciinema.sh b/hack/gen-asciinema.sh new file mode 100755 index 0000000..7dbdfbb --- /dev/null +++ b/hack/gen-asciinema.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash +# hack/gen-asciinema.sh — Generate asciinema .cast files for docs landing page. +# Requires: asciinema, kubectl, a running cluster with drop installed. +# Output: docs/static/casts/{apply,pods,events}.cast — displayed as tabs on site. +# +# Each recording is fully independent: clean state → apply → watch one perspective. +set -euo pipefail + +CAST_DIR="$(git rev-parse --show-toplevel)/docs/static/casts" +mkdir -p "$CAST_DIR" + +TMPFILE="/tmp/drop-demo-cachedimage.yaml" +cat > "$TMPFILE" <<'EOF' +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: nginx-demo +spec: + image: docker.io/library/nginx + tag: "1.27" + nodeSelector: + kubernetes.io/os: linux +EOF + +cleanup() { + kubectl delete cachedimage nginx-demo --ignore-not-found >/dev/null 2>&1 || true + kubectl delete pods -l app.kubernetes.io/managed-by=drop --ignore-not-found >/dev/null 2>&1 || true + sleep 5 +} + +# ─── Recording 1: Apply manifest + watch CachedImage status ─────────────────── +cleanup +echo "Recording 1/3: apply + status" +asciinema rec "$CAST_DIR/apply.cast" --overwrite --cols 80 --rows 22 --env "" -c "bash --norc --noprofile <<'REC' +echo '$ cat cachedimage.yaml' +sleep 1 +cat $TMPFILE +sleep 3 +echo '' +echo '$ kubectl apply -f cachedimage.yaml' +kubectl apply -f $TMPFILE +sleep 2 +echo '' +echo '$ kubectl get cachedimages nginx-demo -w' +kubectl get cachedimages nginx-demo -w & +PID=\$! 
+sleep 20 +kill \$PID 2>/dev/null || true +REC" + +# ─── Recording 2: Watch pods with node placement ───────────────────────────── +cleanup +echo "Recording 2/3: pods + nodes" +asciinema rec "$CAST_DIR/pods.cast" --overwrite --cols 80 --rows 22 --env "" -c "bash --norc --noprofile <<'REC' +echo '$ kubectl get pods -l app.kubernetes.io/managed-by=drop -o custom-columns=NAME:.metadata.name,STATUS:.status.phase,NODE:.spec.nodeName -w' +sleep 1 +kubectl get pods -l app.kubernetes.io/managed-by=drop -o custom-columns=NAME:.metadata.name,STATUS:.status.phase,NODE:.spec.nodeName -w & +PID=\$! +sleep 2 +kubectl apply -f $TMPFILE >/dev/null 2>&1 +sleep 20 +kill \$PID 2>/dev/null || true +REC" + +# ─── Recording 3: Watch Kubernetes events ──────────────────────────────────── +cleanup +echo "Recording 3/3: events" +asciinema rec "$CAST_DIR/events.cast" --overwrite --cols 120 --rows 22 --env "" -c "bash --norc --noprofile <<'REC' +echo '$ kubectl get events --field-selector reason!=LeaderElection --watch-only' +sleep 1 +kubectl get events --field-selector reason!=LeaderElection --watch-only & +PID=\$! 
+sleep 2 +kubectl apply -f $TMPFILE >/dev/null 2>&1 +sleep 20 +kill \$PID 2>/dev/null || true +REC" + +rm -f "$TMPFILE" +echo "✓ Generated: $CAST_DIR/{apply,pods,events}.cast" diff --git a/hack/kind-config.yaml b/hack/kind-config.yaml new file mode 100644 index 0000000..38c9a9b --- /dev/null +++ b/hack/kind-config.yaml @@ -0,0 +1,10 @@ +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +containerdConfigPatches: + - |- + [plugins."io.containerd.grpc.v1.cri".registry] + config_path = "/etc/containerd/certs.d" +nodes: + - role: control-plane + - role: worker + - role: worker diff --git a/hack/prove-operator.sh b/hack/prove-operator.sh new file mode 100755 index 0000000..fc1544d --- /dev/null +++ b/hack/prove-operator.sh @@ -0,0 +1,453 @@ +#!/usr/bin/env bash +set -euo pipefail + +# ============================================================================= +# Puller Operator — Proof of Correct Operation +# ============================================================================= +# This script creates a kind cluster, deploys the operator, and exercises every +# major feature with detailed logging to prove correctness. Each section shows +# the exact commands and their expected output so the result can be reviewed +# offline (e.g. in a CI artifact or shared as evidence). 
+# +# Prerequisites: kind, kubectl, helm, docker, jq +# Usage: ./hack/prove-operator.sh 2>&1 | tee proof-run.log +# ============================================================================= + +BOLD='\033[1m' +GREEN='\033[0;32m' +RED='\033[0;31m' +BLUE='\033[0;34m' +YELLOW='\033[1;33m' +NC='\033[0m' + +log() { echo -e "${BLUE}[proof]${NC} $*"; } +success() { echo -e "${GREEN}[✓]${NC} $*"; } +fail() { echo -e "${RED}[✗]${NC} $*"; exit 1; } +section() { echo -e "\n${BOLD}${YELLOW}════════════════════════════════════════════════════════════════${NC}"; echo -e "${BOLD}${YELLOW} $*${NC}"; echo -e "${BOLD}${YELLOW}════════════════════════════════════════════════════════════════${NC}\n"; } +subsect() { echo -e "\n${BOLD}── $* ──${NC}\n"; } + +CLUSTER_NAME="drop-proof" +IMG="controller:proof" +NAMESPACE="drop-system" +TIMEOUT=120 + +cleanup() { + log "Cleaning up kind cluster..." + kind delete cluster --name "$CLUSTER_NAME" 2>/dev/null || true +} +trap cleanup EXIT + +# ============================================================================= +section "PHASE 1: Environment Setup" +# ============================================================================= + +subsect "1.1 Create 3-node Kind cluster (1 control-plane + 2 workers)" +if kind get clusters 2>/dev/null | grep -q "$CLUSTER_NAME"; then + log "Cluster already exists, deleting..." 
+ kind delete cluster --name "$CLUSTER_NAME" +fi + +cat </dev/null || true +kubectl apply -f config/crd/bases/ +success "CRDs installed" +log "Registered CRDs:" +kubectl get crds | grep drop +echo "" + +subsect "1.4 Deploy operator via Helm" +helm upgrade --install drop charts/drop \ + --namespace "$NAMESPACE" \ + --create-namespace \ + --set image.repository=controller \ + --set image.tag=proof \ + --set image.pullPolicy=Never \ + --set leaderElection.enabled=false \ + --set metrics.enabled=true \ + --set metrics.secureServing=false \ + --wait --timeout 90s +success "Operator running" +echo "" +log "Operator pod:" +kubectl -n "$NAMESPACE" get pods -o wide +echo "" +log "Operator logs (startup):" +kubectl -n "$NAMESPACE" logs -l app.kubernetes.io/name=drop --tail=20 +echo "" + +# ============================================================================= +section "PHASE 2: PullPolicy — Pacing Controls" +# ============================================================================= + +subsect "2.1 Create a conservative PullPolicy" +cat </dev/null | wc -l) + if [ "$POD_COUNT" -gt 0 ]; then + success "Puller pods created ($POD_COUNT found)" + break + fi + sleep 2 +done +echo "" +log "Puller Pods (one per targeted node):" +kubectl get pods -A -l app.kubernetes.io/managed-by=drop,drop.corewire.io/cachedimage=nginx-proof -o wide 2>/dev/null || true +echo "" + +subsect "3.3 Verify Pod spec (command: ['true'], nodeName set, non-privileged)" +POD_NAME=$(kubectl get pods -A -l drop.corewire.io/cachedimage=nginx-proof -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") +if [ -n "$POD_NAME" ]; then + log "Pod: $POD_NAME" + echo " Image: $(kubectl get pod -A "$POD_NAME" -o jsonpath='{.spec.containers[0].image}' 2>/dev/null || kubectl get pods -A -l drop.corewire.io/cachedimage=nginx-proof -o jsonpath='{.items[0].spec.containers[0].image}')" + echo " Command: $(kubectl get pods -A -l drop.corewire.io/cachedimage=nginx-proof -o 
jsonpath='{.items[0].spec.containers[0].command}')" + echo " NodeName: $(kubectl get pods -A -l drop.corewire.io/cachedimage=nginx-proof -o jsonpath='{.items[0].spec.nodeName}')" + echo " PullPolicy: $(kubectl get pods -A -l drop.corewire.io/cachedimage=nginx-proof -o jsonpath='{.items[0].spec.containers[0].imagePullPolicy}')" + echo " Privileged: $(kubectl get pods -A -l drop.corewire.io/cachedimage=nginx-proof -o jsonpath='{.items[0].spec.containers[0].securityContext.privileged}' 2>/dev/null || echo 'not set (non-privileged)')" + success "Pod spec matches design: short-lived, non-privileged, command=['true'], placed on specific node" +fi +echo "" + +subsect "3.4 Wait for image pull to complete" +log "Waiting for CachedImage phase=Ready (max ${TIMEOUT}s)..." +DEADLINE=$((SECONDS + TIMEOUT)) +PREV_PHASE="" +while [ $SECONDS -lt $DEADLINE ]; do + PHASE=$(kubectl get cachedimage nginx-proof -o jsonpath='{.status.phase}' 2>/dev/null || echo "Pending") + READY=$(kubectl get cachedimage nginx-proof -o jsonpath='{.status.nodesReady}' 2>/dev/null || echo "0") + TARGET=$(kubectl get cachedimage nginx-proof -o jsonpath='{.status.nodesTargeted}' 2>/dev/null || echo "?") + if [ "$PHASE" != "$PREV_PHASE" ]; then + log "Phase transition: ${PREV_PHASE:-} → $PHASE (nodesReady=$READY/$TARGET)" + PREV_PHASE="$PHASE" + fi + if [ "$PHASE" = "Ready" ]; then + success "All nodes have the image cached!" + break + fi + sleep 3 +done +echo "" + +subsect "3.5 Final CachedImage status" +kubectl get cachedimage nginx-proof -o wide +echo "" +kubectl get cachedimage nginx-proof -o jsonpath='{.status}' | jq . 
2>/dev/null || kubectl get cachedimage nginx-proof -o yaml | grep -A30 "^status:" +echo "" + +subsect "3.6 Kubernetes Events (proof of lifecycle tracking)" +log "Events for CachedImage 'nginx-proof':" +kubectl get events --field-selector involvedObject.name=nginx-proof --sort-by='.lastTimestamp' 2>/dev/null || log "(no events — reconciler events may use different involvedObject)" +echo "" + +subsect "3.7 Verify drop Pods are cleaned up after success" +sleep 5 +REMAINING=$(kubectl get pods -A -l drop.corewire.io/cachedimage=nginx-proof --field-selector=status.phase!=Succeeded --no-headers 2>/dev/null | wc -l) +log "Non-Succeeded drop Pods remaining: $REMAINING" +if [ "$REMAINING" -eq 0 ]; then + success "All drop Pods completed (phase=Succeeded) — no lingering resources" +else + log "Some Pods still running (pacing may be active)" +fi +echo "" + +# ============================================================================= +section "PHASE 4: Pacing Enforcement" +# ============================================================================= + +subsect "4.1 Verify maxConcurrentNodes=1 was enforced" +log "With maxConcurrentNodes=1, only 1 drop Pod should run at a time across nodes." +log "Checking operator logs for pacing behavior..." 
+kubectl -n "$NAMESPACE" logs -l app.kubernetes.io/name=drop --tail=50 | grep -i "pacing\|concurrent\|delay\|requeue" || log "(No explicit pacing log lines — pacing is reflected in sequential Pod creation)" +echo "" + +subsect "4.2 Create second CachedImage with same policy (observe sequencing)" +cat </dev/null || echo "Pending") + if [ "$PHASE" = "Ready" ]; then + success "busybox-proof is Ready" + break + fi + sleep 3 +done +echo "" +log "Both CachedImages:" +kubectl get cachedimages +echo "" + +# ============================================================================= +section "PHASE 5: CachedImageSet — Multi-Image Management" +# ============================================================================= + +subsect "5.1 Create CachedImageSet with 3 images" +cat </dev/null || kubectl get cachedimages +echo "" + +subsect "5.3 Check owner references (ensures GC on set deletion)" +CHILD=$(kubectl get cachedimages -l drop.corewire.io/imageset=proof-set -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") +if [ -n "$CHILD" ]; then + log "OwnerReferences on child '$CHILD':" + kubectl get cachedimage "$CHILD" -o jsonpath='{.metadata.ownerReferences}' | jq . 2>/dev/null || kubectl get cachedimage "$CHILD" -o jsonpath='{.metadata.ownerReferences}' + success "OwnerReference points to CachedImageSet — Kubernetes GC will clean up on delete" +fi +echo "" + +subsect "5.4 Wait for set completion" +DEADLINE=$((SECONDS + TIMEOUT)) +while [ $SECONDS -lt $DEADLINE ]; do + READY_COUNT=$(kubectl get cachedimages -l drop.corewire.io/imageset=proof-set -o jsonpath='{range .items[*]}{.status.phase}{"\n"}{end}' 2>/dev/null | grep -c "Ready" || echo "0") + TOTAL_COUNT=$(kubectl get cachedimages -l drop.corewire.io/imageset=proof-set --no-headers 2>/dev/null | wc -l) + log "ImageSet progress: $READY_COUNT/$TOTAL_COUNT children Ready" + if [ "$READY_COUNT" -eq "$TOTAL_COUNT" ] && [ "$TOTAL_COUNT" -gt 0 ]; then + success "All images in set are cached!" 
+ break + fi + sleep 5 +done +echo "" + +# ============================================================================= +section "PHASE 6: Node Targeting (nodeSelector + tolerations)" +# ============================================================================= + +subsect "6.1 Label one worker as 'pool=gpu'" +WORKER=$(kubectl get nodes --no-headers | grep worker | head -1 | awk '{print $1}') +kubectl label node "$WORKER" pool=gpu --overwrite +success "Labeled $WORKER with pool=gpu" +echo "" + +subsect "6.2 Create CachedImage targeting only pool=gpu" +cat </dev/null || echo "?") +log "nodesTargeted=$NODES_TARGETED (expected: 1, only the labeled worker)" +if [ "$NODES_TARGETED" = "1" ]; then + success "Node targeting works — only 1 node targeted (the gpu-labeled worker)" +fi +echo "" + +# ============================================================================= +section "PHASE 7: Observability — Metrics" +# ============================================================================= + +subsect "7.1 Port-forward to metrics endpoint" +OPERATOR_POD=$(kubectl -n "$NAMESPACE" get pods -l app.kubernetes.io/name=drop -o jsonpath='{.items[0].metadata.name}') +kubectl -n "$NAMESPACE" port-forward "$OPERATOR_POD" 9090:8080 & +PF_PID=$! 
+sleep 3 + +subsect "7.2 Query Prometheus metrics" +log "Custom drop metrics:" +echo "" +METRICS=$(curl -s http://localhost:9090/metrics 2>/dev/null || echo "") +if [ -n "$METRICS" ]; then + echo "$METRICS" | grep "^drop_" | sort + echo "" + success "Metrics endpoint responds with custom drop_* metrics" + + echo "" + log "Key metric values:" + echo " drop_images_cached_total: $(echo "$METRICS" | grep '^drop_images_cached_total' | head -3)" + echo " drop_active_pulls: $(echo "$METRICS" | grep '^drop_active_pulls' || echo '0')" + echo " drop_pull_errors_total: $(echo "$METRICS" | grep '^drop_pull_errors_total' | head -3 || echo 'none')" + echo " drop_reconcile_total: $(echo "$METRICS" | grep '^drop_reconcile_total' | head -5)" +else + log "Could not reach metrics endpoint (may need different port)" +fi +kill $PF_PID 2>/dev/null || true +echo "" + +# ============================================================================= +section "PHASE 8: Operator Logs — Full Reconciliation Trace" +# ============================================================================= + +subsect "8.1 Complete operator logs" +log "Full operator logs showing all reconciliation cycles:" +echo "" +kubectl -n "$NAMESPACE" logs -l app.kubernetes.io/name=drop --tail=100 +echo "" + +# ============================================================================= +section "PHASE 9: Cleanup Verification" +# ============================================================================= + +subsect "9.1 Delete CachedImageSet and verify cascading GC" +kubectl delete cachedimageset proof-set +log "Waiting for child CachedImages to be garbage collected..." 
+sleep 10 +REMAINING_CHILDREN=$(kubectl get cachedimages -l drop.corewire.io/imageset=proof-set --no-headers 2>/dev/null | wc -l) +log "Remaining children after set deletion: $REMAINING_CHILDREN" +if [ "$REMAINING_CHILDREN" -eq 0 ]; then + success "Cascading garbage collection works — all children deleted" +else + log "GC may still be in progress" +fi +echo "" + +subsect "9.2 Final state" +log "All CachedImages:" +kubectl get cachedimages -o wide +echo "" +log "All PullPolicies:" +kubectl get pullpolicies -o wide +echo "" + +# ============================================================================= +section "PROOF SUMMARY" +# ============================================================================= + +echo -e "${GREEN}${BOLD}" +cat <<'SUMMARY' +┌─────────────────────────────────────────────────────────────────────────┐ +│ OPERATOR CORRECTNESS PROOF │ +├─────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ✓ CRDs registered: CachedImage, CachedImageSet, PullPolicy, │ +│ DiscoveryPolicy — all cluster-scoped under drop.corewire.io │ +│ │ +│ ✓ CachedImage reconciler: │ +│ - Creates short-lived Pods with command=["true"] (non-privileged) │ +│ - Pods placed on specific nodes via spec.nodeName │ +│ - kubelet pulls the image as a side effect of scheduling │ +│ - Pod completion = image cached; operator tracks per-node status │ +│ - Status transitions: Pending → Pulling → Ready │ +│ │ +│ ✓ PullPolicy pacing: │ +│ - maxConcurrentNodes limits parallel node pulls │ +│ - minDelayBetweenPulls spaces out pull operations │ +│ - failureBackoff provides exponential retry on errors │ +│ │ +│ ✓ CachedImageSet: │ +│ - Auto-creates child CachedImage resources from images[] list │ +│ - Sets ownerReferences for Kubernetes garbage collection │ +│ - Deleting the set cascades deletion to all children │ +│ │ +│ ✓ Node targeting: │ +│ - nodeSelector restricts pulls to matching nodes only │ +│ - tolerations allow scheduling on tainted nodes │ +│ │ +│ ✓ 
Observability: │ +│ - drop_images_cached_total — counter per image+node │ +│ - drop_pull_duration_seconds — histogram of pull times │ +│ - drop_pull_errors_total — counter per image+node │ +│ - drop_active_pulls — gauge of in-flight pull Pods │ +│ - drop_reconcile_total — counter per controller+result │ +│ - Kubernetes events: PullStarted, PullSucceeded, PullFailed │ +│ │ +│ ✓ Non-disruptive: Pulls never cordon/drain nodes or affect │ +│ schedulability. The operator just creates lightweight Pods. │ +│ │ +└─────────────────────────────────────────────────────────────────────────┘ +SUMMARY +echo -e "${NC}" + +log "Full proof log can be captured with: ./hack/prove-operator.sh 2>&1 | tee proof-run.log" +log "Done." diff --git a/internal/controller/cachedimage_controller.go b/internal/controller/cachedimage_controller.go new file mode 100644 index 0000000..fae1dff --- /dev/null +++ b/internal/controller/cachedimage_controller.go @@ -0,0 +1,762 @@ +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package controller + +import ( + "context" + "fmt" + "strings" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/handler" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + dropv1alpha1 "github.com/Breee/drop/api/v1alpha1" + dropmetrics "github.com/Breee/drop/internal/metrics" + "github.com/Breee/drop/internal/pacing" + "github.com/Breee/drop/internal/podbuilder" +) + +const ( + conditionTypeReady = "Ready" + phasePending = "Pending" + phaseReady = "Ready" + phasePulling = "Pulling" + phaseDegraded = "Degraded" +) + +// CachedImageReconciler reconciles a CachedImage object +type CachedImageReconciler struct { + client.Client + Scheme *runtime.Scheme + PacingEngine *pacing.Engine + Recorder record.EventRecorder + PodNamespace string +} + +// +kubebuilder:rbac:groups=drop.corewire.io,resources=cachedimages,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=drop.corewire.io,resources=cachedimages/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=drop.corewire.io,resources=cachedimages/finalizers,verbs=update +// +kubebuilder:rbac:groups=drop.corewire.io,resources=pullpolicies,verbs=get;list;watch +// +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;delete +// +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch +// +kubebuilder:rbac:groups="",resources=events,verbs=create;patch + +// nodeState tracks the pull state for a single node. +type nodeState struct { + pod *corev1.Pod + ready bool + failed bool + failReason string // e.g. 
"ErrImagePull", "ImagePullBackOff", "PodFailed"
	failMessage string
}

// Reconcile moves the cluster state closer to the desired state for a CachedImage.
//
// The steps are: fetch the CachedImage, resolve its target nodes and PullPolicy,
// rebuild the per-node state from the existing pull Pods, evaluate those Pods,
// schedule further pulls, patch the status subresource, and finally pick a
// requeue delay.
//
// It returns an error (and therefore relies on the controller's own retry) for
// any API failure; otherwise the returned Result carries the requeue delay, if any.
func (r *CachedImageReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	// 1. Fetch CachedImage
	ci := &dropv1alpha1.CachedImage{}
	if err := r.Get(ctx, req.NamespacedName, ci); err != nil {
		if errors.IsNotFound(err) {
			// CachedImage was deleted — clean up any orphaned drop pods
			return ctrl.Result{}, r.cleanupOrphanPods(ctx, req.Name)
		}
		return ctrl.Result{}, err
	}

	// Snapshot the object exactly as it was read, BEFORE any step below mutates
	// ci.Status. The merge patch sent in step 11 is the diff between this
	// snapshot and the final object; taking the snapshot later would silently
	// drop every status field written in between (processPodStates sets
	// Status.ResolvedDigest).
	statusBase := ci.DeepCopy()

	// 2-3. Resolve target nodes
	targetNodes, err := r.resolveTargetNodes(ctx, ci)
	if err != nil {
		return ctrl.Result{}, err
	}

	// 4. Fetch referenced PullPolicy
	policy, err := r.fetchPullPolicy(ctx, ci)
	if err != nil {
		return ctrl.Result{}, err
	}

	// 5-6. Build per-node state from owned Pods
	stateMap, err := r.buildNodeStateMap(ctx, ci, targetNodes)
	if err != nil {
		return ctrl.Result{}, err
	}

	// 6.5. If repull is due, mark cached nodes as needing re-pull
	r.markNodesForRepull(ci, policy, stateMap)

	// 7-8. Process pod states
	nodesReady, requeueNeeded := r.processPodStates(ctx, ci, stateMap)

	// 9-10. Schedule pulls for nodes that need them
	requeueAfter, pullRequeue, err := r.schedulePulls(ctx, ci, policy, stateMap)
	if err != nil {
		return ctrl.Result{}, err
	}
	requeueNeeded = requeueNeeded || pullRequeue

	// 11. Update status via patch (avoids conflict on rapid reconciles)
	nodesTargeted := int32(len(targetNodes))
	now := metav1.Now()
	r.updateCachedImageStatus(ci, stateMap, nodesTargeted, nodesReady, now)

	if err := r.Status().Patch(ctx, ci, client.MergeFrom(statusBase)); err != nil {
		return ctrl.Result{}, fmt.Errorf("patching status: %w", err)
	}

	// 12. Determine requeue
	// If degraded with no running pods, apply exponential backoff based on PullPolicy config.
	if ci.Status.Phase == phaseDegraded && !requeueNeeded {
		backoff := computeBackoff(policy, ci.Status.ConsecutiveFailures)
		return ctrl.Result{RequeueAfter: backoff}, nil
	}

	if requeueNeeded {
		// Fall back to a short poll when the scheduler did not ask for a specific delay.
		if requeueAfter == 0 {
			requeueAfter = 5 * time.Second
		}
		return ctrl.Result{RequeueAfter: requeueAfter}, nil
	}

	// If fully cached and repull is enabled, schedule next re-pull.
	if ci.Status.Phase == phaseReady {
		if interval := r.repullInterval(ci, policy); interval > 0 {
			return ctrl.Result{RequeueAfter: interval}, nil
		}
	}

	return ctrl.Result{}, nil
}

// computeBackoff calculates exponential backoff delay from PullPolicy config and failure count.
// Defaults: initial=30s, max=5m. Doubles on each consecutive failure.
//
// The first failure (failures <= 1) waits the initial delay; every further
// failure doubles it. The result never exceeds the maximum, including when the
// configured initial delay is itself larger than the maximum. A nil policy, a
// nil FailureBackoff, or non-positive configured durations select the defaults.
func computeBackoff(policy *dropv1alpha1.PullPolicy, failures int32) time.Duration {
	initial := 30 * time.Second
	maxDelay := 5 * time.Minute

	if policy != nil && policy.Spec.FailureBackoff != nil {
		if d := policy.Spec.FailureBackoff.Initial.Duration; d > 0 {
			initial = d
		}
		if d := policy.Spec.FailureBackoff.Max.Duration; d > 0 {
			maxDelay = d
		}
	}

	// A misconfigured initial >= max must still honour the cap.
	if initial >= maxDelay {
		return maxDelay
	}

	delay := initial
	for i := int32(1); i < failures; i++ {
		// Equivalent to "double, then clamp", but cannot overflow.
		if delay > maxDelay/2 {
			return maxDelay
		}
		delay *= 2
	}

	return delay
}

// repullInterval returns the repull interval from the PullPolicy, or 0 if disabled.
// Repull is disabled when there is no policy or the policy sets no RepullInterval.
func (r *CachedImageReconciler) repullInterval(_ *dropv1alpha1.CachedImage, policy *dropv1alpha1.PullPolicy) time.Duration {
	if policy == nil || policy.Spec.RepullInterval == nil {
		return 0
	}
	return policy.Spec.RepullInterval.Duration
}

// markNodesForRepull clears the ready state on cached nodes when a repull is due.
+func (r *CachedImageReconciler) markNodesForRepull(ci *dropv1alpha1.CachedImage, policy *dropv1alpha1.PullPolicy, stateMap map[string]*nodeState) { + interval := r.repullInterval(ci, policy) + if interval <= 0 { + return + } + // Check if enough time has passed since last successful pull + if ci.Status.LastPulledAt == nil { + return + } + elapsed := time.Since(ci.Status.LastPulledAt.Time) + if elapsed < interval { + return + } + // Time to re-pull: clear ready state on nodes that have no active pod + for _, state := range stateMap { + if state.ready && state.pod == nil { + state.ready = false + } + } +} + +// resolveTargetNodes lists and filters nodes matching the CachedImage spec. +func (r *CachedImageReconciler) resolveTargetNodes(ctx context.Context, ci *dropv1alpha1.CachedImage) ([]corev1.Node, error) { + nodeList := &corev1.NodeList{} + listOpts := &client.ListOptions{} + if len(ci.Spec.NodeSelector) > 0 { + listOpts.LabelSelector = labels.SelectorFromSet(ci.Spec.NodeSelector) + } + if err := r.List(ctx, nodeList, listOpts); err != nil { + return nil, fmt.Errorf("listing nodes: %w", err) + } + return filterNodesByTolerations(nodeList.Items, ci.Spec.Tolerations), nil +} + +// fetchPullPolicy retrieves the referenced PullPolicy, if any. +func (r *CachedImageReconciler) fetchPullPolicy(ctx context.Context, ci *dropv1alpha1.CachedImage) (*dropv1alpha1.PullPolicy, error) { + if ci.Spec.PolicyRef == nil { + return nil, nil + } + log := logf.FromContext(ctx) + policy := &dropv1alpha1.PullPolicy{} + policyKey := client.ObjectKey{Name: ci.Spec.PolicyRef.Name} + if err := r.Get(ctx, policyKey, policy); err != nil { + if !errors.IsNotFound(err) { + return nil, fmt.Errorf("fetching PullPolicy: %w", err) + } + log.Info("referenced PullPolicy not found, using defaults", "policy", ci.Spec.PolicyRef.Name) + return nil, nil + } + return policy, nil +} + +// buildNodeStateMap creates the per-node state map from owned Pods. 
+func (r *CachedImageReconciler) buildNodeStateMap(ctx context.Context, ci *dropv1alpha1.CachedImage, targetNodes []corev1.Node) (map[string]*nodeState, error) { + log := logf.FromContext(ctx) + + podList := &corev1.PodList{} + ns := r.PodNamespace + if ns == "" { + ns = podbuilder.DefaultPodNamespace + } + if err := r.List(ctx, podList, client.InNamespace(ns), client.MatchingLabels{ + podbuilder.LabelManagedBy: podbuilder.LabelManagedByValue, + podbuilder.LabelCachedImage: ci.Name, + }); err != nil { + return nil, fmt.Errorf("listing owned pods: %w", err) + } + + // Build set of previously cached nodes from status + cachedSet := make(map[string]struct{}, len(ci.Status.CachedNodes)) + for _, n := range ci.Status.CachedNodes { + cachedSet[n] = struct{}{} + } + + stateMap := make(map[string]*nodeState, len(targetNodes)) + for i := range targetNodes { + ns := &nodeState{} + // Mark as ready if previously cached + if _, ok := cachedSet[targetNodes[i].Name]; ok { + ns.ready = true + } + stateMap[targetNodes[i].Name] = ns + } + + for i := range podList.Items { + pod := &podList.Items[i] + nodeName := pod.Labels[podbuilder.LabelNode] + state, exists := stateMap[nodeName] + if !exists { + if err := r.Delete(ctx, pod); client.IgnoreNotFound(err) != nil { + log.Error(err, "deleting orphan pod", "pod", pod.Name) + } + continue + } + state.pod = pod + } + + return stateMap, nil +} + +// processPodStates evaluates completed/failed/running pods and returns ready count. 
+func (r *CachedImageReconciler) processPodStates(ctx context.Context, ci *dropv1alpha1.CachedImage, stateMap map[string]*nodeState) (int32, bool) { + log := logf.FromContext(ctx) + var nodesReady int32 + var requeueNeeded bool + + for nodeName, state := range stateMap { + // Count nodes already cached (from previous reconciles) + if state.ready && state.pod == nil { + nodesReady++ + continue + } + + if state.pod == nil { + continue + } + + switch state.pod.Status.Phase { + case corev1.PodSucceeded: + state.ready = true + nodesReady++ + // Capture the resolved digest from the container runtime + if digest := extractResolvedDigest(state.pod); digest != "" { + ci.Status.ResolvedDigest = digest + } + dropmetrics.ActivePulls.Dec() + dropmetrics.ImagesCachedTotal.WithLabelValues(ci.Spec.Image, nodeName).Inc() + r.Recorder.Eventf(ci, corev1.EventTypeNormal, "PullSucceeded", "Image %s cached on node %s", ci.Spec.Image, nodeName) + if err := r.Delete(ctx, state.pod); client.IgnoreNotFound(err) != nil { + log.Error(err, "deleting succeeded pod", "pod", state.pod.Name, "node", nodeName) + } + case corev1.PodFailed: + state.failed = true + state.failReason, state.failMessage = extractPodFailureReason(state.pod) + dropmetrics.ActivePulls.Dec() + dropmetrics.PullErrorsTotal.WithLabelValues(ci.Spec.Image, nodeName).Inc() + r.Recorder.Eventf(ci, corev1.EventTypeWarning, state.failReason, "Failed to pull image %s on node %s: %s", ci.Spec.Image, nodeName, state.failMessage) + log.Info("drop pod failed", "pod", state.pod.Name, "node", nodeName, "reason", state.failReason) + if err := r.Delete(ctx, state.pod); client.IgnoreNotFound(err) != nil { + log.Error(err, "deleting failed pod", "pod", state.pod.Name, "node", nodeName) + } + case corev1.PodRunning, corev1.PodPending: + // Check for image pull errors on waiting containers + if reason, msg := extractContainerWaitingReason(state.pod); reason != "" { + state.failed = true + state.failReason = reason + state.failMessage = msg + 
dropmetrics.ActivePulls.Dec() + dropmetrics.PullErrorsTotal.WithLabelValues(ci.Spec.Image, nodeName).Inc() + r.Recorder.Eventf(ci, corev1.EventTypeWarning, reason, "Image %s on node %s: %s", ci.Spec.Image, nodeName, msg) + // Delete the stuck pod; backoff retry will create a new one + if err := r.Delete(ctx, state.pod); client.IgnoreNotFound(err) != nil { + log.Error(err, "deleting stuck pod", "pod", state.pod.Name, "node", nodeName) + } + } else { + requeueNeeded = true + } + } + } + + return nodesReady, requeueNeeded +} + +// extractContainerWaitingReason checks init/regular container statuses for image pull errors. +func extractContainerWaitingReason(pod *corev1.Pod) (string, string) { + for _, cs := range append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...) { + if cs.State.Waiting != nil { + switch cs.State.Waiting.Reason { + case "ErrImagePull", "ImagePullBackOff", "InvalidImageName", "RegistryUnavailable": + return cs.State.Waiting.Reason, cleanPullMessage(cs.State.Waiting.Message) + } + } + } + return "", "" +} + +// cleanPullMessage extracts the root cause from verbose kubelet error chains. 
+// Input like: Back-off pulling image "img": ErrImagePull: failed to pull and unpack image "img": +// +// failed to resolve reference "img": failed to do request: Head "https://...": +// dial tcp: lookup registry.invalid.local on 172.30.0.1:53: server misbehaving +// +// Output: "dns: cannot resolve registry.invalid.local" +func cleanPullMessage(msg string) string { + lower := strings.ToLower(msg) + + // DNS errors + if strings.Contains(lower, "no such host") || strings.Contains(lower, "server misbehaving") { + if host := extractHostFromPullError(msg); host != "" { + return fmt.Sprintf("dns: cannot resolve %s", host) + } + } + + // Connection refused + if strings.Contains(lower, "connection refused") { + if host := extractHostFromPullError(msg); host != "" { + return fmt.Sprintf("connection refused: %s", host) + } + } + + // TLS errors + if strings.Contains(lower, "x509") || strings.Contains(lower, "certificate") { + return "tls: certificate error" + } + + // Timeout + if strings.Contains(lower, "timeout") || strings.Contains(lower, "deadline exceeded") { + if host := extractHostFromPullError(msg); host != "" { + return fmt.Sprintf("timeout connecting to %s", host) + } + return "timeout" + } + + // Auth errors + if strings.Contains(lower, "401") || strings.Contains(lower, "unauthorized") { + return "unauthorized: check imagePullSecrets" + } + if strings.Contains(lower, "403") || strings.Contains(lower, "forbidden") { + return "forbidden: access denied" + } + + // 404 / not found + if strings.Contains(lower, "not found") || strings.Contains(lower, "404") || strings.Contains(lower, "manifest unknown") { + return "image not found" + } + + // Fallback: take the last meaningful segment + parts := strings.Split(msg, ": ") + if len(parts) > 2 { + return strings.Join(parts[len(parts)-2:], ": ") + } + if len(msg) > 120 { + return msg[:120] + "..." + } + return msg +} + +// extractHostFromPullError pulls the registry host from a kubelet pull error message. 
+func extractHostFromPullError(msg string) string { + // Look for "lookup on" pattern + if idx := strings.Index(msg, "lookup "); idx != -1 { + rest := msg[idx+len("lookup "):] + if end := strings.IndexAny(rest, " :"); end != -1 { + return rest[:end] + } + } + // Look for "https://" or "http://" + for _, scheme := range []string{"https://", "http://"} { + if idx := strings.Index(msg, scheme); idx != -1 { + rest := msg[idx+len(scheme):] + if end := strings.IndexAny(rest, "/?\" "); end != -1 { + return rest[:end] + } + } + } + return "" +} + +// extractResolvedDigest extracts the image digest from a succeeded pod's container status. +// The kubelet reports the resolved imageID as "docker-pullable://image@sha256:abc..." or "image@sha256:abc...". +func extractResolvedDigest(pod *corev1.Pod) string { + for _, cs := range pod.Status.ContainerStatuses { + if cs.ImageID != "" { + // ImageID is typically "docker-pullable://registry/repo@sha256:..." or "registry/repo@sha256:..." + if idx := strings.Index(cs.ImageID, "sha256:"); idx != -1 { + return cs.ImageID[idx:] + } + } + } + return "" +} + +// extractPodFailureReason extracts a reason from a failed pod's container statuses or status message. +func extractPodFailureReason(pod *corev1.Pod) (string, string) { + // Check terminated container reasons first + for _, cs := range append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...) { + if cs.State.Terminated != nil && cs.State.Terminated.Reason != "" { + return cs.State.Terminated.Reason, cleanPullMessage(cs.State.Terminated.Message) + } + } + // Fall back to pod status reason/message + if pod.Status.Reason != "" { + return pod.Status.Reason, cleanPullMessage(pod.Status.Message) + } + return "PodFailed", cleanPullMessage(pod.Status.Message) +} + +// schedulePulls creates drop pods for nodes that need them, respecting pacing. 
+func (r *CachedImageReconciler) schedulePulls(ctx context.Context, ci *dropv1alpha1.CachedImage, policy *dropv1alpha1.PullPolicy, stateMap map[string]*nodeState) (time.Duration, bool, error) { + log := logf.FromContext(ctx) + var requeueAfter time.Duration + var requeueNeeded bool + + // If any node failed THIS reconcile, don't create new pods. + // The image is broken — it will fail on all nodes. Let the requeue timer handle retry. + for _, state := range stateMap { + if state.failed { + log.V(1).Info("failure observed this reconcile, skipping all pulls") + return 0, false, nil + } + } + + // If we have consecutive failures from previous reconciles, enforce backoff. + if ci.Status.ConsecutiveFailures > 0 { + backoff := computeBackoff(policy, ci.Status.ConsecutiveFailures) + if ci.Status.LastAttemptedAt != nil { + elapsed := time.Since(ci.Status.LastAttemptedAt.Time) + if elapsed < backoff { + remaining := backoff - elapsed + log.V(1).Info("in backoff period, skipping pulls", "remaining", remaining, "failures", ci.Status.ConsecutiveFailures) + return remaining, true, nil + } + } else { + // No LastAttemptedAt yet (pre-existing resource) — backoff and let status patch set it. 
+ log.V(1).Info("backoff: no lastAttemptedAt, will set on next status patch", "failures", ci.Status.ConsecutiveFailures) + return backoff, true, nil + } + } + + for nodeName, state := range stateMap { + if state.ready || state.pod != nil || state.failed { + continue + } + + decision, err := r.PacingEngine.CanStartPull(ctx, policy, ci.Name) + if err != nil { + return 0, false, fmt.Errorf("checking pacing: %w", err) + } + + if !decision.Allowed { + if decision.RequeueIn > requeueAfter { + requeueAfter = decision.RequeueIn + } + requeueNeeded = true + continue + } + + pod, err := podbuilder.BuildDropPod(ci, nodeName, r.PodNamespace) + if err != nil { + return 0, false, fmt.Errorf("building drop pod: %w", err) + } + + if err := r.Create(ctx, pod); err != nil { + if !errors.IsAlreadyExists(err) { + return 0, false, fmt.Errorf("creating drop pod: %w", err) + } + } else { + // Mark the attempt time so backoff is measured from now + now := metav1.Now() + ci.Status.LastAttemptedAt = &now + dropmetrics.ActivePulls.Inc() + r.Recorder.Eventf(ci, corev1.EventTypeNormal, "PullStarted", "Started pulling image %s on node %s", ci.Spec.Image, nodeName) + log.Info("created drop pod", "pod", pod.Name, "node", nodeName, "image", ci.Spec.Image) + } + + requeueNeeded = true + break // Create one pod at a time, respecting pacing + } + + return requeueAfter, requeueNeeded, nil +} + +// updateCachedImageStatus computes and sets the status fields on the CachedImage. 
func (r *CachedImageReconciler) updateCachedImageStatus(ci *dropv1alpha1.CachedImage, stateMap map[string]*nodeState, nodesTargeted, nodesReady int32, now metav1.Time) {
	// Only mutates ci.Status in memory; nothing is written to the API here.

	// Base phase from the counters: Ready when every targeted node is ready,
	// Pulling when some are, Pending otherwise.
	phase := phasePending
	if nodesReady == nodesTargeted && nodesTargeted > 0 {
		phase = phaseReady
	} else if nodesReady > 0 {
		phase = phasePulling
	}

	// Collect failure info
	// Any node that failed without being ready forces Degraded. The first
	// non-empty reason encountered is kept.
	// NOTE(review): map iteration order is random, so with several failing
	// nodes the reported reason may differ between reconciles.
	var failReason, failMessage string
	var newFailureObserved bool
	for _, state := range stateMap {
		if state.failed && !state.ready {
			phase = phaseDegraded
			newFailureObserved = true
			if state.failReason != "" && failReason == "" {
				failReason = state.failReason
				failMessage = state.failMessage
			}
		}
	}

	// If no new failure but we have previous failures and aren't Ready yet, stay Degraded
	if !newFailureObserved && ci.Status.ConsecutiveFailures > 0 && phase != phaseReady {
		phase = phaseDegraded
		// Preserve the last known failure reason from existing condition
		if existing := meta.FindStatusCondition(ci.Status.Conditions, conditionTypeReady); existing != nil && existing.Status == metav1.ConditionFalse {
			failReason = existing.Reason
			failMessage = existing.Message
		}
	}

	// Persist the list of nodes that have successfully cached the image
	// NOTE(review): the list is built in map-iteration order and is not
	// sorted, so its order can change between reconciles even when the set of
	// nodes does not — consider sorting to keep status writes stable.
	cachedNodes := make([]string, 0, nodesReady)
	for nodeName, state := range stateMap {
		if state.ready {
			cachedNodes = append(cachedNodes, nodeName)
		}
	}

	ci.Status.ObservedGeneration = ci.Generation
	ci.Status.NodesTargeted = nodesTargeted
	ci.Status.NodesReady = nodesReady
	ci.Status.Ready = fmt.Sprintf("%d/%d", nodesReady, nodesTargeted)
	ci.Status.CachedNodes = cachedNodes
	ci.Status.Phase = phase

	// Track consecutive failures for backoff calculation.
	// Only increment when we actually observed a new failure this reconcile.
	if newFailureObserved {
		ci.Status.ConsecutiveFailures++
		ci.Status.LastAttemptedAt = &now
	} else if phase == phaseReady {
		ci.Status.ConsecutiveFailures = 0
	}
	// If phase is Degraded but no new failure observed (idle requeue), preserve current CF.

	// NOTE(review): this stamps LastPulledAt on every call while any node is
	// ready, not only when a pull completed in this reconcile — confirm that
	// is the intended meaning of the field.
	if nodesReady > 0 {
		ci.Status.LastPulledAt = &now
	}

	// Build the Ready condition that mirrors the phase computed above.
	readyCondition := metav1.Condition{
		Type:               conditionTypeReady,
		ObservedGeneration: ci.Generation,
		LastTransitionTime: now,
	}
	switch {
	case phase == phaseReady:
		readyCondition.Status = metav1.ConditionTrue
		readyCondition.Reason = "Cached"
		readyCondition.Message = fmt.Sprintf("Image cached on all %d target nodes", nodesTargeted)
	case phase == phaseDegraded && failReason != "":
		// Degraded with a known cause: surface the reason (and message, if any).
		readyCondition.Status = metav1.ConditionFalse
		readyCondition.Reason = failReason
		if failMessage != "" {
			readyCondition.Message = failMessage
		} else {
			readyCondition.Message = fmt.Sprintf("%d/%d nodes ready", nodesReady, nodesTargeted)
		}
	case phase == phaseDegraded:
		// Degraded without a recorded cause.
		readyCondition.Status = metav1.ConditionFalse
		readyCondition.Reason = "PullFailed"
		readyCondition.Message = fmt.Sprintf("%d/%d nodes ready", nodesReady, nodesTargeted)
	default:
		// Pending or Pulling.
		readyCondition.Status = metav1.ConditionFalse
		readyCondition.Reason = "InProgress"
		readyCondition.Message = fmt.Sprintf("%d/%d nodes ready", nodesReady, nodesTargeted)
	}
	meta.SetStatusCondition(&ci.Status.Conditions, readyCondition)
}

// filterNodesByTolerations returns nodes whose taints are tolerated.
+func filterNodesByTolerations(nodes []corev1.Node, tolerations []corev1.Toleration) []corev1.Node { + if len(tolerations) == 0 { + // If no tolerations, only accept nodes without NoSchedule/NoExecute taints + var result []corev1.Node + for i := range nodes { + if !hasUntoleratableTaints(nodes[i].Spec.Taints) { + result = append(result, nodes[i]) + } + } + return result + } + + var result []corev1.Node + for i := range nodes { + if allTaintsTolerated(nodes[i].Spec.Taints, tolerations) { + result = append(result, nodes[i]) + } + } + return result +} + +// hasUntoleratableTaints checks if any taint prevents scheduling. +func hasUntoleratableTaints(taints []corev1.Taint) bool { + for _, taint := range taints { + if taint.Effect == corev1.TaintEffectNoSchedule || taint.Effect == corev1.TaintEffectNoExecute { + return true + } + } + return false +} + +// allTaintsTolerated checks if all NoSchedule/NoExecute taints are tolerated. +func allTaintsTolerated(taints []corev1.Taint, tolerations []corev1.Toleration) bool { + for _, taint := range taints { + if taint.Effect != corev1.TaintEffectNoSchedule && taint.Effect != corev1.TaintEffectNoExecute { + continue + } + if !taintTolerated(taint, tolerations) { + return false + } + } + return true +} + +// taintTolerated checks if a single taint is tolerated by any toleration. 
+func taintTolerated(taint corev1.Taint, tolerations []corev1.Toleration) bool { + for _, toleration := range tolerations { + if toleration.Operator == corev1.TolerationOpExists { + if toleration.Key == "" { + return true // Tolerates everything + } + if toleration.Key == taint.Key { + if toleration.Effect == "" || toleration.Effect == taint.Effect { + return true + } + } + } + if toleration.Operator == corev1.TolerationOpEqual || toleration.Operator == "" { + if toleration.Key == taint.Key && toleration.Value == taint.Value { + if toleration.Effect == "" || toleration.Effect == taint.Effect { + return true + } + } + } + } + return false +} + +// cleanupOrphanPods deletes all drop pods that reference a deleted CachedImage. +func (r *CachedImageReconciler) cleanupOrphanPods(ctx context.Context, cachedImageName string) error { + log := logf.FromContext(ctx) + ns := r.PodNamespace + if ns == "" { + ns = podbuilder.DefaultPodNamespace + } + podList := &corev1.PodList{} + if err := r.List(ctx, podList, client.InNamespace(ns), client.MatchingLabels{ + podbuilder.LabelManagedBy: podbuilder.LabelManagedByValue, + podbuilder.LabelCachedImage: cachedImageName, + }); err != nil { + return fmt.Errorf("listing orphan pods: %w", err) + } + for i := range podList.Items { + log.Info("deleting orphan pod", "pod", podList.Items[i].Name, "cachedImage", cachedImageName) + if err := r.Delete(ctx, &podList.Items[i]); client.IgnoreNotFound(err) != nil { + return fmt.Errorf("deleting orphan pod %s: %w", podList.Items[i].Name, err) + } + } + return nil +} + +// SetupWithManager sets up the controller with the Manager. +func (r *CachedImageReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&dropv1alpha1.CachedImage{}). + // Watch drop pods and map them back to the owning CachedImage via label. + // We can't use Owns() because CachedImage is cluster-scoped and pods are namespaced. 
+ Watches(&corev1.Pod{}, handler.EnqueueRequestsFromMapFunc( + func(ctx context.Context, obj client.Object) []reconcile.Request { + pod, ok := obj.(*corev1.Pod) + if !ok { + return nil + } + if pod.Labels[podbuilder.LabelManagedBy] != podbuilder.LabelManagedByValue { + return nil + } + ciName := pod.Labels[podbuilder.LabelCachedImage] + if ciName == "" { + return nil + } + return []reconcile.Request{{NamespacedName: types.NamespacedName{Name: ciName}}} + }, + )). + Named("cachedimage"). + Complete(r) +} diff --git a/internal/controller/cachedimage_controller_test.go b/internal/controller/cachedimage_controller_test.go new file mode 100644 index 0000000..51a99da --- /dev/null +++ b/internal/controller/cachedimage_controller_test.go @@ -0,0 +1,86 @@ +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + dropv1alpha1 "github.com/Breee/drop/api/v1alpha1" + "github.com/Breee/drop/internal/pacing" +) + +var _ = Describe("CachedImage Controller", func() { + Context("When reconciling a resource", func() { + const resourceName = "test-cachedimage" + + ctx := context.Background() + + typeNamespacedName := types.NamespacedName{ + Name: resourceName, + } + cachedimage := &dropv1alpha1.CachedImage{} + + BeforeEach(func() { + By("creating the custom resource for the Kind CachedImage") + err := k8sClient.Get(ctx, typeNamespacedName, cachedimage) + if err != nil && errors.IsNotFound(err) { + resource := &dropv1alpha1.CachedImage{ + ObjectMeta: metav1.ObjectMeta{ + Name: resourceName, + }, + Spec: dropv1alpha1.CachedImageSpec{ + Image: "docker.io/library/nginx", + Tag: "1.25", + }, + } + Expect(k8sClient.Create(ctx, resource)).To(Succeed()) + } + }) + + AfterEach(func() { + resource := &dropv1alpha1.CachedImage{} + err := k8sClient.Get(ctx, typeNamespacedName, resource) + if err == nil { + By("Cleanup the specific resource instance CachedImage") + Expect(k8sClient.Delete(ctx, resource)).To(Succeed()) + } + }) + + It("should successfully reconcile the resource", func() { + By("Reconciling the created resource") + controllerReconciler := &CachedImageReconciler{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + PodNamespace: "drop-system", + PacingEngine: pacing.NewEngine(k8sClient, "drop-system"), + } + + _, err := controllerReconciler.Reconcile(ctx, reconcile.Request{ + NamespacedName: typeNamespacedName, + }) + Expect(err).NotTo(HaveOccurred()) + }) + }) +}) diff --git a/internal/controller/cachedimageset_controller.go b/internal/controller/cachedimageset_controller.go new file mode 100644 index 0000000..40c2265 --- /dev/null +++ 
b/internal/controller/cachedimageset_controller.go @@ -0,0 +1,337 @@ +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + "fmt" + "strings" + + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/handler" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + dropv1alpha1 "github.com/Breee/drop/api/v1alpha1" +) + +const labelImageSet = "drop.corewire.io/imageset" + +// CachedImageSetReconciler reconciles a CachedImageSet object +type CachedImageSetReconciler struct { + client.Client + Scheme *runtime.Scheme +} + +// +kubebuilder:rbac:groups=drop.corewire.io,resources=cachedimagesets,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=drop.corewire.io,resources=cachedimagesets/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=drop.corewire.io,resources=cachedimagesets/finalizers,verbs=update +// +kubebuilder:rbac:groups=drop.corewire.io,resources=discoverypolicies,verbs=get;list;watch + +// Reconcile manages child CachedImage resources for a CachedImageSet. 
+func (r *CachedImageSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + log := logf.FromContext(ctx) + + // 1. Fetch CachedImageSet + imageSet := &dropv1alpha1.CachedImageSet{} + if err := r.Get(ctx, req.NamespacedName, imageSet); err != nil { + if errors.IsNotFound(err) { + return ctrl.Result{}, nil + } + return ctrl.Result{}, err + } + + // 2. Build desired image list + desiredImages := r.buildDesiredImages(ctx, imageSet) + + // 3. List existing child CachedImage resources + existingChildren := &dropv1alpha1.CachedImageList{} + if err := r.List(ctx, existingChildren, client.MatchingLabels{ + labelImageSet: imageSet.Name, + }); err != nil { + return ctrl.Result{}, fmt.Errorf("listing children: %w", err) + } + + // Build map of existing children by image ref + existingMap := make(map[string]*dropv1alpha1.CachedImage, len(existingChildren.Items)) + for i := range existingChildren.Items { + child := &existingChildren.Items[i] + ref := buildChildImageRef(child) + existingMap[ref] = child + } + + // 4. 
Diff: create new, delete removed + desiredSet := make(map[string]dropv1alpha1.ImageEntry, len(desiredImages)) + for _, img := range desiredImages { + ref := buildEntryRef(img) + desiredSet[ref] = img + } + + // Delete children that are no longer desired + for ref, child := range existingMap { + if _, wanted := desiredSet[ref]; !wanted { + log.Info("deleting child CachedImage", "name", child.Name, "image", ref) + if err := r.Delete(ctx, child); client.IgnoreNotFound(err) != nil { + return ctrl.Result{}, fmt.Errorf("deleting child: %w", err) + } + } + } + + // Create children that don't exist yet + for ref, img := range desiredSet { + if _, exists := existingMap[ref]; exists { + continue + } + + child := r.buildChildCachedImage(imageSet, img) + if err := controllerutil.SetControllerReference(imageSet, child, r.Scheme); err != nil { + return ctrl.Result{}, fmt.Errorf("setting owner reference: %w", err) + } + + log.Info("creating child CachedImage", "name", child.Name, "image", ref) + if err := r.Create(ctx, child); err != nil { + if !errors.IsAlreadyExists(err) { + return ctrl.Result{}, fmt.Errorf("creating child: %w", err) + } + } + } + + // 5. 
Update status + // Re-list children after mutations + patch := client.MergeFrom(imageSet.DeepCopy()) + if err := r.List(ctx, existingChildren, client.MatchingLabels{ + "drop.corewire.io/imageset": imageSet.Name, + }); err != nil { + return ctrl.Result{}, fmt.Errorf("re-listing children: %w", err) + } + + var imagesReady int32 + var worstReason, worstMessage string + var hasDegraded bool + for i := range existingChildren.Items { + child := &existingChildren.Items[i] + switch child.Status.Phase { + case phaseReady: + imagesReady++ + case phaseDegraded: + hasDegraded = true + // Extract the child's failure reason for propagation + for _, c := range child.Status.Conditions { + if c.Type == conditionTypeReady && c.Status == metav1.ConditionFalse && c.Reason != "InProgress" { + worstReason = c.Reason + worstMessage = c.Message + } + } + } + } + + imageSet.Status.ObservedGeneration = imageSet.Generation + imageSet.Status.ImagesManaged = int32(len(existingChildren.Items)) + imageSet.Status.ImagesReady = imagesReady + + if imagesReady == int32(len(desiredImages)) && len(desiredImages) > 0 { + imageSet.Status.Phase = phaseReady + } else if hasDegraded { + imageSet.Status.Phase = phaseDegraded + } else { + imageSet.Status.Phase = phasePending + } + + now := metav1.Now() + readyCondition := metav1.Condition{ + Type: conditionTypeReady, + ObservedGeneration: imageSet.Generation, + LastTransitionTime: now, + } + switch { + case imageSet.Status.Phase == phaseReady: + readyCondition.Status = metav1.ConditionTrue + readyCondition.Reason = "Ready" + readyCondition.Message = fmt.Sprintf("All %d images are cached", imagesReady) + case hasDegraded && worstReason != "": + readyCondition.Status = metav1.ConditionFalse + readyCondition.Reason = "Degraded" + readyCondition.Message = fmt.Sprintf("%d/%d images cached, failing: %s", imagesReady, len(desiredImages), worstReason) + if worstMessage != "" { + readyCondition.Message = fmt.Sprintf("%d/%d images cached: %s", imagesReady, 
len(desiredImages), worstMessage) + } + default: + readyCondition.Status = metav1.ConditionFalse + readyCondition.Reason = "Progressing" + readyCondition.Message = fmt.Sprintf("%d/%d images cached", imagesReady, len(desiredImages)) + } + meta.SetStatusCondition(&imageSet.Status.Conditions, readyCondition) + + if err := r.Status().Patch(ctx, imageSet, patch); err != nil { + return ctrl.Result{}, fmt.Errorf("patching status: %w", err) + } + + return ctrl.Result{}, nil +} + +// buildDesiredImages constructs the desired image list from static images and discovery. +func (r *CachedImageSetReconciler) buildDesiredImages(ctx context.Context, imageSet *dropv1alpha1.CachedImageSet) []dropv1alpha1.ImageEntry { + var desired []dropv1alpha1.ImageEntry + + // Static images + desired = append(desired, imageSet.Spec.Images...) + + // Discovery policy images + if imageSet.Spec.DiscoveryPolicyRef != nil { + dp := &dropv1alpha1.DiscoveryPolicy{} + key := client.ObjectKey{Name: imageSet.Spec.DiscoveryPolicyRef.Name} + if err := r.Get(ctx, key, dp); err == nil { + for _, discovered := range dp.Status.DiscoveredImages { + entry := parseImageRef(discovered.Image) + desired = append(desired, entry) + } + } + } + + return desired +} + +// parseImageRef splits a full image reference into ImageEntry. +func parseImageRef(ref string) dropv1alpha1.ImageEntry { + if idx := strings.Index(ref, "@"); idx != -1 { + return dropv1alpha1.ImageEntry{ + Image: ref[:idx], + Digest: ref[idx+1:], + } + } + if idx := strings.LastIndex(ref, ":"); idx != -1 { + // Ensure it's a tag separator and not a port + afterColon := ref[idx+1:] + if !strings.Contains(afterColon, "/") { + return dropv1alpha1.ImageEntry{ + Image: ref[:idx], + Tag: afterColon, + } + } + } + return dropv1alpha1.ImageEntry{Image: ref} +} + +// buildChildCachedImage creates a CachedImage spec from an ImageEntry. 
+func (r *CachedImageSetReconciler) buildChildCachedImage(parent *dropv1alpha1.CachedImageSet, img dropv1alpha1.ImageEntry) *dropv1alpha1.CachedImage { + name := sanitizeName(fmt.Sprintf("%s-%s-%s", parent.Name, imageName(img.Image), img.Tag)) + if img.Digest != "" { + name = sanitizeName(fmt.Sprintf("%s-%s-digest", parent.Name, imageName(img.Image))) + } + + child := &dropv1alpha1.CachedImage{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{ + "drop.corewire.io/imageset": parent.Name, + }, + }, + Spec: dropv1alpha1.CachedImageSpec{ + Image: img.Image, + Tag: img.Tag, + Digest: img.Digest, + ImagePullPolicy: parent.Spec.ImagePullPolicy, + ImagePullSecrets: parent.Spec.ImagePullSecrets, + NodeSelector: parent.Spec.NodeSelector, + Tolerations: parent.Spec.Tolerations, + PolicyRef: parent.Spec.PolicyRef, + }, + } + + return child +} + +// buildChildImageRef creates a comparable ref from a CachedImage. +func buildChildImageRef(ci *dropv1alpha1.CachedImage) string { + return buildEntryRef(dropv1alpha1.ImageEntry{ + Image: ci.Spec.Image, + Tag: ci.Spec.Tag, + Digest: ci.Spec.Digest, + }) +} + +// buildEntryRef creates a comparable ref from an ImageEntry. +func buildEntryRef(entry dropv1alpha1.ImageEntry) string { + if entry.Digest != "" { + return fmt.Sprintf("%s@%s", entry.Image, entry.Digest) + } + tag := entry.Tag + if tag == "" { + tag = "latest" + } + return fmt.Sprintf("%s:%s", entry.Image, tag) +} + +// imageName extracts the short name from a full image reference. +func imageName(image string) string { + parts := strings.Split(image, "/") + return parts[len(parts)-1] +} + +// sanitizeName ensures the name is a valid k8s resource name. 
+func sanitizeName(name string) string { + name = strings.ToLower(name) + name = strings.ReplaceAll(name, "/", "-") + name = strings.ReplaceAll(name, ":", "-") + name = strings.ReplaceAll(name, ".", "-") + name = strings.ReplaceAll(name, "_", "-") + if len(name) > 253 { + name = name[:253] + } + return name +} + +// mapDiscoveryToSets maps DiscoveryPolicy changes to CachedImageSets that reference them. +func (r *CachedImageSetReconciler) mapDiscoveryToSets(ctx context.Context, obj client.Object) []reconcile.Request { + dp, ok := obj.(*dropv1alpha1.DiscoveryPolicy) + if !ok { + return nil + } + + setList := &dropv1alpha1.CachedImageSetList{} + if err := r.List(ctx, setList); err != nil { + return nil + } + + var requests []reconcile.Request + for i := range setList.Items { + set := &setList.Items[i] + if set.Spec.DiscoveryPolicyRef != nil && set.Spec.DiscoveryPolicyRef.Name == dp.Name { + requests = append(requests, reconcile.Request{ + NamespacedName: types.NamespacedName{Name: set.Name}, + }) + } + } + return requests +} + +// SetupWithManager sets up the controller with the Manager. +func (r *CachedImageSetReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&dropv1alpha1.CachedImageSet{}). + Owns(&dropv1alpha1.CachedImage{}). + Watches(&dropv1alpha1.DiscoveryPolicy{}, handler.EnqueueRequestsFromMapFunc(r.mapDiscoveryToSets)). + Named("cachedimageset"). + Complete(r) +} diff --git a/internal/controller/cachedimageset_controller_test.go b/internal/controller/cachedimageset_controller_test.go new file mode 100644 index 0000000..781cdf6 --- /dev/null +++ b/internal/controller/cachedimageset_controller_test.go @@ -0,0 +1,84 @@ +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + dropv1alpha1 "github.com/Breee/drop/api/v1alpha1" +) + +var _ = Describe("CachedImageSet Controller", func() { + Context("When reconciling a resource", func() { + const resourceName = "test-imageset" + + ctx := context.Background() + + typeNamespacedName := types.NamespacedName{ + Name: resourceName, + } + cachedimageset := &dropv1alpha1.CachedImageSet{} + + BeforeEach(func() { + By("creating the custom resource for the Kind CachedImageSet") + err := k8sClient.Get(ctx, typeNamespacedName, cachedimageset) + if err != nil && errors.IsNotFound(err) { + resource := &dropv1alpha1.CachedImageSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: resourceName, + }, + Spec: dropv1alpha1.CachedImageSetSpec{ + Images: []dropv1alpha1.ImageEntry{ + {Image: "docker.io/library/nginx", Tag: "1.25"}, + }, + }, + } + Expect(k8sClient.Create(ctx, resource)).To(Succeed()) + } + }) + + AfterEach(func() { + resource := &dropv1alpha1.CachedImageSet{} + err := k8sClient.Get(ctx, typeNamespacedName, resource) + if err == nil { + By("Cleanup the specific resource instance CachedImageSet") + Expect(k8sClient.Delete(ctx, resource)).To(Succeed()) + } + }) + + It("should successfully reconcile the resource", func() { + By("Reconciling the created resource") + controllerReconciler := &CachedImageSetReconciler{ + Client: 
k8sClient, + Scheme: k8sClient.Scheme(), + } + + _, err := controllerReconciler.Reconcile(ctx, reconcile.Request{ + NamespacedName: typeNamespacedName, + }) + Expect(err).NotTo(HaveOccurred()) + }) + }) +}) diff --git a/internal/controller/discoverypolicy_controller.go b/internal/controller/discoverypolicy_controller.go new file mode 100644 index 0000000..afeb283 --- /dev/null +++ b/internal/controller/discoverypolicy_controller.go @@ -0,0 +1,511 @@ +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package controller + +import ( + "context" + "crypto/tls" + "crypto/x509" + "errors" + "fmt" + "net" + "net/http" + "net/url" + "regexp" + "sort" + "strings" + "time" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + logf "sigs.k8s.io/controller-runtime/pkg/log" + + dropv1alpha1 "github.com/Breee/drop/api/v1alpha1" + "github.com/Breee/drop/internal/discovery" + dropmetrics "github.com/Breee/drop/internal/metrics" +) + +// DiscoveryPolicyReconciler reconciles a DiscoveryPolicy object +type DiscoveryPolicyReconciler struct { + client.Client + Scheme *runtime.Scheme +} + +const ( + reasonDNSError = "DNSError" + reasonConnectionRefused = "ConnectionRefused" +) + +// +kubebuilder:rbac:groups=drop.corewire.io,resources=discoverypolicies,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=drop.corewire.io,resources=discoverypolicies/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=drop.corewire.io,resources=discoverypolicies/finalizers,verbs=update +// +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch + +// Reconcile queries discovery sources and updates the DiscoveryPolicy status. +func (r *DiscoveryPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + log := logf.FromContext(ctx) + + // 1. Fetch DiscoveryPolicy + dp := &dropv1alpha1.DiscoveryPolicy{} + if err := r.Get(ctx, req.NamespacedName, dp); err != nil { + if apierrors.IsNotFound(err) { + return ctrl.Result{}, nil + } + return ctrl.Result{}, err + } + + // 2. 
Query each source + patch := client.MergeFrom(dp.DeepCopy()) + var allResults []discovery.ImageResult + allSourcesHealthy := true + var lastFailReason, lastFailMessage string + + for i, src := range dp.Spec.Sources { + source, err := r.buildSource(ctx, src) + if err != nil { + log.Error(err, "building source", "index", i, "type", src.Type) + allSourcesHealthy = false + lastFailReason, lastFailMessage = classifyError(err) + dropmetrics.DiscoverySourceHealth.WithLabelValues(dp.Name, src.Type, sourceEndpoint(src)).Set(0) + continue + } + + start := time.Now() + results, err := source.Fetch(ctx) + elapsed := time.Since(start).Seconds() + dropmetrics.DiscoverySourceLatencySeconds.WithLabelValues(dp.Name, src.Type).Observe(elapsed) + + if err != nil { + log.Error(err, "fetching from source", "index", i, "type", src.Type) + allSourcesHealthy = false + lastFailReason, lastFailMessage = classifyError(err) + dropmetrics.DiscoverySourceHealth.WithLabelValues(dp.Name, src.Type, sourceEndpoint(src)).Set(0) + continue + } + + dropmetrics.DiscoverySourceHealth.WithLabelValues(dp.Name, src.Type, sourceEndpoint(src)).Set(1) + + // Tag results with source type + for j := range results { + results[j] = discovery.ImageResult{ + Image: results[j].Image, + Score: results[j].Score, + } + } + dropmetrics.DiscoveryImagesFound.WithLabelValues(dp.Name, src.Type).Set(float64(len(results))) + allResults = append(allResults, results...) + } + + // 3. Merge results (deduplicate by image, keep highest score) + merged := deduplicateResults(allResults) + + // 4. Apply image filter + if dp.Spec.ImageFilter != "" { + re, err := regexp.Compile(dp.Spec.ImageFilter) + if err != nil { + log.Error(err, "compiling image filter regex") + } else { + var filtered []discovery.ImageResult + for _, r := range merged { + if re.MatchString(r.Image) { + filtered = append(filtered, r) + } + } + merged = filtered + } + } + + // 5. 
Sort by score descending, truncate to maxImages + sort.Slice(merged, func(i, j int) bool { + if merged[i].Score != merged[j].Score { + return merged[i].Score > merged[j].Score + } + return merged[i].Image < merged[j].Image + }) + + maxImages := dp.Spec.MaxImages + if maxImages <= 0 { + maxImages = 50 + } + if int32(len(merged)) > maxImages { + merged = merged[:maxImages] + } + + // 6. Write status + // On total failure and previous results exist, keep last good results + if len(merged) == 0 && !allSourcesHealthy && len(dp.Status.DiscoveredImages) > 0 { + log.Info("all sources failed, keeping previous discovery results") + } else { + discoveredImages := make([]dropv1alpha1.DiscoveredImage, 0, len(merged)) + for _, r := range merged { + discoveredImages = append(discoveredImages, dropv1alpha1.DiscoveredImage{ + Image: r.Image, + Score: r.Score, + Source: "discovery", + }) + } + dp.Status.DiscoveredImages = discoveredImages + } + + now := metav1.Now() + if allSourcesHealthy || len(merged) > 0 { + dp.Status.LastSyncTime = &now + } + + // 7. 
Set conditions + sourceCondition := metav1.Condition{ + Type: "SourceHealthy", + ObservedGeneration: dp.Generation, + LastTransitionTime: now, + } + if allSourcesHealthy { + sourceCondition.Status = metav1.ConditionTrue + sourceCondition.Reason = "AllSourcesHealthy" + sourceCondition.Message = "All discovery sources responded successfully" + } else { + sourceCondition.Status = metav1.ConditionFalse + sourceCondition.Reason = "SourceError" + sourceCondition.Message = "One or more sources failed to respond" + } + meta.SetStatusCondition(&dp.Status.Conditions, sourceCondition) + + readyCondition := metav1.Condition{ + Type: conditionTypeReady, + ObservedGeneration: dp.Generation, + LastTransitionTime: now, + } + if allSourcesHealthy { + readyCondition.Status = metav1.ConditionTrue + readyCondition.Reason = "Synced" + readyCondition.Message = fmt.Sprintf("Discovered %d images", len(dp.Status.DiscoveredImages)) + } else if len(dp.Status.DiscoveredImages) > 0 { + readyCondition.Status = metav1.ConditionTrue + readyCondition.Reason = "PartiallyFailed" + readyCondition.Message = fmt.Sprintf("Discovered %d images, but some sources failed: %s", len(dp.Status.DiscoveredImages), lastFailMessage) + } else { + readyCondition.Status = metav1.ConditionFalse + readyCondition.Reason = lastFailReason + if lastFailReason == "" { + readyCondition.Reason = "SyncFailed" + } + if lastFailMessage != "" { + readyCondition.Message = lastFailMessage + } else { + readyCondition.Message = "All sources failed, no images discovered" + } + } + meta.SetStatusCondition(&dp.Status.Conditions, readyCondition) + + // Set scalar counts for printer columns + dp.Status.SourceCount = int32(len(dp.Spec.Sources)) + dp.Status.ImageCount = int32(len(dp.Status.DiscoveredImages)) + + if err := r.Status().Patch(ctx, dp, patch); err != nil { + return ctrl.Result{}, fmt.Errorf("patching status: %w", err) + } + + // 8. 
Requeue after sync interval + syncInterval := dp.Spec.SyncInterval.Duration + if syncInterval == 0 { + syncInterval = 30 * time.Minute + } + + // If sources failed, return error → controller-runtime rate limiter + // applies exponential backoff (standard k8s pattern). + if !allSourcesHealthy && len(dp.Status.DiscoveredImages) == 0 { + return ctrl.Result{}, fmt.Errorf("discovery sync failed: %s", lastFailMessage) + } + + return ctrl.Result{RequeueAfter: syncInterval}, nil +} + +// buildSource creates the appropriate Source implementation from a DiscoverySource config. +func (r *DiscoveryPolicyReconciler) buildSource(ctx context.Context, src dropv1alpha1.DiscoverySource) (discovery.Source, error) { + httpClient, err := r.buildHTTPClient(ctx, src.SecretRef) + if err != nil { + return nil, fmt.Errorf("building HTTP client: %w", err) + } + + switch src.Type { + case "prometheus": + if src.Prometheus == nil { + return nil, fmt.Errorf("prometheus config is required when type=prometheus") + } + var lookback time.Duration + if src.Prometheus.Lookback != nil { + lookback = src.Prometheus.Lookback.Duration + } + return discovery.NewPrometheusSource(src.Prometheus.Endpoint, src.Prometheus.Query, lookback, src.Prometheus.Step, httpClient), nil + case "registry": + if src.Registry == nil { + return nil, fmt.Errorf("registry config is required when type=registry") + } + return discovery.NewRegistrySource( + src.Registry.URL, + src.Registry.Repositories, + src.Registry.TagFilter, + src.Registry.TopX, + src.Registry.ImageTemplate, + httpClient, + ), nil + default: + return nil, fmt.Errorf("unsupported source type: %s", src.Type) + } +} + +// buildHTTPClient creates an HTTP client with auth/TLS from a Secret. 
+func (r *DiscoveryPolicyReconciler) buildHTTPClient(ctx context.Context, secretRef *corev1.LocalObjectReference) (*http.Client, error) { + client := &http.Client{Timeout: 30 * time.Second} + + if secretRef == nil { + return client, nil + } + + secret := &corev1.Secret{} + // Secrets are namespaced; use kube-system for operator secrets + key := types.NamespacedName{Name: secretRef.Name, Namespace: "kube-system"} + if err := r.Get(ctx, key, secret); err != nil { + return nil, fmt.Errorf("fetching secret %s: %w", secretRef.Name, err) + } + + transport := &authTransport{ + base: http.DefaultTransport, + secret: secret, + } + + // Configure TLS if cert data is present + if caCert, ok := secret.Data["ca.crt"]; ok { + pool := x509.NewCertPool() + pool.AppendCertsFromPEM(caCert) + + tlsConfig := &tls.Config{ + RootCAs: pool, + MinVersion: tls.VersionTLS12, + } + + if cert, ok := secret.Data["tls.crt"]; ok { + if key, ok := secret.Data["tls.key"]; ok { + clientCert, err := tls.X509KeyPair(cert, key) + if err == nil { + tlsConfig.Certificates = []tls.Certificate{clientCert} + } + } + } + + transport.base = &http.Transport{TLSClientConfig: tlsConfig} + } + + client.Transport = transport + return client, nil +} + +// authTransport adds authentication headers from a Secret to HTTP requests. +type authTransport struct { + base http.RoundTripper + secret *corev1.Secret +} + +func (t *authTransport) RoundTrip(req *http.Request) (*http.Response, error) { + // Bearer token auth + if token, ok := t.secret.Data["token"]; ok { + req.Header.Set("Authorization", "Bearer "+string(token)) + } + + // Basic auth + if username, ok := t.secret.Data["username"]; ok { + if password, ok := t.secret.Data["password"]; ok { + req.SetBasicAuth(string(username), string(password)) + } + } + + // Custom headers (headers.) + for key, value := range t.secret.Data { + if len(key) > 8 && key[:8] == "headers." 
{ + headerName := key[8:] + req.Header.Set(headerName, string(value)) + } + } + + return t.base.RoundTrip(req) +} + +// deduplicateResults merges results, keeping the highest score per image. +func deduplicateResults(results []discovery.ImageResult) []discovery.ImageResult { + seen := make(map[string]discovery.ImageResult, len(results)) + for _, r := range results { + if existing, ok := seen[r.Image]; ok { + if r.Score > existing.Score { + seen[r.Image] = r + } + } else { + seen[r.Image] = r + } + } + + deduplicated := make([]discovery.ImageResult, 0, len(seen)) + for _, r := range seen { + deduplicated = append(deduplicated, r) + } + return deduplicated +} + +// SetupWithManager sets up the controller with the Manager. +func (r *DiscoveryPolicyReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&dropv1alpha1.DiscoveryPolicy{}). + Named("discoverypolicy"). + Complete(r) +} + +// sourceEndpoint returns the endpoint URL for a discovery source (for metric labels). +func sourceEndpoint(src dropv1alpha1.DiscoverySource) string { + switch src.Type { + case "prometheus": + if src.Prometheus != nil { + return src.Prometheus.Endpoint + } + case "registry": + if src.Registry != nil { + return src.Registry.URL + } + } + return "unknown" +} + +// classifyError maps a source fetch error into a k8s-style reason and human-readable message. 
+func classifyError(err error) (reason, message string) { + if err == nil { + return "", "" + } + + errStr := err.Error() + + // Network-level errors (typed) + var netErr net.Error + if errors.As(err, &netErr) && netErr.Timeout() { + return "Timeout", cleanMessage(errStr) + } + + var dnsErr *net.DNSError + if errors.As(err, &dnsErr) { + return reasonDNSError, fmt.Sprintf("cannot resolve host %q", dnsErr.Name) + } + + var opErr *net.OpError + if errors.As(err, &opErr) { + if opErr.Op == "dial" { + // Check if the underlying error is DNS + if strings.Contains(opErr.Err.Error(), "lookup") || strings.Contains(opErr.Err.Error(), "no such host") || strings.Contains(opErr.Err.Error(), "server misbehaving") { + host := extractHost(errStr) + return reasonDNSError, fmt.Sprintf("cannot resolve host %q", host) + } + host := extractHost(errStr) + return reasonConnectionRefused, fmt.Sprintf("cannot connect to %s", host) + } + } + + var urlErr *url.Error + if errors.As(err, &urlErr) { + inner := urlErr.Err.Error() + if strings.Contains(inner, "no such host") || strings.Contains(inner, "server misbehaving") || strings.Contains(inner, "lookup") { + host := extractHost(errStr) + return reasonDNSError, fmt.Sprintf("cannot resolve host %q", host) + } + if strings.Contains(inner, "connection refused") { + host := extractHost(errStr) + return reasonConnectionRefused, fmt.Sprintf("cannot connect to %s", host) + } + } + + // HTTP status-based errors + if strings.Contains(errStr, "status 401") { + return "Unauthorized", cleanMessage(errStr) + } + if strings.Contains(errStr, "status 403") { + return "Forbidden", cleanMessage(errStr) + } + if strings.Contains(errStr, "status 404") { + return "NotFound", cleanMessage(errStr) + } + if strings.Contains(errStr, "status 5") { + return "ServerError", cleanMessage(errStr) + } + + // String-based fallbacks + if strings.Contains(errStr, "no such host") || strings.Contains(errStr, "server misbehaving") { + host := extractHost(errStr) + return 
reasonDNSError, fmt.Sprintf("cannot resolve host %q", host) + } + if strings.Contains(errStr, "connection refused") { + host := extractHost(errStr) + return reasonConnectionRefused, fmt.Sprintf("cannot connect to %s", host) + } + if strings.Contains(errStr, "timeout") || strings.Contains(errStr, "deadline exceeded") { + return "Timeout", cleanMessage(errStr) + } + if strings.Contains(errStr, "certificate") || strings.Contains(errStr, "x509") { + return "TLSError", cleanMessage(errStr) + } + if strings.Contains(errStr, "decoding") || strings.Contains(errStr, "unmarshal") || strings.Contains(errStr, "invalid") { + return "InvalidResponse", cleanMessage(errStr) + } + + return "SyncFailed", cleanMessage(errStr) +} + +// extractHost pulls the hostname (or host:port) from a Go error string like +// "... lookup nonexistent-prometheus on 10.96.0.10:53 ..." or +// "... dial tcp nonexistent-registry:5000 ..." +func extractHost(errStr string) string { + // Try "lookup on" pattern (DNS errors) + if idx := strings.Index(errStr, "lookup "); idx != -1 { + rest := errStr[idx+len("lookup "):] + if end := strings.IndexAny(rest, " :"); end != -1 { + return rest[:end] + } + return rest + } + // Try to extract from URL pattern "://..." + if idx := strings.Index(errStr, "://"); idx != -1 { + rest := errStr[idx+3:] + if end := strings.IndexAny(rest, "/?"); end != -1 { + return rest[:end] + } + return rest + } + return "unknown" +} + +// cleanMessage truncates verbose Go error chains for human display. +func cleanMessage(errStr string) string { + // Take the last meaningful segment after the last colon-space + parts := strings.Split(errStr, ": ") + if len(parts) > 2 { + // Keep last 2 segments for context + return strings.Join(parts[len(parts)-2:], ": ") + } + if len(errStr) > 120 { + return errStr[:120] + "..." 
+ } + return errStr +} diff --git a/internal/controller/discoverypolicy_controller_test.go b/internal/controller/discoverypolicy_controller_test.go new file mode 100644 index 0000000..d0637cc --- /dev/null +++ b/internal/controller/discoverypolicy_controller_test.go @@ -0,0 +1,91 @@ +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + dropv1alpha1 "github.com/Breee/drop/api/v1alpha1" +) + +var _ = Describe("DiscoveryPolicy Controller", func() { + Context("When reconciling a resource", func() { + const resourceName = "test-discovery" + + ctx := context.Background() + + typeNamespacedName := types.NamespacedName{ + Name: resourceName, + } + discoverypolicy := &dropv1alpha1.DiscoveryPolicy{} + + BeforeEach(func() { + By("creating the custom resource for the Kind DiscoveryPolicy") + err := k8sClient.Get(ctx, typeNamespacedName, discoverypolicy) + if err != nil && errors.IsNotFound(err) { + resource := &dropv1alpha1.DiscoveryPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: resourceName, + }, + Spec: dropv1alpha1.DiscoveryPolicySpec{ + Sources: []dropv1alpha1.DiscoverySource{ + { + Type: "prometheus", + Prometheus: &dropv1alpha1.PrometheusSource{ + Endpoint: "http://localhost:9090", + Query: 
"test_query", + }, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, resource)).To(Succeed()) + } + }) + + AfterEach(func() { + resource := &dropv1alpha1.DiscoveryPolicy{} + err := k8sClient.Get(ctx, typeNamespacedName, resource) + if err == nil { + By("Cleanup the specific resource instance DiscoveryPolicy") + Expect(k8sClient.Delete(ctx, resource)).To(Succeed()) + } + }) + + It("should successfully reconcile the resource", func() { + By("Reconciling the created resource") + controllerReconciler := &DiscoveryPolicyReconciler{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + } + + _, err := controllerReconciler.Reconcile(ctx, reconcile.Request{ + NamespacedName: typeNamespacedName, + }) + // Discovery will fail to connect to prometheus, but should not panic + // The reconciler handles errors gracefully + _ = err + }) + }) +}) diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go new file mode 100644 index 0000000..89f5e90 --- /dev/null +++ b/internal/controller/suite_test.go @@ -0,0 +1,116 @@ +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + "os" + "path/filepath" + "testing" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/envtest" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + + dropv1alpha1 "github.com/Breee/drop/api/v1alpha1" + // +kubebuilder:scaffold:imports +) + +// These tests use Ginkgo (BDD-style Go testing framework). Refer to +// http://onsi.github.io/ginkgo/ to learn more about Ginkgo. + +var ( + ctx context.Context + cancel context.CancelFunc + testEnv *envtest.Environment + cfg *rest.Config + k8sClient client.Client +) + +func TestControllers(t *testing.T) { + RegisterFailHandler(Fail) + + RunSpecs(t, "Controller Suite") +} + +var _ = BeforeSuite(func() { + logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) + + ctx, cancel = context.WithCancel(context.TODO()) + + var err error + err = dropv1alpha1.AddToScheme(scheme.Scheme) + Expect(err).NotTo(HaveOccurred()) + + // +kubebuilder:scaffold:scheme + + By("bootstrapping test environment") + testEnv = &envtest.Environment{ + CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, + ErrorIfCRDPathMissing: true, + } + + // Retrieve the first found binary directory to allow running tests from IDEs + if getFirstFoundEnvTestBinaryDir() != "" { + testEnv.BinaryAssetsDirectory = getFirstFoundEnvTestBinaryDir() + } + + // cfg is defined in this file globally. + cfg, err = testEnv.Start() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg).NotTo(BeNil()) + + k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) + Expect(err).NotTo(HaveOccurred()) + Expect(k8sClient).NotTo(BeNil()) +}) + +var _ = AfterSuite(func() { + By("tearing down the test environment") + cancel() + err := testEnv.Stop() + Expect(err).NotTo(HaveOccurred()) +}) + +// getFirstFoundEnvTestBinaryDir locates the first binary in the specified path. 
+// ENVTEST-based tests depend on specific binaries, usually located in paths set by +// controller-runtime. When running tests directly (e.g., via an IDE) without using +// Makefile targets, the 'BinaryAssetsDirectory' must be explicitly configured. +// +// This function streamlines the process by finding the required binaries, similar to +// setting the 'KUBEBUILDER_ASSETS' environment variable. To ensure the binaries are +// properly set up, run 'make setup-envtest' beforehand. +func getFirstFoundEnvTestBinaryDir() string { + basePath := filepath.Join("..", "..", "bin", "k8s") + entries, err := os.ReadDir(basePath) + if err != nil { + logf.Log.Error(err, "Failed to read directory", "path", basePath) + return "" + } + for _, entry := range entries { + if entry.IsDir() { + return filepath.Join(basePath, entry.Name()) + } + } + return "" +} diff --git a/internal/discovery/prometheus.go b/internal/discovery/prometheus.go new file mode 100644 index 0000000..c3b4a31 --- /dev/null +++ b/internal/discovery/prometheus.go @@ -0,0 +1,166 @@ +package discovery + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "sort" + "time" +) + +// PrometheusSource queries Prometheus for image references. +type PrometheusSource struct { + Endpoint string + Query string + Lookback time.Duration // 0 = instant query; >0 = query_range + Step string // resolution step for range queries (default "5m") + HTTPClient *http.Client +} + +// NewPrometheusSource creates a new Prometheus discovery source. +func NewPrometheusSource(endpoint, query string, lookback time.Duration, step string, httpClient *http.Client) *PrometheusSource { + if httpClient == nil { + httpClient = &http.Client{Timeout: 30 * time.Second} + } + if step == "" { + step = "5m" + } + return &PrometheusSource{ + Endpoint: endpoint, + Query: query, + Lookback: lookback, + Step: step, + HTTPClient: httpClient, + } +} + +// prometheusResponse represents the Prometheus query API response. 
+type prometheusResponse struct { + Status string `json:"status"` + Data struct { + ResultType string `json:"resultType"` + Result []prometheusResult `json:"result"` + } `json:"data"` +} + +type prometheusResult struct { + Metric map[string]string `json:"metric"` + Value []interface{} `json:"value"` + Values [][]interface{} `json:"values"` // for range queries +} + +// Fetch queries Prometheus and returns discovered images sorted by score. +func (p *PrometheusSource) Fetch(ctx context.Context) ([]ImageResult, error) { + u, err := url.Parse(p.Endpoint) + if err != nil { + return nil, fmt.Errorf("parsing endpoint: %w", err) + } + + q := u.Query() + q.Set("query", p.Query) + + if p.Lookback > 0 { + // Range query: aggregate over time window + u.Path = "/api/v1/query_range" + now := time.Now().UTC() + q.Set("start", now.Add(-p.Lookback).Format(time.RFC3339)) + q.Set("end", now.Format(time.RFC3339)) + q.Set("step", p.Step) + } else { + // Instant query: single point in time + u.Path = "/api/v1/query" + } + u.RawQuery = q.Encode() + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil) + if err != nil { + return nil, fmt.Errorf("creating request: %w", err) + } + + resp, err := p.HTTPClient.Do(req) + if err != nil { + return nil, fmt.Errorf("querying prometheus: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("prometheus returned status %d: %s", resp.StatusCode, string(body)) + } + + var promResp prometheusResponse + if err := json.NewDecoder(resp.Body).Decode(&promResp); err != nil { + return nil, fmt.Errorf("decoding response: %w", err) + } + + if promResp.Status != "success" { + return nil, fmt.Errorf("prometheus query failed with status: %s", promResp.Status) + } + + results := make([]ImageResult, 0, len(promResp.Data.Result)) + for _, r := range promResp.Data.Result { + image, ok := r.Metric["image"] + if !ok || image == "" { + 
continue + } + + var score int64 + if p.Lookback > 0 { + // Range query: sum all values to get total usage score + score = sumRangeValues(r.Values) + } else { + // Instant query: use single value + score = extractScore(r.Value) + } + + results = append(results, ImageResult{ + Image: image, + Score: score, + }) + } + + // Sort by score descending + sort.Slice(results, func(i, j int) bool { + return results[i].Score > results[j].Score + }) + + return results, nil +} + +// extractScore parses the metric value from a Prometheus instant query result. +func extractScore(value []interface{}) int64 { + if len(value) < 2 { + return 0 + } + strVal, ok := value[1].(string) + if !ok { + return 0 + } + var score float64 + if _, err := fmt.Sscanf(strVal, "%f", &score); err != nil { + return 0 + } + return int64(score) +} + +// sumRangeValues sums all values from a query_range result to produce a total usage score. +func sumRangeValues(values [][]interface{}) int64 { + var total float64 + for _, pair := range values { + if len(pair) < 2 { + continue + } + strVal, ok := pair[1].(string) + if !ok { + continue + } + var v float64 + if _, err := fmt.Sscanf(strVal, "%f", &v); err == nil { + total += v + } + } + return int64(total) +} diff --git a/internal/discovery/prometheus_test.go b/internal/discovery/prometheus_test.go new file mode 100644 index 0000000..2110a02 --- /dev/null +++ b/internal/discovery/prometheus_test.go @@ -0,0 +1,131 @@ +package discovery + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" +) + +func TestPrometheusSource_Fetch(t *testing.T) { + tests := []struct { + name string + response interface{} + statusCode int + wantCount int + wantErr bool + wantFirst string + }{ + { + name: "valid response with image labels", + response: prometheusResponse{ + Status: "success", + Data: struct { + ResultType string `json:"resultType"` + Result []prometheusResult `json:"result"` + }{ + ResultType: "vector", + Result: []prometheusResult{ 
+ { + Metric: map[string]string{"image": "nginx:1.25"}, + Value: []interface{}{1234567890.0, "10"}, + }, + { + Metric: map[string]string{"image": "redis:7.0"}, + Value: []interface{}{1234567890.0, "5"}, + }, + }, + }, + }, + statusCode: http.StatusOK, + wantCount: 2, + wantFirst: "nginx:1.25", + }, + { + name: "skips results without image label", + response: prometheusResponse{ + Status: "success", + Data: struct { + ResultType string `json:"resultType"` + Result []prometheusResult `json:"result"` + }{ + ResultType: "vector", + Result: []prometheusResult{ + { + Metric: map[string]string{"image": "nginx:1.25"}, + Value: []interface{}{1234567890.0, "10"}, + }, + { + Metric: map[string]string{"container": "sidecar"}, + Value: []interface{}{1234567890.0, "3"}, + }, + }, + }, + }, + statusCode: http.StatusOK, + wantCount: 1, + wantFirst: "nginx:1.25", + }, + { + name: "HTTP error returns error", + response: "internal server error", + statusCode: http.StatusInternalServerError, + wantErr: true, + }, + { + name: "empty results", + response: prometheusResponse{ + Status: "success", + Data: struct { + ResultType string `json:"resultType"` + Result []prometheusResult `json:"result"` + }{ + ResultType: "vector", + Result: []prometheusResult{}, + }, + }, + statusCode: http.StatusOK, + wantCount: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/api/v1/query" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + w.WriteHeader(tt.statusCode) + if err := json.NewEncoder(w).Encode(tt.response); err != nil { + t.Fatal(err) + } + })) + defer server.Close() + + source := NewPrometheusSource(server.URL, "test_query", 0, "", server.Client()) + results, err := source.Fetch(context.Background()) + + if tt.wantErr { + if err == nil { + t.Fatal("expected error, got nil") + } + return + } + + if err != nil { + t.Fatalf("unexpected error: %v", 
err) + } + + if len(results) != tt.wantCount { + t.Errorf("got %d results, want %d", len(results), tt.wantCount) + } + + if tt.wantFirst != "" && len(results) > 0 { + if results[0].Image != tt.wantFirst { + t.Errorf("first image = %q, want %q", results[0].Image, tt.wantFirst) + } + } + }) + } +} diff --git a/internal/discovery/registry.go b/internal/discovery/registry.go new file mode 100644 index 0000000..44292af --- /dev/null +++ b/internal/discovery/registry.go @@ -0,0 +1,159 @@ +package discovery + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "regexp" + "sort" + "strings" + "text/template" + "time" +) + +// RegistrySource queries OCI registries for image tags. +type RegistrySource struct { + URL string + Repositories []string + TagFilter string + TopX int32 + ImageTemplate string + HTTPClient *http.Client +} + +// NewRegistrySource creates a new registry discovery source. +func NewRegistrySource(url string, repos []string, tagFilter string, topX int32, imageTemplate string, httpClient *http.Client) *RegistrySource { + if httpClient == nil { + httpClient = &http.Client{Timeout: 30 * time.Second} + } + return &RegistrySource{ + URL: strings.TrimSuffix(url, "/"), + Repositories: repos, + TagFilter: tagFilter, + TopX: topX, + ImageTemplate: imageTemplate, + HTTPClient: httpClient, + } +} + +// tagListResponse represents the OCI Distribution API tag list response. +type tagListResponse struct { + Name string `json:"name"` + Tags []string `json:"tags"` +} + +// Fetch queries the registry for tags and returns discovered images. +func (rs *RegistrySource) Fetch(ctx context.Context) ([]ImageResult, error) { + var allResults []ImageResult + + for _, repo := range rs.Repositories { + results, err := rs.fetchRepo(ctx, repo) + if err != nil { + return nil, fmt.Errorf("fetching tags for %s: %w", repo, err) + } + allResults = append(allResults, results...) 
+ } + + // Sort by score descending (higher index = more recent) + sort.Slice(allResults, func(i, j int) bool { + return allResults[i].Score > allResults[j].Score + }) + + return allResults, nil +} + +func (rs *RegistrySource) fetchRepo(ctx context.Context, repo string) ([]ImageResult, error) { + u := fmt.Sprintf("%s/v2/%s/tags/list", rs.URL, repo) + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) + if err != nil { + return nil, fmt.Errorf("creating request: %w", err) + } + + resp, err := rs.HTTPClient.Do(req) + if err != nil { + return nil, fmt.Errorf("listing tags: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("registry returned status %d: %s", resp.StatusCode, string(body)) + } + + var tagList tagListResponse + if err := json.NewDecoder(resp.Body).Decode(&tagList); err != nil { + return nil, fmt.Errorf("decoding response: %w", err) + } + + // Filter tags + tags := tagList.Tags + if rs.TagFilter != "" { + re, err := regexp.Compile(rs.TagFilter) + if err != nil { + return nil, fmt.Errorf("compiling tag filter: %w", err) + } + var filtered []string + for _, tag := range tags { + if re.MatchString(tag) { + filtered = append(filtered, tag) + } + } + tags = filtered + } + + // Limit to topX + if rs.TopX > 0 && int32(len(tags)) > rs.TopX { + tags = tags[len(tags)-int(rs.TopX):] + } + + // Build image refs + results := make([]ImageResult, 0, len(tags)) + for i, tag := range tags { + imageRef, err := rs.buildImageRef(repo, tag) + if err != nil { + return nil, fmt.Errorf("building image ref for tag %s: %w", tag, err) + } + results = append(results, ImageResult{ + Image: imageRef, + Score: int64(i + 1), // Higher index = more recent + }) + } + + return results, nil +} + +// templateData provides variables for the image template. 
+type templateData struct { + Registry string + Repository string + Tag string +} + +func (rs *RegistrySource) buildImageRef(repo, tag string) (string, error) { + if rs.ImageTemplate != "" { + tmpl, err := template.New("image").Parse(rs.ImageTemplate) + if err != nil { + return "", fmt.Errorf("parsing image template: %w", err) + } + + data := templateData{ + Registry: rs.URL, + Repository: repo, + Tag: tag, + } + + var buf strings.Builder + if err := tmpl.Execute(&buf, data); err != nil { + return "", fmt.Errorf("executing image template: %w", err) + } + return buf.String(), nil + } + + // Default: registry/repo:tag + registry := strings.TrimPrefix(rs.URL, "https://") + registry = strings.TrimPrefix(registry, "http://") + return fmt.Sprintf("%s/%s:%s", registry, repo, tag), nil +} diff --git a/internal/discovery/registry_test.go b/internal/discovery/registry_test.go new file mode 100644 index 0000000..f3b9dc6 --- /dev/null +++ b/internal/discovery/registry_test.go @@ -0,0 +1,93 @@ +package discovery + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" +) + +func TestRegistrySource_Fetch(t *testing.T) { + tests := []struct { + name string + repos []string + tagFilter string + topX int32 + imageTemplate string + tags []string + wantCount int + wantFirst string + wantErr bool + }{ + { + name: "basic tag listing", + repos: []string{"library/nginx"}, + tags: []string{"1.24", "1.25", "1.26"}, + wantCount: 3, + }, + { + name: "tag filter", + repos: []string{"library/nginx"}, + tagFilter: `^1\.2[56]$`, + tags: []string{"1.24", "1.25", "1.26"}, + wantCount: 2, + }, + { + name: "topX limit", + repos: []string{"library/nginx"}, + topX: 2, + tags: []string{"1.24", "1.25", "1.26"}, + wantCount: 2, + }, + { + name: "image template", + repos: []string{"gitlab-org/gitlab-runner/gitlab-runner-helper"}, + imageTemplate: "registry.gitlab.com/{{.Repository}}:x86_64-{{.Tag}}", + tags: []string{"v16.0", "v16.1"}, + wantCount: 2, + wantFirst: 
"registry.gitlab.com/gitlab-org/gitlab-runner/gitlab-runner-helper:x86_64-v16.1", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + resp := tagListResponse{ + Name: tt.repos[0], + Tags: tt.tags, + } + w.WriteHeader(http.StatusOK) + if err := json.NewEncoder(w).Encode(resp); err != nil { + t.Fatal(err) + } + })) + defer server.Close() + + source := NewRegistrySource(server.URL, tt.repos, tt.tagFilter, tt.topX, tt.imageTemplate, server.Client()) + results, err := source.Fetch(context.Background()) + + if tt.wantErr { + if err == nil { + t.Fatal("expected error, got nil") + } + return + } + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(results) != tt.wantCount { + t.Errorf("got %d results, want %d", len(results), tt.wantCount) + } + + if tt.wantFirst != "" && len(results) > 0 { + // Results sorted by score descending, highest score = last tag + if results[0].Image != tt.wantFirst { + t.Errorf("first image = %q, want %q", results[0].Image, tt.wantFirst) + } + } + }) + } +} diff --git a/internal/discovery/source.go b/internal/discovery/source.go new file mode 100644 index 0000000..8ac92a1 --- /dev/null +++ b/internal/discovery/source.go @@ -0,0 +1,15 @@ +package discovery + +import "context" + +// ImageResult represents a discovered image with a ranking score. +type ImageResult struct { + Image string + Score int64 +} + +// Source is the interface that all discovery backends must implement. +type Source interface { + // Fetch queries the backend and returns discovered images. 
+ Fetch(ctx context.Context) ([]ImageResult, error) +} diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go new file mode 100644 index 0000000..6782e17 --- /dev/null +++ b/internal/metrics/metrics.go @@ -0,0 +1,94 @@ +package metrics + +import ( + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +var ( + // ImagesCachedTotal counts the total number of images successfully cached on nodes. + ImagesCachedTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "drop_images_cached_total", + Help: "Total number of images successfully cached on nodes.", + }, + []string{"image", "node"}, + ) + + // PullDurationSeconds tracks the duration of image pull operations. + PullDurationSeconds = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "drop_pull_duration_seconds", + Help: "Duration of image pull operations in seconds.", + Buckets: prometheus.ExponentialBuckets(1, 2, 12), // 1s to ~68min + }, + []string{"image"}, + ) + + // PullErrorsTotal counts the total number of failed image pull attempts. + PullErrorsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "drop_pull_errors_total", + Help: "Total number of failed image pull attempts.", + }, + []string{"image", "node"}, + ) + + // DiscoveryImagesFound reports the number of images found by each discovery source. + DiscoveryImagesFound = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "drop_discovery_images_found", + Help: "Number of images found by a discovery policy.", + }, + []string{"policy", "source_type"}, + ) + + // ActivePulls reports the current number of active pull Pods. + ActivePulls = prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "drop_active_pulls", + Help: "Current number of active image pull Pods.", + }, + ) + + // ReconcileTotal counts reconciliation attempts per controller and result. 
+ ReconcileTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "drop_reconcile_total", + Help: "Total number of reconciliation attempts.", + }, + []string{"controller", "result"}, + ) + + // DiscoverySourceHealth reports whether a discovery source is reachable (1=healthy, 0=unhealthy). + DiscoverySourceHealth = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "drop_discovery_source_health", + Help: "Whether a discovery source is reachable and queryable (1=healthy, 0=unhealthy).", + }, + []string{"policy", "source_type", "endpoint"}, + ) + + // DiscoverySourceLatencySeconds tracks the query duration per source. + DiscoverySourceLatencySeconds = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "drop_discovery_source_latency_seconds", + Help: "Latency of discovery source queries in seconds.", + Buckets: prometheus.DefBuckets, + }, + []string{"policy", "source_type"}, + ) +) + +func init() { + metrics.Registry.MustRegister( + ImagesCachedTotal, + PullDurationSeconds, + PullErrorsTotal, + DiscoveryImagesFound, + ActivePulls, + ReconcileTotal, + DiscoverySourceHealth, + DiscoverySourceLatencySeconds, + ) +} diff --git a/internal/pacing/engine.go b/internal/pacing/engine.go new file mode 100644 index 0000000..79b477d --- /dev/null +++ b/internal/pacing/engine.go @@ -0,0 +1,120 @@ +package pacing + +import ( + "context" + "time" + + v1alpha1 "github.com/Breee/drop/api/v1alpha1" + "github.com/Breee/drop/internal/podbuilder" + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// Decision represents whether a new pull is allowed. +type Decision struct { + Allowed bool + RequeueIn time.Duration +} + +// Engine evaluates pacing constraints before creating new drop Pods. +type Engine struct { + Client client.Client + PodNamespace string +} + +// NewEngine creates a new pacing engine. 
+func NewEngine(c client.Client, podNamespace string) *Engine { + return &Engine{Client: c, PodNamespace: podNamespace} +} + +// CanStartPull checks pacing constraints and returns whether a new pull can start. +func (e *Engine) CanStartPull(ctx context.Context, policy *v1alpha1.PullPolicy, cachedImageName string) (Decision, error) { + maxConcurrent := int32(1) + minDelay := 10 * time.Second + + if policy != nil { + if policy.Spec.MaxConcurrentNodes > 0 { + maxConcurrent = policy.Spec.MaxConcurrentNodes + } + if policy.Spec.MinDelayBetweenPulls.Duration > 0 { + minDelay = policy.Spec.MinDelayBetweenPulls.Duration + } + } + + // List active drop Pods (Running or Pending) + podList := &corev1.PodList{} + ns := e.PodNamespace + if ns == "" { + ns = podbuilder.DefaultPodNamespace + } + listOpts := []client.ListOption{ + client.InNamespace(ns), + client.MatchingLabels{podbuilder.LabelManagedBy: podbuilder.LabelManagedByValue}, + } + if err := e.Client.List(ctx, podList, listOpts...); err != nil { + return Decision{}, err + } + + // Filter to active pods (Pending or Running) and optionally scope by node selector + var activePods []corev1.Pod + for i := range podList.Items { + pod := &podList.Items[i] + if pod.Status.Phase == corev1.PodPending || pod.Status.Phase == corev1.PodRunning { + // Skip pods stuck in image pull errors — they're about to be cleaned up + if isStuckImagePull(pod) { + continue + } + if policy != nil && len(policy.Spec.NodeSelector) > 0 { + if !nodeMatchesSelector(pod.Spec.NodeName, policy.Spec.NodeSelector) { + continue + } + } + activePods = append(activePods, *pod) + } + } + + // Check concurrent limit + if int32(len(activePods)) >= maxConcurrent { + return Decision{Allowed: false, RequeueIn: 5 * time.Second}, nil + } + + // Check minimum delay between pulls + var mostRecent time.Time + for i := range activePods { + created := activePods[i].CreationTimestamp.Time + if created.After(mostRecent) { + mostRecent = created + } + } + + if 
!mostRecent.IsZero() { + elapsed := time.Since(mostRecent) + if elapsed < minDelay { + remaining := minDelay - elapsed + return Decision{Allowed: false, RequeueIn: remaining}, nil + } + } + + return Decision{Allowed: true}, nil +} + +// nodeMatchesSelector is a simplified check. +// In a real implementation, we'd look up the node's labels. +// For now, this always returns true since drop Pods are already placed +// on specific nodes via nodeName — the pacing scope is informational. +func nodeMatchesSelector(_ string, _ map[string]string) bool { + return true +} + +// isStuckImagePull returns true if a pod has a container waiting due to image pull failure. +func isStuckImagePull(pod *corev1.Pod) bool { + for _, cs := range append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...) { + if cs.State.Waiting != nil { + switch cs.State.Waiting.Reason { + case "ErrImagePull", "ImagePullBackOff", "InvalidImageName", "RegistryUnavailable": + return true + } + } + } + return false +} diff --git a/internal/pacing/engine_test.go b/internal/pacing/engine_test.go new file mode 100644 index 0000000..2611bf1 --- /dev/null +++ b/internal/pacing/engine_test.go @@ -0,0 +1,160 @@ +package pacing + +import ( + "context" + "testing" + "time" + + v1alpha1 "github.com/Breee/drop/api/v1alpha1" + "github.com/Breee/drop/internal/podbuilder" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func testScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = corev1.AddToScheme(s) + _ = v1alpha1.AddToScheme(s) + return s +} + +func TestCanStartPull(t *testing.T) { + tests := []struct { + name string + policy *v1alpha1.PullPolicy + activePods []corev1.Pod + wantAllowed bool + wantRequeue bool + }{ + { + name: "allows when no active pulls exist", + policy: nil, + activePods: nil, + wantAllowed: true, + wantRequeue: false, + }, + { + name: "denies when 
maxConcurrentNodes reached", + policy: &v1alpha1.PullPolicy{ + Spec: v1alpha1.PullPolicySpec{ + MaxConcurrentNodes: 1, + MinDelayBetweenPulls: metav1.Duration{Duration: 10 * time.Second}, + }, + }, + activePods: []corev1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "drop-test-1", + CreationTimestamp: metav1.NewTime(time.Now().Add(-30 * time.Second)), + Labels: map[string]string{ + podbuilder.LabelManagedBy: podbuilder.LabelManagedByValue, + }, + }, + Status: corev1.PodStatus{Phase: corev1.PodRunning}, + }, + }, + wantAllowed: false, + wantRequeue: true, + }, + { + name: "allows when at boundary (maxConcurrentNodes - 1 active)", + policy: &v1alpha1.PullPolicy{ + Spec: v1alpha1.PullPolicySpec{ + MaxConcurrentNodes: 2, + MinDelayBetweenPulls: metav1.Duration{Duration: 1 * time.Second}, + }, + }, + activePods: []corev1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "drop-test-1", + CreationTimestamp: metav1.NewTime(time.Now().Add(-30 * time.Second)), + Labels: map[string]string{ + podbuilder.LabelManagedBy: podbuilder.LabelManagedByValue, + }, + }, + Status: corev1.PodStatus{Phase: corev1.PodRunning}, + }, + }, + wantAllowed: true, + wantRequeue: false, + }, + { + name: "denies when minDelayBetweenPulls not elapsed", + policy: &v1alpha1.PullPolicy{ + Spec: v1alpha1.PullPolicySpec{ + MaxConcurrentNodes: 5, + MinDelayBetweenPulls: metav1.Duration{Duration: 60 * time.Second}, + }, + }, + activePods: []corev1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "drop-test-1", + CreationTimestamp: metav1.NewTime(time.Now().Add(-5 * time.Second)), + Labels: map[string]string{ + podbuilder.LabelManagedBy: podbuilder.LabelManagedByValue, + }, + }, + Status: corev1.PodStatus{Phase: corev1.PodPending}, + }, + }, + wantAllowed: false, + wantRequeue: true, + }, + { + name: "uses defaults when nil policy", + policy: nil, + activePods: []corev1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "drop-test-1", + CreationTimestamp: metav1.NewTime(time.Now().Add(-30 * 
time.Second)), + Labels: map[string]string{ + podbuilder.LabelManagedBy: podbuilder.LabelManagedByValue, + }, + }, + Status: corev1.PodStatus{Phase: corev1.PodRunning}, + }, + }, + wantAllowed: false, + wantRequeue: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + scheme := testScheme() + + objs := make([]runtime.Object, 0, len(tt.activePods)) + for i := range tt.activePods { + tt.activePods[i].Namespace = "drop-system" + objs = append(objs, &tt.activePods[i]) + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithRuntimeObjects(objs...). + Build() + + engine := NewEngine(fakeClient, "drop-system") + decision, err := engine.CanStartPull(context.Background(), tt.policy, "test-image") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if decision.Allowed != tt.wantAllowed { + t.Errorf("Allowed = %v, want %v", decision.Allowed, tt.wantAllowed) + } + + if tt.wantRequeue && decision.RequeueIn == 0 { + t.Error("expected non-zero RequeueIn") + } + if !tt.wantRequeue && decision.RequeueIn != 0 { + t.Errorf("expected zero RequeueIn, got %v", decision.RequeueIn) + } + }) + } +} diff --git a/internal/podbuilder/builder.go b/internal/podbuilder/builder.go new file mode 100644 index 0000000..432de1c --- /dev/null +++ b/internal/podbuilder/builder.go @@ -0,0 +1,82 @@ +package podbuilder + +import ( + "fmt" + + v1alpha1 "github.com/Breee/drop/api/v1alpha1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" +) + +const ( + // LabelManagedBy identifies resources managed by the drop operator. + LabelManagedBy = "app.kubernetes.io/managed-by" + // LabelManagedByValue is the value for the managed-by label. + LabelManagedByValue = "drop" + // LabelCachedImage identifies which CachedImage owns this Pod. + LabelCachedImage = "drop.corewire.io/cachedimage" + // LabelNode identifies which node this Pod targets. 
+ LabelNode = "drop.corewire.io/node" + // DefaultPodNamespace is the namespace where drop pods are created. + DefaultPodNamespace = "drop-system" +) + +// BuildDropPod creates a Pod spec for pulling an image onto a specific node. +// Pods are created in the given namespace and tracked via labels (not ownerRefs) +// because CachedImage is cluster-scoped and cannot own namespaced resources. +func BuildDropPod(ci *v1alpha1.CachedImage, nodeName, namespace string) (*corev1.Pod, error) { + imageRef := buildImageRef(ci) + + pullPolicy := corev1.PullAlways + if ci.Spec.ImagePullPolicy != "" { + pullPolicy = ci.Spec.ImagePullPolicy + } + + if namespace == "" { + namespace = DefaultPodNamespace + } + + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: fmt.Sprintf("pull-%s-", ci.Name), + Namespace: namespace, + Labels: map[string]string{ + LabelManagedBy: LabelManagedByValue, + LabelCachedImage: ci.Name, + LabelNode: nodeName, + }, + }, + Spec: corev1.PodSpec{ + NodeName: nodeName, + RestartPolicy: corev1.RestartPolicyNever, + Tolerations: ci.Spec.Tolerations, + ImagePullSecrets: ci.Spec.ImagePullSecrets, + Containers: []corev1.Container{ + { + Name: "pull", + Image: imageRef, + Command: []string{"true"}, + ImagePullPolicy: pullPolicy, + }, + }, + AutomountServiceAccountToken: ptr.To(false), + EnableServiceLinks: ptr.To(false), + TerminationGracePeriodSeconds: ptr.To(int64(0)), + }, + } + + return pod, nil +} + +// buildImageRef constructs the full image reference from CachedImage spec. 
+func buildImageRef(ci *v1alpha1.CachedImage) string { + if ci.Spec.Digest != "" { + return fmt.Sprintf("%s@%s", ci.Spec.Image, ci.Spec.Digest) + } + tag := ci.Spec.Tag + if tag == "" { + tag = "latest" + } + return fmt.Sprintf("%s:%s", ci.Spec.Image, tag) +} diff --git a/internal/podbuilder/builder_test.go b/internal/podbuilder/builder_test.go new file mode 100644 index 0000000..51817c2 --- /dev/null +++ b/internal/podbuilder/builder_test.go @@ -0,0 +1,158 @@ +package podbuilder + +import ( + "testing" + + v1alpha1 "github.com/Breee/drop/api/v1alpha1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestBuildDropPod(t *testing.T) { + tests := []struct { + name string + ci *v1alpha1.CachedImage + nodeName string + wantImg string + wantPull corev1.PullPolicy + }{ + { + name: "image with tag", + ci: &v1alpha1.CachedImage{ + ObjectMeta: metav1.ObjectMeta{Name: "test-image", UID: "uid-1"}, + Spec: v1alpha1.CachedImageSpec{ + Image: "docker.io/library/nginx", + Tag: "1.25", + ImagePullPolicy: corev1.PullIfNotPresent, + }, + }, + nodeName: "node-1", + wantImg: "docker.io/library/nginx:1.25", + wantPull: corev1.PullIfNotPresent, + }, + { + name: "image with digest", + ci: &v1alpha1.CachedImage{ + ObjectMeta: metav1.ObjectMeta{Name: "digest-image", UID: "uid-2"}, + Spec: v1alpha1.CachedImageSpec{ + Image: "docker.io/library/nginx", + Digest: "sha256:abc123", + ImagePullPolicy: corev1.PullIfNotPresent, + }, + }, + nodeName: "node-2", + wantImg: "docker.io/library/nginx@sha256:abc123", + wantPull: corev1.PullIfNotPresent, + }, + { + name: "image with Always pull policy", + ci: &v1alpha1.CachedImage{ + ObjectMeta: metav1.ObjectMeta{Name: "always-pull", UID: "uid-3"}, + Spec: v1alpha1.CachedImageSpec{ + Image: "gcr.io/my-project/app", + Tag: "latest", + ImagePullPolicy: corev1.PullAlways, + }, + }, + nodeName: "node-3", + wantImg: "gcr.io/my-project/app:latest", + wantPull: corev1.PullAlways, + }, + { + name: "image with no tag defaults 
to latest", + ci: &v1alpha1.CachedImage{ + ObjectMeta: metav1.ObjectMeta{Name: "no-tag", UID: "uid-4"}, + Spec: v1alpha1.CachedImageSpec{ + Image: "docker.io/library/alpine", + }, + }, + nodeName: "node-1", + wantImg: "docker.io/library/alpine:latest", + wantPull: corev1.PullAlways, + }, + { + name: "image with tolerations", + ci: &v1alpha1.CachedImage{ + ObjectMeta: metav1.ObjectMeta{Name: "tolerated", UID: "uid-5"}, + Spec: v1alpha1.CachedImageSpec{ + Image: "docker.io/library/alpine", + Tag: "3.18", + Tolerations: []corev1.Toleration{ + {Key: "node-role.kubernetes.io/build", Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoSchedule}, + }, + }, + }, + nodeName: "build-node-1", + wantImg: "docker.io/library/alpine:3.18", + wantPull: corev1.PullAlways, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + pod, err := BuildDropPod(tt.ci, tt.nodeName, "drop-system") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Check namespace + if pod.Namespace != "drop-system" { + t.Errorf("namespace = %q, want %q", pod.Namespace, "drop-system") + } + + // Check nodeName + if pod.Spec.NodeName != tt.nodeName { + t.Errorf("nodeName = %q, want %q", pod.Spec.NodeName, tt.nodeName) + } + + // Check image reference + if pod.Spec.Containers[0].Image != tt.wantImg { + t.Errorf("image = %q, want %q", pod.Spec.Containers[0].Image, tt.wantImg) + } + + // Check pull policy + if pod.Spec.Containers[0].ImagePullPolicy != tt.wantPull { + t.Errorf("imagePullPolicy = %q, want %q", pod.Spec.Containers[0].ImagePullPolicy, tt.wantPull) + } + + // Check labels + if pod.Labels[LabelManagedBy] != LabelManagedByValue { + t.Errorf("managed-by label = %q, want %q", pod.Labels[LabelManagedBy], LabelManagedByValue) + } + if pod.Labels[LabelCachedImage] != tt.ci.Name { + t.Errorf("cachedimage label = %q, want %q", pod.Labels[LabelCachedImage], tt.ci.Name) + } + if pod.Labels[LabelNode] != tt.nodeName { + t.Errorf("node label = %q, want %q", 
pod.Labels[LabelNode], tt.nodeName) + } + + // Check command + if len(pod.Spec.Containers[0].Command) != 1 || pod.Spec.Containers[0].Command[0] != "true" { + t.Errorf("command = %v, want [true]", pod.Spec.Containers[0].Command) + } + + // Check restart policy + if pod.Spec.RestartPolicy != corev1.RestartPolicyNever { + t.Errorf("restartPolicy = %q, want Never", pod.Spec.RestartPolicy) + } + + // Check tolerations + if len(tt.ci.Spec.Tolerations) > 0 { + if len(pod.Spec.Tolerations) != len(tt.ci.Spec.Tolerations) { + t.Errorf("tolerations count = %d, want %d", len(pod.Spec.Tolerations), len(tt.ci.Spec.Tolerations)) + } + } + + // Check security settings + if pod.Spec.AutomountServiceAccountToken == nil || *pod.Spec.AutomountServiceAccountToken { + t.Error("automountServiceAccountToken should be false") + } + if pod.Spec.EnableServiceLinks == nil || *pod.Spec.EnableServiceLinks { + t.Error("enableServiceLinks should be false") + } + if pod.Spec.TerminationGracePeriodSeconds == nil || *pod.Spec.TerminationGracePeriodSeconds != 0 { + t.Error("terminationGracePeriodSeconds should be 0") + } + }) + } +} diff --git a/knowledge.yaml b/knowledge.yaml new file mode 100644 index 0000000..9631a92 --- /dev/null +++ b/knowledge.yaml @@ -0,0 +1,806 @@ +# Generated by make docs-gen — DO NOT EDIT +# Source: hack/gen-ai-docs/ +# Regenerate: make docs-gen + +project: + name: drop + description: Kubernetes operator that pre-caches container images on cluster nodes + apiGroup: drop.corewire.io/v1alpha1 + goVersion: 1.23.0 + module: github.com/Breee/drop + license: Apache-2.0 +crds: + - kind: CachedImage + doc: CachedImage is the Schema for the cachedimages API. + scope: Cluster + controller: internal/controller/cachedimage_controller.go + testFile: internal/controller/cachedimage_controller_test.go + specFields: + - name: Image + json: image + type: string + required: true + doc: Image is the fully qualified image reference (registry/repository). 
+ - name: Tag + json: tag + type: string + required: false + doc: Tag to pull. Mutually exclusive with Digest. + - name: Digest + json: digest + type: string + required: false + doc: Digest to pull (immutable reference). Mutually exclusive with Tag. + - name: ImagePullPolicy + json: imagePullPolicy + type: corev1.PullPolicy + required: false + default: Always + enum: + - Always + - IfNotPresent + - Never + doc: ImagePullPolicy controls when kubelet pulls the image. Defaults to Always (checks upstream digest, only downloads if changed). Set to IfNotPresent to skip the registry check when the tag already exists locally. + - name: ImagePullSecrets + json: imagePullSecrets + type: '[]corev1.LocalObjectReference' + required: false + doc: ImagePullSecrets are references to secrets for pulling from private registries. + - name: NodeSelector + json: nodeSelector + type: map[string]string + required: false + doc: NodeSelector restricts which nodes to cache the image on. + - name: Tolerations + json: tolerations + type: '[]corev1.Toleration' + required: false + doc: Tolerations allow targeting tainted nodes. + - name: Priority + json: priority + type: '*int32' + required: false + doc: Priority is a pull ordering hint (lower values pulled first). + - name: PolicyRef + json: policyRef + type: '*PolicyReference' + required: false + doc: PolicyRef references a PullPolicy for pacing controls. + statusFields: + - name: ObservedGeneration + json: observedGeneration + type: int64 + required: false + doc: ObservedGeneration is the last generation reconciled. + - name: Phase + json: phase + type: string + required: false + enum: + - Pending + - Pulling + - Ready + - Degraded + doc: Phase summarizes the overall state. + - name: Ready + json: ready + type: string + required: false + doc: Ready is a human-readable "nodesReady/nodesTargeted" fraction for display. 
+ - name: ResolvedDigest + json: resolvedDigest + type: string + required: false + doc: ResolvedDigest is the sha256 digest of the image as reported by the container runtime after pull. + - name: NodesTargeted + json: nodesTargeted + type: int32 + required: false + doc: NodesTargeted is the number of nodes that should have this image. + - name: NodesReady + json: nodesReady + type: int32 + required: false + doc: NodesReady is the number of nodes that have successfully pulled the image. + - name: CachedNodes + json: cachedNodes + type: '[]string' + required: false + doc: CachedNodes is the list of node names that have successfully cached the image. + - name: ConsecutiveFailures + json: consecutiveFailures + type: int32 + required: false + doc: ConsecutiveFailures counts sequential reconcile failures for backoff calculation. + - name: LastPulledAt + json: lastPulledAt + type: '*metav1.Time' + required: false + doc: LastPulledAt is the timestamp of the most recent successful pull. + - name: LastAttemptedAt + json: lastAttemptedAt + type: '*metav1.Time' + required: false + doc: LastAttemptedAt is the timestamp of the most recent pull attempt (success or failure). + - name: Conditions + json: conditions + type: '[]metav1.Condition' + required: false + doc: Conditions represent the latest available observations. + - kind: CachedImageSet + doc: CachedImageSet is the Schema for the cachedimagesets API. + scope: Cluster + controller: internal/controller/cachedimageset_controller.go + testFile: internal/controller/cachedimageset_controller_test.go + specFields: + - name: PolicyRef + json: policyRef + type: '*PolicyReference' + required: false + doc: PolicyRef references a PullPolicy for pacing controls. + - name: DiscoveryPolicyRef + json: discoveryPolicyRef + type: '*DiscoveryPolicyReference' + required: false + doc: DiscoveryPolicyRef references a DiscoveryPolicy for dynamic image lists. 
+ - name: ImagePullPolicy + json: imagePullPolicy + type: corev1.PullPolicy + required: false + default: Always + enum: + - Always + - IfNotPresent + - Never + doc: ImagePullPolicy controls when kubelet pulls the image (propagated to children). + - name: ImagePullSecrets + json: imagePullSecrets + type: '[]corev1.LocalObjectReference' + required: false + doc: ImagePullSecrets are references to secrets for pulling from private registries (propagated to children). + - name: NodeSelector + json: nodeSelector + type: map[string]string + required: false + doc: NodeSelector restricts which nodes to cache images on (propagated to children). + - name: Tolerations + json: tolerations + type: '[]corev1.Toleration' + required: false + doc: Tolerations allow targeting tainted nodes (propagated to children). + - name: Images + json: images + type: '[]ImageEntry' + required: false + doc: Images is a static list of images to cache. + statusFields: + - name: ObservedGeneration + json: observedGeneration + type: int64 + required: false + doc: ObservedGeneration is the last generation reconciled. + - name: Phase + json: phase + type: string + required: false + enum: + - Pending + - Ready + - Degraded + doc: Phase summarizes the overall state. + - name: ImagesManaged + json: imagesManaged + type: int32 + required: false + doc: ImagesManaged is the number of CachedImage children managed by this set. + - name: ImagesReady + json: imagesReady + type: int32 + required: false + doc: ImagesReady is the number of children in Ready phase. + - name: Conditions + json: conditions + type: '[]metav1.Condition' + required: false + doc: Conditions represent the latest available observations. + - kind: PullPolicy + doc: PullPolicy is the Schema for the pullpolicies API. It is a configuration-only resource with no status. 
+ scope: Cluster + specFields: + - name: MaxConcurrentNodes + json: maxConcurrentNodes + type: int32 + required: false + default: "1" + doc: MaxConcurrentNodes is the max nodes pulling simultaneously for this policy. + - name: MinDelayBetweenPulls + json: minDelayBetweenPulls + type: metav1.Duration + required: false + default: 10s + doc: MinDelayBetweenPulls is the minimum time between starting pulls on different nodes. + - name: FailureBackoff + json: failureBackoff + type: '*BackoffConfig' + required: false + doc: FailureBackoff configures retry delays on pull failures. + - name: RepullInterval + json: repullInterval + type: '*metav1.Duration' + required: false + doc: RepullInterval is how often to re-pull cached images. Zero or unset means never re-pull. + - name: NodeSelector + json: nodeSelector + type: map[string]string + required: false + doc: NodeSelector scopes this policy to a specific node pool. + - name: Tolerations + json: tolerations + type: '[]corev1.Toleration' + required: false + doc: Tolerations match tainted nodes in the pool. + - kind: DiscoveryPolicy + doc: DiscoveryPolicy is the Schema for the discoverypolicies API. + scope: Cluster + controller: internal/controller/discoverypolicy_controller.go + testFile: internal/controller/discoverypolicy_controller_test.go + specFields: + - name: Sources + json: sources + type: '[]DiscoverySource' + required: true + doc: Sources is the list of discovery backends to query. + - name: ImageFilter + json: imageFilter + type: string + required: false + doc: ImageFilter is a regex to filter discovered images. + - name: SyncInterval + json: syncInterval + type: metav1.Duration + required: false + default: 30m + doc: SyncInterval is how often to re-query sources. + - name: MaxImages + json: maxImages + type: int32 + required: false + default: "50" + doc: MaxImages caps the number of discovered images. 
+ statusFields: + - name: LastSyncTime + json: lastSyncTime + type: '*metav1.Time' + required: false + doc: LastSyncTime is the timestamp of the last successful sync. + - name: DiscoveredImages + json: discoveredImages + type: '[]DiscoveredImage' + required: false + doc: DiscoveredImages is the list of discovered images from all sources. + - name: ImageCount + json: imageCount + type: int32 + required: false + doc: ImageCount is the number of discovered images. + - name: SourceCount + json: sourceCount + type: int32 + required: false + doc: SourceCount is the number of configured sources. + - name: Conditions + json: conditions + type: '[]metav1.Condition' + required: false + doc: Conditions represent the latest available observations. +helperTypes: + - name: PolicyReference + doc: PolicyReference is a reference to a PullPolicy resource. + fields: + - name: Name + json: name + type: string + required: true + doc: Name of the PullPolicy resource. + - name: DiscoveryPolicyReference + doc: DiscoveryPolicyReference is a reference to a DiscoveryPolicy resource. + fields: + - name: Name + json: name + type: string + required: true + doc: Name of the DiscoveryPolicy resource. + - name: ImageEntry + doc: ImageEntry defines a single image to include in a set. + fields: + - name: Image + json: image + type: string + required: true + doc: Image is the fully qualified image reference (registry/repository). + - name: Tag + json: tag + type: string + required: false + doc: Tag to pull. + - name: Digest + json: digest + type: string + required: false + doc: Digest to pull. + - name: BackoffConfig + doc: BackoffConfig defines retry backoff behavior. + fields: + - name: Initial + json: initial + type: metav1.Duration + required: false + default: 30s + doc: Initial delay before first retry. + - name: Max + json: max + type: metav1.Duration + required: false + default: 5m + doc: Max delay cap for exponential backoff. 
+ - name: DiscoverySource + doc: DiscoverySource defines a single discovery backend. + fields: + - name: Type + json: type + type: string + required: true + enum: + - prometheus + - registry + doc: Type identifies the backend. + - name: Prometheus + json: prometheus + type: '*PrometheusSource' + required: false + doc: Prometheus config (when type=prometheus). + - name: Registry + json: registry + type: '*RegistrySource' + required: false + doc: Registry config (when type=registry). + - name: SecretRef + json: secretRef + type: '*corev1.LocalObjectReference' + required: false + doc: SecretRef references a Secret for auth/TLS for this source. + - name: PrometheusSource + doc: PrometheusSource defines Prometheus query configuration. + fields: + - name: Endpoint + json: endpoint + type: string + required: true + doc: Endpoint is the Prometheus API URL. + - name: Query + json: query + type: string + required: true + doc: Query is the PromQL query that must return an 'image' label. + - name: Lookback + json: lookback + type: '*metav1.Duration' + required: false + doc: Lookback is the time window to aggregate over (e.g. "7d", "24h"). When set, uses query_range and sums values to rank by total usage. When unset, uses an instant query (point-in-time). + - name: Step + json: step + type: string + required: false + default: 5m + doc: Step is the query resolution step for range queries. + - name: RegistrySource + doc: RegistrySource defines OCI registry tag listing configuration. + fields: + - name: URL + json: url + type: string + required: true + doc: URL is the registry base URL. + - name: Repositories + json: repositories + type: '[]string' + required: true + doc: Repositories is the list of repositories to query. + - name: TagFilter + json: tagFilter + type: string + required: false + doc: TagFilter is a regex to filter tags. + - name: TopX + json: topX + type: int32 + required: false + doc: TopX limits the number of tags to fetch per repository. 
+ - name: ImageTemplate + json: imageTemplate + type: string + required: false + doc: 'ImageTemplate is a Go text/template for constructing the full image reference. Available variables: .Registry, .Repository, .Tag' + - name: DiscoveredImage + doc: DiscoveredImage represents a single discovered image with metadata. + fields: + - name: Image + json: image + type: string + required: true + doc: Image is the fully qualified image reference. + - name: Score + json: score + type: int64 + required: true + doc: Score is the ranking score from the source (higher = more relevant). + - name: Source + json: source + type: string + required: true + doc: Source identifies which discovery source produced this image. +relationships: + - from: CachedImageSet + to: CachedImage + type: owns + mechanism: ownerReferences + - from: CachedImage + to: Pod + type: creates + mechanism: controller-runtime client + - from: CachedImage + to: PullPolicy + type: references + mechanism: spec.policyRef + - from: CachedImageSet + to: PullPolicy + type: references + mechanism: spec.policyRef + - from: CachedImageSet + to: DiscoveryPolicy + type: references + mechanism: spec.discoveryPolicyRef + - from: DiscoveryPolicy + to: CachedImageSet + type: feeds + mechanism: status.discoveredImages +packages: + - path: api/v1alpha1 + role: Package v1alpha1 contains API Schema definitions for the drop v1alpha1 API group. 
+ - path: internal/controller + role: Reconciler implementations (one per CRD) + imports: + - api/v1alpha1 + - internal/discovery + - internal/metrics + - internal/pacing + - internal/podbuilder + - path: internal/discovery + role: Discovery source interface + implementations + - path: internal/metrics + role: Prometheus metrics registration + - path: internal/pacing + role: Shared pacing engine for rate-limited pulls + imports: + - api/v1alpha1 + - internal/podbuilder + - path: internal/podbuilder + role: Pure Pod construction function (no k8s client) + imports: + - api/v1alpha1 +conventions: + - rule: All CRDs are cluster-scoped + scope: + - code + - use + - rule: Status uses metav1.Condition with type "Ready" + scope: + - code + - use + - rule: No privileged containers — kubelet-based image pulls only + scope: + - code + - rule: Single responsibility reconcilers — one controller per CRD + scope: + - code + - rule: Pod builder is a pure function in internal/podbuilder/ (no k8s client) + scope: + - code + - rule: Pacing logic lives exclusively in internal/pacing/ + scope: + - code + - rule: 'ownerReferences: CachedImageSet→CachedImage, controller→Pod' + scope: + - code + - rule: Table-driven tests preferred; envtest for controllers + scope: + - code + - rule: 'Pods use nodeName placement + command: ["true"]' + scope: + - code + - use + - rule: Don't manually edit generated files — run make docs-gen + scope: + - code +errors: + - reason: Cached + controller: CachedImage + meaning: All target nodes have the image cached + - reason: Degraded + controller: CachedImageSet + meaning: Some child CachedImages have failures + troubleshooting: Check individual CachedImage statuses + - reason: ErrImagePull + controller: CachedImage + meaning: Registry unreachable or image does not exist + troubleshooting: Verify registry DNS, image name, tag. 
Check network policies + - reason: ImagePullBackOff + controller: CachedImage + meaning: Repeated pull failures, kubelet is backing off + troubleshooting: Check imagePullSecrets, registry auth. Verify image exists + - reason: InProgress + controller: CachedImage + meaning: Image pulls are actively running on some nodes + - reason: InvalidImageName + controller: CachedImage + meaning: The image reference is malformed + troubleshooting: 'Check spec.image format: registry/repository' + - reason: PartiallyFailed + controller: DiscoveryPolicy + meaning: Some discovery sources failed to sync + troubleshooting: Check source endpoints and credentials + - reason: PodFailed + controller: CachedImage + meaning: Drop Pod failed for a non-image-pull reason + troubleshooting: Check node health, resource limits, Pod security policies + - reason: Progressing + controller: CachedImageSet + meaning: Children are still being pulled + - reason: PullFailed + controller: CachedImage + meaning: One or more nodes failed to pull the image + troubleshooting: Check image name, tag, registry connectivity, imagePullSecrets + - reason: Ready + controller: CachedImageSet + meaning: All child CachedImages are ready + - reason: RegistryUnavailable + controller: CachedImage + meaning: Cannot connect to the container registry + troubleshooting: Check registry URL, DNS, firewall rules + - reason: SourceError + controller: DiscoveryPolicy + meaning: One or more discovery sources returned errors + troubleshooting: Check source configuration and connectivity + - reason: SyncFailed + controller: DiscoveryPolicy + meaning: All discovery sources failed + troubleshooting: Check all source endpoints, credentials, network + - reason: Synced + controller: DiscoveryPolicy + meaning: All sources synced successfully +metrics: + - name: drop_images_cached_total + help: Total number of images successfully cached on nodes.
+ type: counter + - name: drop_pull_duration_seconds + help: Duration of image pull operations in seconds. + type: histogram + - name: drop_pull_errors_total + help: Total number of failed image pull attempts. + type: counter + - name: drop_discovery_images_found + help: Number of images found by a discovery policy. + type: gauge + - name: drop_active_pulls + help: Current number of active image pull Pods. + type: gauge + - name: drop_reconcile_total + help: Total number of reconciliation attempts. + type: counter + - name: drop_discovery_source_health + help: Whether a discovery source is reachable and queryable (1=healthy, 0=unhealthy). + type: gauge + - name: drop_discovery_source_latency_seconds + help: Latency of discovery source queries in seconds. + type: histogram +makeTargets: + - name: help + desc: Display this help. + - name: build + desc: Build manager binary. + - name: run + desc: Run controller from your host. + - name: fmt + desc: Run go fmt. + - name: vet + desc: Run go vet. + - name: lint + desc: Run golangci-lint. + - name: lint-fix + desc: Run golangci-lint with auto-fix. + - name: generate + desc: Generate DeepCopy methods. + - name: manifests + desc: Generate CRD and RBAC manifests. + - name: codegen + desc: Run all code generation (deepcopy + CRDs + docs). + - name: test + desc: Run unit tests. + - name: test-e2e + desc: Run Chainsaw E2E tests (requires kind cluster). + - name: kind-create + desc: Create kind cluster for development. + - name: kind-delete + desc: Delete the kind cluster. + - name: install + desc: Install CRDs into cluster. + - name: uninstall + desc: Uninstall CRDs from cluster. + - name: e2e-infra + desc: Deploy Prometheus + Registry for E2E/dev. + - name: docker-build + desc: Build docker image. + - name: docker-push + desc: Push docker image. + - name: kind-load + desc: Build and load image into kind. + - name: helm-lint + desc: Lint the Helm chart. + - name: helm-template + desc: Render Helm templates locally. 
+ - name: docs-serve + desc: Serve Hugo docs locally. + - name: docs-gen + desc: Regenerate AI agent docs (llms.txt, instructions, etc.) from source. + - name: docs-gen-check + desc: Verify generated AI docs are up to date. + - name: tools + desc: Install local tooling and check optional docs/chart binaries. +samples: | + # Dev samples: deployed by Tilt for interactive testing + --- + # === PullPolicy === + apiVersion: drop.corewire.io/v1alpha1 + kind: PullPolicy + metadata: + name: dev-conservative + spec: + maxConcurrentNodes: 1 + minDelayBetweenPulls: 5s + repullInterval: 1h + failureBackoff: + initial: 30s + max: 5m + --- + # === CachedImage: healthy === + apiVersion: drop.corewire.io/v1alpha1 + kind: CachedImage + metadata: + name: dev-nginx + spec: + image: docker.io/library/nginx + tag: "1.25-alpine" + policyRef: + name: dev-conservative + --- + apiVersion: drop.corewire.io/v1alpha1 + kind: CachedImage + metadata: + name: dev-redis + spec: + image: docker.io/library/redis + tag: "7-alpine" + policyRef: + name: dev-conservative + --- + # === CachedImage: broken (DNS failure → ImagePullBackOff) === + apiVersion: drop.corewire.io/v1alpha1 + kind: CachedImage + metadata: + name: test-invalid-image + spec: + image: registry.invalid.local:9999/does-not-exist + tag: "nope" + policyRef: + name: dev-conservative + --- + # === CachedImageSet: healthy (static images) === + apiVersion: drop.corewire.io/v1alpha1 + kind: CachedImageSet + metadata: + name: dev-set + spec: + policyRef: + name: dev-conservative + images: + - image: docker.io/library/alpine + tag: "3.19" + - image: docker.io/library/busybox + tag: "1.36" + --- + # === CachedImageSet: dynamic (backed by DiscoveryPolicy) === + apiVersion: drop.corewire.io/v1alpha1 + kind: CachedImageSet + metadata: + name: dev-set-discovered + spec: + policyRef: + name: dev-conservative + discoveryPolicyRef: + name: dev-registry + --- + # === DiscoveryPolicy: healthy (Prometheus range query) === + apiVersion: 
drop.corewire.io/v1alpha1 + kind: DiscoveryPolicy + metadata: + name: dev-prometheus + spec: + sources: + - type: prometheus + prometheus: + endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090" + query: 'count(container_memory_working_set_bytes{container!="", namespace="build-stuff"}) by (image)' + lookback: 24h + step: 5m + syncInterval: 30s + maxImages: 10 + --- + # === DiscoveryPolicy: healthy (registry tag listing) === + apiVersion: drop.corewire.io/v1alpha1 + kind: DiscoveryPolicy + metadata: + name: dev-registry + spec: + sources: + - type: registry + registry: + url: "http://registry.e2e-infra.svc.cluster.local:5000" + repositories: + - "test/myapp" + topX: 3 + syncInterval: 30s + maxImages: 10 + --- + # === DiscoveryPolicy: broken (DNS error → DNSError) === + apiVersion: drop.corewire.io/v1alpha1 + kind: DiscoveryPolicy + metadata: + name: test-broken-prom + spec: + sources: + - type: prometheus + prometheus: + endpoint: "http://nonexistent-prometheus:9090" + query: "up{}" + syncInterval: 30m + maxImages: 10 + --- + # === DiscoveryPolicy: broken (DNS error → DNSError) === + apiVersion: drop.corewire.io/v1alpha1 + kind: DiscoveryPolicy + metadata: + name: test-broken-registry + spec: + sources: + - type: registry + registry: + url: "http://nonexistent-registry:5000" + repositories: + - "test/nope" + syncInterval: 30m + maxImages: 10 + --- + # === DiscoveryPolicy: broken (repo doesn't exist → NotFound) === + apiVersion: drop.corewire.io/v1alpha1 + kind: DiscoveryPolicy + metadata: + name: test-notfound-repo + spec: + sources: + - type: registry + registry: + url: "http://registry.e2e-infra.svc.cluster.local:5000" + repositories: + - "this/does-not-exist" + syncInterval: 30m + maxImages: 10 diff --git a/llms-full.txt b/llms-full.txt new file mode 100644 index 0000000..e3edc2c --- /dev/null +++ b/llms-full.txt @@ -0,0 +1,426 @@ +# drop — Full Reference for AI Agents + +## Project + +- **Name**: drop +- **Language**: Go 1.23.0 +- **Module**: 
github.com/Breee/drop +- **API Group**: drop.corewire.io/v1alpha1 +- **Scope**: All CRDs cluster-scoped +- **License**: Apache-2.0 +- **Framework**: Kubebuilder / controller-runtime + +## CRD Field Reference + +### CachedImage + +CachedImage is the Schema for the cachedimages API. + +Controller: internal/controller/cachedimage_controller.go | Test: internal/controller/cachedimage_controller_test.go + +#### Spec +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| Image | `image` | `string` | ✓ | | Image is the fully qualified image reference (registry/repository). | +| Tag | `tag` | `string` | — | | Tag to pull. Mutually exclusive with Digest. | +| Digest | `digest` | `string` | — | | Digest to pull (immutable reference). Mutually exclusive with Tag. | +| ImagePullPolicy | `imagePullPolicy` | `corev1.PullPolicy` | — | `Always` | ImagePullPolicy controls when kubelet pulls the image. Defaults to Always (checks upstream digest, only downloads if changed). Set to IfNotPresent to skip the registry check when the tag already exists locally. Enum: `Always`,`IfNotPresent`,`Never` | +| ImagePullSecrets | `imagePullSecrets` | `[]corev1.LocalObjectReference` | — | | ImagePullSecrets are references to secrets for pulling from private registries. | +| NodeSelector | `nodeSelector` | `map[string]string` | — | | NodeSelector restricts which nodes to cache the image on. | +| Tolerations | `tolerations` | `[]corev1.Toleration` | — | | Tolerations allow targeting tainted nodes. | +| Priority | `priority` | `*int32` | — | | Priority is a pull ordering hint (lower values pulled first). | +| PolicyRef | `policyRef` | `*PolicyReference` | — | | PolicyRef references a PullPolicy for pacing controls. | + +#### Status +| Field | JSON | Type | Description | +|-------|------|------|-------------| +| ObservedGeneration | `observedGeneration` | `int64` | ObservedGeneration is the last generation reconciled. 
| +| Phase | `phase` | `string` | Phase summarizes the overall state. | +| Ready | `ready` | `string` | Ready is a human-readable "nodesReady/nodesTargeted" fraction for display. | +| ResolvedDigest | `resolvedDigest` | `string` | ResolvedDigest is the sha256 digest of the image as reported by the container runtime after pull. | +| NodesTargeted | `nodesTargeted` | `int32` | NodesTargeted is the number of nodes that should have this image. | +| NodesReady | `nodesReady` | `int32` | NodesReady is the number of nodes that have successfully pulled the image. | +| CachedNodes | `cachedNodes` | `[]string` | CachedNodes is the list of node names that have successfully cached the image. | +| ConsecutiveFailures | `consecutiveFailures` | `int32` | ConsecutiveFailures counts sequential reconcile failures for backoff calculation. | +| LastPulledAt | `lastPulledAt` | `*metav1.Time` | LastPulledAt is the timestamp of the most recent successful pull. | +| LastAttemptedAt | `lastAttemptedAt` | `*metav1.Time` | LastAttemptedAt is the timestamp of the most recent pull attempt (success or failure). | +| Conditions | `conditions` | `[]metav1.Condition` | Conditions represent the latest available observations. | + + +### CachedImageSet + +CachedImageSet is the Schema for the cachedimagesets API. + +Controller: internal/controller/cachedimageset_controller.go | Test: internal/controller/cachedimageset_controller_test.go + +#### Spec +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| PolicyRef | `policyRef` | `*PolicyReference` | — | | PolicyRef references a PullPolicy for pacing controls. | +| DiscoveryPolicyRef | `discoveryPolicyRef` | `*DiscoveryPolicyReference` | — | | DiscoveryPolicyRef references a DiscoveryPolicy for dynamic image lists. | +| ImagePullPolicy | `imagePullPolicy` | `corev1.PullPolicy` | — | `Always` | ImagePullPolicy controls when kubelet pulls the image (propagated to children). 
Enum: `Always`,`IfNotPresent`,`Never` | +| ImagePullSecrets | `imagePullSecrets` | `[]corev1.LocalObjectReference` | — | | ImagePullSecrets are references to secrets for pulling from private registries (propagated to children). | +| NodeSelector | `nodeSelector` | `map[string]string` | — | | NodeSelector restricts which nodes to cache images on (propagated to children). | +| Tolerations | `tolerations` | `[]corev1.Toleration` | — | | Tolerations allow targeting tainted nodes (propagated to children). | +| Images | `images` | `[]ImageEntry` | — | | Images is a static list of images to cache. | + +#### Status +| Field | JSON | Type | Description | +|-------|------|------|-------------| +| ObservedGeneration | `observedGeneration` | `int64` | ObservedGeneration is the last generation reconciled. | +| Phase | `phase` | `string` | Phase summarizes the overall state. | +| ImagesManaged | `imagesManaged` | `int32` | ImagesManaged is the number of CachedImage children managed by this set. | +| ImagesReady | `imagesReady` | `int32` | ImagesReady is the number of children in Ready phase. | +| Conditions | `conditions` | `[]metav1.Condition` | Conditions represent the latest available observations. | + + +### PullPolicy + +PullPolicy is the Schema for the pullpolicies API. It is a configuration-only resource with no status. + +#### Spec +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| MaxConcurrentNodes | `maxConcurrentNodes` | `int32` | — | `1` | MaxConcurrentNodes is the max nodes pulling simultaneously for this policy. | +| MinDelayBetweenPulls | `minDelayBetweenPulls` | `metav1.Duration` | — | `10s` | MinDelayBetweenPulls is the minimum time between starting pulls on different nodes. | +| FailureBackoff | `failureBackoff` | `*BackoffConfig` | — | | FailureBackoff configures retry delays on pull failures. 
| +| RepullInterval | `repullInterval` | `*metav1.Duration` | — | | RepullInterval is how often to re-pull cached images. Zero or unset means never re-pull. | +| NodeSelector | `nodeSelector` | `map[string]string` | — | | NodeSelector scopes this policy to a specific node pool. | +| Tolerations | `tolerations` | `[]corev1.Toleration` | — | | Tolerations match tainted nodes in the pool. | + + +### DiscoveryPolicy + +DiscoveryPolicy is the Schema for the discoverypolicies API. + +Controller: internal/controller/discoverypolicy_controller.go | Test: internal/controller/discoverypolicy_controller_test.go + +#### Spec +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| Sources | `sources` | `[]DiscoverySource` | ✓ | | Sources is the list of discovery backends to query. | +| ImageFilter | `imageFilter` | `string` | — | | ImageFilter is a regex to filter discovered images. | +| SyncInterval | `syncInterval` | `metav1.Duration` | — | `30m` | SyncInterval is how often to re-query sources. | +| MaxImages | `maxImages` | `int32` | — | `50` | MaxImages caps the number of discovered images. | + +#### Status +| Field | JSON | Type | Description | +|-------|------|------|-------------| +| LastSyncTime | `lastSyncTime` | `*metav1.Time` | LastSyncTime is the timestamp of the last successful sync. | +| DiscoveredImages | `discoveredImages` | `[]DiscoveredImage` | DiscoveredImages is the list of discovered images from all sources. | +| ImageCount | `imageCount` | `int32` | ImageCount is the number of discovered images. | +| SourceCount | `sourceCount` | `int32` | SourceCount is the number of configured sources. | +| Conditions | `conditions` | `[]metav1.Condition` | Conditions represent the latest available observations. | + + + +## Helper Types + +### PolicyReference + +PolicyReference is a reference to a PullPolicy resource. 
+ +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| Name | `name` | `string` | ✓ | | Name of the PullPolicy resource. | + +### DiscoveryPolicyReference + +DiscoveryPolicyReference is a reference to a DiscoveryPolicy resource. + +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| Name | `name` | `string` | ✓ | | Name of the DiscoveryPolicy resource. | + +### ImageEntry + +ImageEntry defines a single image to include in a set. + +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| Image | `image` | `string` | ✓ | | Image is the fully qualified image reference (registry/repository). | +| Tag | `tag` | `string` | — | | Tag to pull. | +| Digest | `digest` | `string` | — | | Digest to pull. | + +### BackoffConfig + +BackoffConfig defines retry backoff behavior. + +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| Initial | `initial` | `metav1.Duration` | — | `30s` | Initial delay before first retry. | +| Max | `max` | `metav1.Duration` | — | `5m` | Max delay cap for exponential backoff. | + +### DiscoverySource + +DiscoverySource defines a single discovery backend. + +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| Type | `type` | `string` | ✓ | | Type identifies the backend. Enum: `prometheus`,`registry` | +| Prometheus | `prometheus` | `*PrometheusSource` | — | | Prometheus config (when type=prometheus). | +| Registry | `registry` | `*RegistrySource` | — | | Registry config (when type=registry). | +| SecretRef | `secretRef` | `*corev1.LocalObjectReference` | — | | SecretRef references a Secret for auth/TLS for this source. | + +### PrometheusSource + +PrometheusSource defines Prometheus query configuration. 
+ +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| Endpoint | `endpoint` | `string` | ✓ | | Endpoint is the Prometheus API URL. | +| Query | `query` | `string` | ✓ | | Query is the PromQL query that must return an 'image' label. | +| Lookback | `lookback` | `*metav1.Duration` | — | | Lookback is the time window to aggregate over (e.g. "168h", "24h"). When set, uses query_range and sums values to rank by total usage. When unset, uses an instant query (point-in-time). | +| Step | `step` | `string` | — | `5m` | Step is the query resolution step for range queries. | + +### RegistrySource + +RegistrySource defines OCI registry tag listing configuration. + +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| URL | `url` | `string` | ✓ | | URL is the registry base URL. | +| Repositories | `repositories` | `[]string` | ✓ | | Repositories is the list of repositories to query. | +| TagFilter | `tagFilter` | `string` | — | | TagFilter is a regex to filter tags. | +| TopX | `topX` | `int32` | — | | TopX limits the number of tags to fetch per repository. | +| ImageTemplate | `imageTemplate` | `string` | — | | ImageTemplate is a Go text/template for constructing the full image reference. Available variables: .Registry, .Repository, .Tag | + +### DiscoveredImage + +DiscoveredImage represents a single discovered image with metadata. + +| Field | JSON | Type | Required | Default | Description | +|-------|------|------|----------|---------|-------------| +| Image | `image` | `string` | ✓ | | Image is the fully qualified image reference. | +| Score | `score` | `int64` | ✓ | | Score is the ranking score from the source (higher = more relevant). | +| Source | `source` | `string` | ✓ | | Source identifies which discovery source produced this image.
| + + +## Relationships + +```mermaid +graph LR + CachedImageSet -->|owns| CachedImage + CachedImage -->|creates| Pod + CachedImage -->|references| PullPolicy + CachedImageSet -->|references| PullPolicy + CachedImageSet -->|references| DiscoveryPolicy + DiscoveryPolicy -->|feeds| CachedImageSet +``` + +## Status Conditions & Error Reasons + +| Reason | Controller | Meaning | Troubleshooting | +|--------|-----------|---------|-----------------| +| Cached | CachedImage | All target nodes have the image cached | | +| Degraded | CachedImageSet | Some child CachedImages have failures | Check individual CachedImage statuses | +| ErrImagePull | CachedImage | Registry unreachable or image does not exist | Verify registry DNS, image name, tag. Check network policies | +| ImagePullBackOff | CachedImage | Repeated pull failures, kubelet is backing off | Check imagePullSecrets, registry auth. Verify image exists | +| InProgress | CachedImage | Image pulls are actively running on some nodes | | +| InvalidImageName | CachedImage | The image reference is malformed | Check spec.image format: registry/repository | +| PartiallyFailed | DiscoveryPolicy | Some discovery sources failed to sync | Check source endpoints and credentials | +| PodFailed | CachedImage | Drop Pod failed for a non-image-pull reason | Check node health, resource limits, Pod security policies | +| Progressing | CachedImageSet | Children are still being pulled | | +| PullFailed | CachedImage | One or more nodes failed to pull the image | Check image name, tag, registry connectivity, imagePullSecrets | +| Ready | CachedImageSet | All child CachedImages are ready | | +| RegistryUnavailable | CachedImage | Cannot connect to the container registry | Check registry URL, DNS, firewall rules | +| SourceError | DiscoveryPolicy | One or more discovery sources returned errors | Check source configuration and connectivity | +| SyncFailed | DiscoveryPolicy | All discovery sources failed | Check all source endpoints, 
credentials, network | +| Synced | DiscoveryPolicy | All sources synced successfully | | + +## Metrics + +| Name | Type | Description | +|------|------|-------------| +| `drop_images_cached_total` | counter | Total number of images successfully cached on nodes. | +| `drop_pull_duration_seconds` | histogram | Duration of image pull operations in seconds. | +| `drop_pull_errors_total` | counter | Total number of failed image pull attempts. | +| `drop_discovery_images_found` | gauge | Number of images found by a discovery policy. | +| `drop_active_pulls` | gauge | Current number of active image pull Pods. | +| `drop_reconcile_total` | counter | Total number of reconciliation attempts. | +| `drop_discovery_source_health` | gauge | Whether a discovery source is reachable and queryable (1=healthy, 0=unhealthy). | +| `drop_discovery_source_latency_seconds` | histogram | Latency of discovery source queries in seconds. | + +## Sample CRs + +```yaml +# Dev samples: deployed by Tilt for interactive testing +--- +# === PullPolicy === +apiVersion: drop.corewire.io/v1alpha1 +kind: PullPolicy +metadata: + name: dev-conservative +spec: + maxConcurrentNodes: 1 + minDelayBetweenPulls: 5s + repullInterval: 1h + failureBackoff: + initial: 30s + max: 5m +--- +# === CachedImage: healthy === +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: dev-nginx +spec: + image: docker.io/library/nginx + tag: "1.25-alpine" + policyRef: + name: dev-conservative +--- +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: dev-redis +spec: + image: docker.io/library/redis + tag: "7-alpine" + policyRef: + name: dev-conservative +--- +# === CachedImage: broken (DNS failure → ImagePullBackOff) === +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: test-invalid-image +spec: + image: registry.invalid.local:9999/does-not-exist + tag: "nope" + policyRef: + name: dev-conservative +--- +# === CachedImageSet: healthy (static images) === 
+apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImageSet +metadata: + name: dev-set +spec: + policyRef: + name: dev-conservative + images: + - image: docker.io/library/alpine + tag: "3.19" + - image: docker.io/library/busybox + tag: "1.36" +--- +# === CachedImageSet: dynamic (backed by DiscoveryPolicy) === +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImageSet +metadata: + name: dev-set-discovered +spec: + policyRef: + name: dev-conservative + discoveryPolicyRef: + name: dev-registry +--- +# === DiscoveryPolicy: healthy (Prometheus range query) === +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: dev-prometheus +spec: + sources: + - type: prometheus + prometheus: + endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090" + query: 'count(container_memory_working_set_bytes{container!="", namespace="build-stuff"}) by (image)' + lookback: 24h + step: 5m + syncInterval: 30s + maxImages: 10 +--- +# === DiscoveryPolicy: healthy (registry tag listing) === +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: dev-registry +spec: + sources: + - type: registry + registry: + url: "http://registry.e2e-infra.svc.cluster.local:5000" + repositories: + - "test/myapp" + topX: 3 + syncInterval: 30s + maxImages: 10 +--- +# === DiscoveryPolicy: broken (DNS error → DNSError) === +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: test-broken-prom +spec: + sources: + - type: prometheus + prometheus: + endpoint: "http://nonexistent-prometheus:9090" + query: "up{}" + syncInterval: 30m + maxImages: 10 +--- +# === DiscoveryPolicy: broken (DNS error → DNSError) === +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: test-broken-registry +spec: + sources: + - type: registry + registry: + url: "http://nonexistent-registry:5000" + repositories: + - "test/nope" + syncInterval: 30m + maxImages: 10 +--- +# === DiscoveryPolicy: broken (repo doesn't exist → NotFound) 
=== +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: test-notfound-repo +spec: + sources: + - type: registry + registry: + url: "http://registry.e2e-infra.svc.cluster.local:5000" + repositories: + - "this/does-not-exist" + syncInterval: 30m + maxImages: 10 + +``` + +## Build & Test + +``` + make help # Display this help. + make build # Build manager binary. + make run # Run controller from your host. + make fmt # Run go fmt. + make vet # Run go vet. + make lint # Run golangci-lint. + make lint-fix # Run golangci-lint with auto-fix. + make generate # Generate DeepCopy methods. + make manifests # Generate CRD and RBAC manifests. + make codegen # Run all code generation (deepcopy + CRDs + docs). + make test # Run unit tests. + make test-e2e # Run Chainsaw E2E tests (requires kind cluster). + make kind-create # Create kind cluster for development. + make kind-delete # Delete the kind cluster. + make install # Install CRDs into cluster. + make uninstall # Uninstall CRDs from cluster. + make e2e-infra # Deploy Prometheus + Registry for E2E/dev. + make docker-build # Build docker image. + make docker-push # Push docker image. + make kind-load # Build and load image into kind. + make helm-lint # Lint the Helm chart. + make helm-template # Render Helm templates locally. + make docs-serve # Serve Hugo docs locally. + make docs-gen # Regenerate AI agent docs (llms.txt, instructions, etc.) from source. + make docs-gen-check # Verify generated AI docs are up to date. + make tools # Install local tooling and check optional docs/chart binaries. +``` diff --git a/llms.txt b/llms.txt new file mode 100644 index 0000000..ab98ef7 --- /dev/null +++ b/llms.txt @@ -0,0 +1,144 @@ +# drop — Kubernetes operator that pre-caches container images on cluster nodes + +> API group: drop.corewire.io/v1alpha1 | Go 1.23.0 | All CRDs cluster-scoped + +## CRDs + +| Kind | Purpose | +|------|---------| +| CachedImage | CachedImage is the Schema for the cachedimages API. 
| +| CachedImageSet | CachedImageSet is the Schema for the cachedimagesets API. | +| PullPolicy | PullPolicy is the Schema for the pullpolicies API. It is a configuration-only resource with no status. | +| DiscoveryPolicy | DiscoveryPolicy is the Schema for the discoverypolicies API. | + +## Architecture + +Short-lived Pods with `nodeName` + `command: ["true"]` trigger image pulls via kubelet. No privileged containers. + +Reconcilers: +- CachedImage → internal/controller/cachedimage_controller.go +- CachedImageSet → internal/controller/cachedimageset_controller.go +- DiscoveryPolicy → internal/controller/discoverypolicy_controller.go + +## Key Directories + +| Path | Role | +|------|------| +| api/v1alpha1 | Package v1alpha1 contains API Schema definitions for the drop v1alpha1 API group. | +| internal/controller | Reconciler implementations (one per CRD) | +| internal/discovery | Discovery source interface + implementations | +| internal/metrics | Prometheus metrics registration | +| internal/pacing | Shared pacing engine for rate-limited pulls | +| internal/podbuilder | Pure Pod construction function (no k8s client) | +| charts/drop/ | Helm chart | +| test/e2e/ | Chainsaw E2E tests | +| hack/gen-ai-docs/ | Documentation generator | + +## Build & Test + +``` + make help # Display this help. + make build # Build manager binary. + make run # Run controller from your host. + make fmt # Run go fmt. + make vet # Run go vet. + make lint # Run golangci-lint. + make lint-fix # Run golangci-lint with auto-fix. + make generate # Generate DeepCopy methods. + make manifests # Generate CRD and RBAC manifests. + make codegen # Run all code generation (deepcopy + CRDs + docs). + make test # Run unit tests. + make test-e2e # Run Chainsaw E2E tests (requires kind cluster). + make kind-create # Create kind cluster for development. + make kind-delete # Delete the kind cluster. + make install # Install CRDs into cluster. + make uninstall # Uninstall CRDs from cluster. 
+ make e2e-infra # Deploy Prometheus + Registry for E2E/dev. + make docker-build # Build docker image. + make docker-push # Push docker image. + make kind-load # Build and load image into kind. + make helm-lint # Lint the Helm chart. + make helm-template # Render Helm templates locally. + make docs-serve # Serve Hugo docs locally. + make docs-gen # Regenerate AI agent docs (llms.txt, instructions, etc.) from source. + make docs-gen-check # Verify generated AI docs are up to date. + make tools # Install local tooling and check optional docs/chart binaries. +``` + +## CRD Quick Reference + +### CachedImage + +CachedImage is the Schema for the cachedimages API. + +**Spec fields:** `image`, `tag`, `digest`, `imagePullPolicy` (default: Always), `imagePullSecrets`, `nodeSelector`, `tolerations`, `priority`, `policyRef`, +**Status fields:** `observedGeneration`, `phase`, `ready`, `resolvedDigest`, `nodesTargeted`, `nodesReady`, `cachedNodes`, `consecutiveFailures`, `lastPulledAt`, `lastAttemptedAt`, `conditions`, + +### CachedImageSet + +CachedImageSet is the Schema for the cachedimagesets API. + +**Spec fields:** `policyRef`, `discoveryPolicyRef`, `imagePullPolicy` (default: Always), `imagePullSecrets`, `nodeSelector`, `tolerations`, `images`, +**Status fields:** `observedGeneration`, `phase`, `imagesManaged`, `imagesReady`, `conditions`, + +### PullPolicy + +PullPolicy is the Schema for the pullpolicies API. It is a configuration-only resource with no status. + +**Spec fields:** `maxConcurrentNodes` (default: 1), `minDelayBetweenPulls` (default: 10s), `failureBackoff`, `repullInterval`, `nodeSelector`, `tolerations`, + +### DiscoveryPolicy + +DiscoveryPolicy is the Schema for the discoverypolicies API. 
+ +**Spec fields:** `sources`, `imageFilter`, `syncInterval` (default: 30m), `maxImages` (default: 50), +**Status fields:** `lastSyncTime`, `discoveredImages`, `imageCount`, `sourceCount`, `conditions`, + + +## Status Condition Reasons + +| Reason | Controller | Meaning | +|--------|-----------|---------| +| Cached | CachedImage | All target nodes have the image cached | +| Degraded | CachedImageSet | Some child CachedImages have failures | +| ErrImagePull | CachedImage | Registry unreachable or image does not exist | +| ImagePullBackOff | CachedImage | Repeated pull failures, kubelet is backing off | +| InProgress | CachedImage | Image pulls are actively running on some nodes | +| InvalidImageName | CachedImage | The image reference is malformed | +| PartiallyFailed | DiscoveryPolicy | Some discovery sources failed to sync | +| PodFailed | CachedImage | Drop Pod failed for a non-image-pull reason | +| Progressing | CachedImageSet | Children are still being pulled | +| PullFailed | CachedImage | One or more nodes failed to pull the image | +| Ready | CachedImageSet | All child CachedImages are ready | +| RegistryUnavailable | CachedImage | Cannot connect to the container registry | +| SourceError | DiscoveryPolicy | One or more discovery sources returned errors | +| SyncFailed | DiscoveryPolicy | All discovery sources failed | +| Synced | DiscoveryPolicy | All sources synced successfully | + +## Metrics +- `drop_images_cached_total` (counter) — Total number of images successfully cached on nodes. +- `drop_pull_duration_seconds` (histogram) — Duration of image pull operations in seconds. +- `drop_pull_errors_total` (counter) — Total number of failed image pull attempts. +- `drop_discovery_images_found` (gauge) — Number of images found by a discovery policy. +- `drop_active_pulls` (gauge) — Current number of active image pull Pods. +- `drop_reconcile_total` (counter) — Total number of reconciliation attempts. 
+- `drop_discovery_source_health` (gauge) — Whether a discovery source is reachable and queryable (1=healthy, 0=unhealthy). +- `drop_discovery_source_latency_seconds` (histogram) — Latency of discovery source queries in seconds. + +## Full Reference + +See [llms-full.txt](llms-full.txt) for complete field documentation with types and examples. + +## Documentation Pages + +| Page | llmsDescription | +|------|-----------------| +| [Installation](docs/install/) | Install via Helm. Requires K8s 1.28+. | +| [Usage](docs/usage/) | CachedImage, CachedImageSet, PullPolicy examples with YAML. | +| [Discovery](docs/discovery/) | DiscoveryPolicy for automatic image discovery from Prometheus/OCI registries. | +| [Monitoring](docs/monitoring/) | Prometheus metrics, Kubernetes events, and status conditions. | +| [CRD Reference](docs/reference/crds/) | Complete field reference for all drop CRDs with types, defaults, and validation. | +| [Status & Errors](docs/reference/errors/) | Every condition reason emitted by controllers. Diagnose why resources are not Ready. | +| [Metrics](docs/reference/metrics/) | Prometheus metrics: names, types, descriptions, and example PromQL queries. | +| [Architecture](docs/reference/architecture/) | Package dependency graph and CRD ownership relationships. | +| [Developing](docs/developing/) | Build, test, lint, project structure for contributors. 
| diff --git a/renovate.json b/renovate.json new file mode 100644 index 0000000..3c99d7f --- /dev/null +++ b/renovate.json @@ -0,0 +1,18 @@ +{ + "$schema": "https://docs.renovatebot.com/renovate-schema.json", + "extends": [ + "config:recommended" + ], + "packageRules": [ + { + "description": "Automerge minor, patch, and digest updates", + "matchUpdateTypes": ["minor", "patch", "digest"], + "automerge": true + }, + { + "description": "Major updates require manual approval", + "matchUpdateTypes": ["major"], + "automerge": false + } + ] +} diff --git a/test/e2e/README.md b/test/e2e/README.md new file mode 100644 index 0000000..70b9987 --- /dev/null +++ b/test/e2e/README.md @@ -0,0 +1,25 @@ +# Chainsaw E2E Tests + +This directory contains scenario-based E2E tests using [Kyverno Chainsaw](https://kyverno.github.io/chainsaw/). + +## Prerequisites + +- A running Kind cluster with the operator deployed +- `chainsaw` binary installed (`make chainsaw`) + +## Running + +```bash +# From repo root +make test-e2e +``` + +## Test Scenarios + +| Directory | Description | +|-----------|-------------| +| `cachedimage-basic/` | Basic CachedImage creation and pod scheduling | +| `cachedimage-pacing/` | PullPolicy pacing enforcement | +| `cachedimageset/` | CachedImageSet managing child resources | +| `discovery/` | DiscoveryPolicy with mock Prometheus | +| `cachedimage-failure/` | Failure backoff behavior | diff --git a/test/e2e/cachedimage-basic/01-cachedimage.yaml b/test/e2e/cachedimage-basic/01-cachedimage.yaml new file mode 100644 index 0000000..950f302 --- /dev/null +++ b/test/e2e/cachedimage-basic/01-cachedimage.yaml @@ -0,0 +1,8 @@ +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: test-nginx +spec: + image: docker.io/library/nginx + tag: "1.25-alpine" + imagePullPolicy: IfNotPresent diff --git a/test/e2e/cachedimage-basic/02-assert-pod.yaml b/test/e2e/cachedimage-basic/02-assert-pod.yaml new file mode 100644 index 0000000..db2b432
--- /dev/null +++ b/test/e2e/cachedimage-basic/02-assert-pod.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: Pod +metadata: + namespace: drop-system + labels: + app.kubernetes.io/managed-by: drop + drop.corewire.io/cachedimage: test-nginx +spec: + containers: + - name: pull + image: docker.io/library/nginx:1.25-alpine + command: ["true"] diff --git a/test/e2e/cachedimage-basic/03-assert-status.yaml b/test/e2e/cachedimage-basic/03-assert-status.yaml new file mode 100644 index 0000000..c9112ed --- /dev/null +++ b/test/e2e/cachedimage-basic/03-assert-status.yaml @@ -0,0 +1,10 @@ +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: test-nginx +status: + phase: Ready + nodesReady: 2 + (conditions[?type == 'Ready']): + - status: "True" + reason: Cached diff --git a/test/e2e/cachedimage-basic/chainsaw-test.yaml b/test/e2e/cachedimage-basic/chainsaw-test.yaml new file mode 100644 index 0000000..56aa09b --- /dev/null +++ b/test/e2e/cachedimage-basic/chainsaw-test.yaml @@ -0,0 +1,54 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: cachedimage-basic +spec: + description: | + Verify that creating a CachedImage resource causes the operator to create + a drop Pod on a target node, and that status transitions to Ready on success. 
+ steps: + - name: Create CachedImage + try: + - apply: + file: 01-cachedimage.yaml + - name: Verify drop Pod is created + try: + - assert: + file: 02-assert-pod.yaml + - name: Wait for Ready status + try: + - script: + timeout: 90s + content: | + deadline=$(( $(date +%s) + 90 )) + while [ "$(date +%s)" -lt "$deadline" ]; do + phase=$(kubectl get cachedimage test-nginx -o jsonpath='{.status.phase}' 2>/dev/null || true) + nodes_ready=$(kubectl get cachedimage test-nginx -o jsonpath='{.status.nodesReady}' 2>/dev/null || true) + nodes_targeted=$(kubectl get cachedimage test-nginx -o jsonpath='{.status.nodesTargeted}' 2>/dev/null || true) + + case "$nodes_ready" in + ''|*[!0-9]*) nodes_ready=0 ;; + esac + case "$nodes_targeted" in + ''|*[!0-9]*) nodes_targeted=0 ;; + esac + + if [ "$nodes_targeted" -ge 1 ] && [ "$nodes_ready" = "$nodes_targeted" ] && [ "$phase" = "Ready" ]; then + echo "OK: CachedImage reached Ready with $nodes_ready/$nodes_targeted target nodes" + exit 0 + fi + + sleep 2 + done + + kubectl get cachedimage test-nginx -o yaml + echo "FAIL: CachedImage did not reach Ready on all targeted nodes" + exit 1 + - name: Cleanup + try: + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: CachedImage + name: test-nginx diff --git a/test/e2e/cachedimage-failure/01-pullpolicy.yaml b/test/e2e/cachedimage-failure/01-pullpolicy.yaml new file mode 100644 index 0000000..fe403cc --- /dev/null +++ b/test/e2e/cachedimage-failure/01-pullpolicy.yaml @@ -0,0 +1,10 @@ +apiVersion: drop.corewire.io/v1alpha1 +kind: PullPolicy +metadata: + name: test-backoff-policy +spec: + maxConcurrentNodes: 1 + minDelayBetweenPulls: 1s + failureBackoff: + initial: 10s + max: 1m diff --git a/test/e2e/cachedimage-failure/02-cachedimage-broken.yaml b/test/e2e/cachedimage-failure/02-cachedimage-broken.yaml new file mode 100644 index 0000000..dce78a9 --- /dev/null +++ b/test/e2e/cachedimage-failure/02-cachedimage-broken.yaml @@ -0,0 +1,9 @@ +apiVersion: drop.corewire.io/v1alpha1 
+kind: CachedImage +metadata: + name: test-broken-image +spec: + image: registry.invalid.local:9999/does-not-exist + tag: "nope" + policyRef: + name: test-backoff-policy diff --git a/test/e2e/cachedimage-failure/03-assert-degraded.yaml b/test/e2e/cachedimage-failure/03-assert-degraded.yaml new file mode 100644 index 0000000..fc9d928 --- /dev/null +++ b/test/e2e/cachedimage-failure/03-assert-degraded.yaml @@ -0,0 +1,9 @@ +# Assert CachedImage transitions to Degraded with a pull failure reason. +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: test-broken-image +status: + phase: Degraded + (conditions[?type == 'Ready']): + - status: "False" diff --git a/test/e2e/cachedimage-failure/04-assert-backoff.yaml b/test/e2e/cachedimage-failure/04-assert-backoff.yaml new file mode 100644 index 0000000..d5b4c81 --- /dev/null +++ b/test/e2e/cachedimage-failure/04-assert-backoff.yaml @@ -0,0 +1,7 @@ +# Assert consecutiveFailures is being tracked (at least 1 failure recorded). +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: test-broken-image +status: + (consecutiveFailures > `0`): true diff --git a/test/e2e/cachedimage-failure/chainsaw-test.yaml b/test/e2e/cachedimage-failure/chainsaw-test.yaml new file mode 100644 index 0000000..f6b58b2 --- /dev/null +++ b/test/e2e/cachedimage-failure/chainsaw-test.yaml @@ -0,0 +1,40 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: cachedimage-failure +spec: + description: | + Verify that a CachedImage with an unreachable registry transitions to + Degraded phase with an appropriate reason (ErrImagePull/ImagePullBackOff). 
+ steps: + - name: Create PullPolicy + try: + - apply: + file: 01-pullpolicy.yaml + - name: Create broken CachedImage + try: + - apply: + file: 02-cachedimage-broken.yaml + - name: Wait for Degraded status with failure reason + try: + - assert: + timeout: 120s + file: 03-assert-degraded.yaml + - name: Verify consecutiveFailures is tracked + try: + - assert: + timeout: 30s + file: 04-assert-backoff.yaml + - name: Cleanup + try: + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: CachedImage + name: test-broken-image + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: PullPolicy + name: test-backoff-policy diff --git a/test/e2e/cachedimage-pacing/01-pullpolicy.yaml b/test/e2e/cachedimage-pacing/01-pullpolicy.yaml new file mode 100644 index 0000000..26db3e1 --- /dev/null +++ b/test/e2e/cachedimage-pacing/01-pullpolicy.yaml @@ -0,0 +1,10 @@ +apiVersion: drop.corewire.io/v1alpha1 +kind: PullPolicy +metadata: + name: test-conservative +spec: + maxConcurrentNodes: 1 + minDelayBetweenPulls: 5s + failureBackoff: + initial: 30s + max: 5m diff --git a/test/e2e/cachedimage-pacing/02-cachedimage.yaml b/test/e2e/cachedimage-pacing/02-cachedimage.yaml new file mode 100644 index 0000000..b16b975 --- /dev/null +++ b/test/e2e/cachedimage-pacing/02-cachedimage.yaml @@ -0,0 +1,9 @@ +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + name: test-paced +spec: + image: docker.io/library/busybox + tag: "latest" + policyRef: + name: test-conservative diff --git a/test/e2e/cachedimage-pacing/chainsaw-test.yaml b/test/e2e/cachedimage-pacing/chainsaw-test.yaml new file mode 100644 index 0000000..b5f8796 --- /dev/null +++ b/test/e2e/cachedimage-pacing/chainsaw-test.yaml @@ -0,0 +1,41 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: cachedimage-pacing +spec: + description: | + Verify that 
PullPolicy pacing is respected: with maxConcurrentNodes=1, + only one drop Pod should exist at any time. + steps: + - name: Create PullPolicy + try: + - apply: + file: 01-pullpolicy.yaml + - name: Create CachedImage referencing policy + try: + - apply: + file: 02-cachedimage.yaml + - name: Verify at most one active Pod at a time + try: + - script: + timeout: 30s + content: | + count=$(kubectl get pods -n drop-system -l app.kubernetes.io/managed-by=drop,drop.corewire.io/cachedimage=test-paced --no-headers 2>/dev/null | wc -l) + if [ "$count" -gt 1 ]; then + echo "FAIL: expected at most 1 drop pod, got $count" + exit 1 + fi + echo "OK: $count drop pod(s) active" + - name: Cleanup + try: + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: CachedImage + name: test-paced + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: PullPolicy + name: test-conservative diff --git a/test/e2e/cachedimageset-discovery/01-pullpolicy.yaml b/test/e2e/cachedimageset-discovery/01-pullpolicy.yaml new file mode 100644 index 0000000..ae0c58d --- /dev/null +++ b/test/e2e/cachedimageset-discovery/01-pullpolicy.yaml @@ -0,0 +1,10 @@ +apiVersion: drop.corewire.io/v1alpha1 +kind: PullPolicy +metadata: + name: test-set-policy +spec: + maxConcurrentNodes: 1 + minDelayBetweenPulls: 1s + failureBackoff: + initial: 10s + max: 1m diff --git a/test/e2e/cachedimageset-discovery/02-discoverypolicy.yaml b/test/e2e/cachedimageset-discovery/02-discoverypolicy.yaml new file mode 100644 index 0000000..54da3b4 --- /dev/null +++ b/test/e2e/cachedimageset-discovery/02-discoverypolicy.yaml @@ -0,0 +1,14 @@ +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: test-registry-discovery +spec: + sources: + - type: registry + registry: + url: "http://registry.e2e-infra.svc.cluster.local:5000" + repositories: + - "test/myapp" + topX: 1 + syncInterval: 30s + maxImages: 10 diff --git a/test/e2e/cachedimageset-discovery/03-assert-discovery-ready.yaml 
b/test/e2e/cachedimageset-discovery/03-assert-discovery-ready.yaml new file mode 100644 index 0000000..cb90fcd --- /dev/null +++ b/test/e2e/cachedimageset-discovery/03-assert-discovery-ready.yaml @@ -0,0 +1,9 @@ +# Assert DiscoveryPolicy is synced and has discovered images +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: test-registry-discovery +status: + (conditions[?type == 'Ready']): + - status: "True" + reason: Synced diff --git a/test/e2e/cachedimageset-discovery/04-cachedimageset.yaml b/test/e2e/cachedimageset-discovery/04-cachedimageset.yaml new file mode 100644 index 0000000..761cb4c --- /dev/null +++ b/test/e2e/cachedimageset-discovery/04-cachedimageset.yaml @@ -0,0 +1,9 @@ +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImageSet +metadata: + name: test-discovered-set +spec: + policyRef: + name: test-set-policy + discoveryPolicyRef: + name: test-registry-discovery diff --git a/test/e2e/cachedimageset-discovery/05-assert-children.yaml b/test/e2e/cachedimageset-discovery/05-assert-children.yaml new file mode 100644 index 0000000..bb88061 --- /dev/null +++ b/test/e2e/cachedimageset-discovery/05-assert-children.yaml @@ -0,0 +1,13 @@ +# Assert child CachedImages are created with proper labels and ownerRef +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + labels: + drop.corewire.io/imageset: test-discovered-set + ownerReferences: + - apiVersion: drop.corewire.io/v1alpha1 + kind: CachedImageSet + name: test-discovered-set +spec: + policyRef: + name: test-set-policy diff --git a/test/e2e/cachedimageset-discovery/06-assert-set-status.yaml b/test/e2e/cachedimageset-discovery/06-assert-set-status.yaml new file mode 100644 index 0000000..72ae564 --- /dev/null +++ b/test/e2e/cachedimageset-discovery/06-assert-set-status.yaml @@ -0,0 +1,8 @@ +# Assert CachedImageSet shows healthy status +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImageSet +metadata: + name: test-discovered-set +status: + (conditions[?type 
== 'Ready']): + - status: "True" diff --git a/test/e2e/cachedimageset-discovery/chainsaw-test.yaml b/test/e2e/cachedimageset-discovery/chainsaw-test.yaml new file mode 100644 index 0000000..fd43b98 --- /dev/null +++ b/test/e2e/cachedimageset-discovery/chainsaw-test.yaml @@ -0,0 +1,79 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: cachedimageset-discovery +spec: + description: | + Verify that a CachedImageSet with discoveryPolicyRef creates child CachedImages + from a registry-based DiscoveryPolicy, with policyRef propagated to children. + steps: + - name: Create PullPolicy + try: + - apply: + file: 01-pullpolicy.yaml + - name: Create Registry DiscoveryPolicy + try: + - apply: + file: 02-discoverypolicy.yaml + - name: Wait for discovery to sync + try: + - assert: + timeout: 90s + file: 03-assert-discovery-ready.yaml + - name: Create CachedImageSet with discoveryPolicyRef and policyRef + try: + - apply: + file: 04-cachedimageset.yaml + - name: Verify child CachedImages created with policyRef + try: + - assert: + timeout: 60s + file: 05-assert-children.yaml + - name: Verify CachedImageSet status shows Ready + try: + - script: + timeout: 120s + content: | + deadline=$(( $(date +%s) + 120 )) + while [ "$(date +%s)" -lt "$deadline" ]; do + ready=$(kubectl get cachedimageset test-discovered-set -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true) + images_managed=$(kubectl get cachedimageset test-discovered-set -o jsonpath='{.status.imagesManaged}' 2>/dev/null || true) + images_ready=$(kubectl get cachedimageset test-discovered-set -o jsonpath='{.status.imagesReady}' 2>/dev/null || true) + + case "$images_managed" in + ''|*[!0-9]*) images_managed=0 ;; + esac + case "$images_ready" in + ''|*[!0-9]*) images_ready=0 ;; + esac + + if [ "$images_managed" -ge 1 ] && [ "$images_ready" = 
"$images_managed" ] && [ "$ready" = "True" ]; then + echo "OK: CachedImageSet is Ready with $images_ready/$images_managed images cached" + exit 0 + fi + + sleep 2 + done + + kubectl get cachedimageset test-discovered-set -o yaml + kubectl get cachedimage -l drop.corewire.io/imageset=test-discovered-set -o yaml + echo "FAIL: CachedImageSet did not become Ready" + exit 1 + - name: Cleanup + try: + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: CachedImageSet + name: test-discovered-set + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: DiscoveryPolicy + name: test-registry-discovery + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: PullPolicy + name: test-set-policy diff --git a/test/e2e/cachedimageset/01-cachedimageset.yaml b/test/e2e/cachedimageset/01-cachedimageset.yaml new file mode 100644 index 0000000..e8555d2 --- /dev/null +++ b/test/e2e/cachedimageset/01-cachedimageset.yaml @@ -0,0 +1,10 @@ +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImageSet +metadata: + name: test-set +spec: + images: + - image: docker.io/library/alpine + tag: "3.19" + - image: docker.io/library/busybox + tag: "1.36" diff --git a/test/e2e/cachedimageset/02-assert-children.yaml b/test/e2e/cachedimageset/02-assert-children.yaml new file mode 100644 index 0000000..617a4c7 --- /dev/null +++ b/test/e2e/cachedimageset/02-assert-children.yaml @@ -0,0 +1,9 @@ +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + labels: + drop.corewire.io/imageset: test-set + ownerReferences: + - apiVersion: drop.corewire.io/v1alpha1 + kind: CachedImageSet + name: test-set diff --git a/test/e2e/cachedimageset/03-assert-deleted.yaml b/test/e2e/cachedimageset/03-assert-deleted.yaml new file mode 100644 index 0000000..cf45443 --- /dev/null +++ b/test/e2e/cachedimageset/03-assert-deleted.yaml @@ -0,0 +1,6 @@ +# This asserts that child CachedImages no longer exist after parent deletion (GC) +apiVersion: drop.corewire.io/v1alpha1 +kind: 
CachedImage +metadata: + labels: + drop.corewire.io/imageset: test-set diff --git a/test/e2e/cachedimageset/chainsaw-test.yaml b/test/e2e/cachedimageset/chainsaw-test.yaml new file mode 100644 index 0000000..49a1cfb --- /dev/null +++ b/test/e2e/cachedimageset/chainsaw-test.yaml @@ -0,0 +1,29 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: cachedimageset +spec: + description: | + Verify that a CachedImageSet creates child CachedImage resources + and manages their lifecycle via ownerReferences. + steps: + - name: Create CachedImageSet + try: + - apply: + file: 01-cachedimageset.yaml + - name: Verify child CachedImages created + try: + - assert: + timeout: 30s + file: 02-assert-children.yaml + - name: Delete CachedImageSet and verify GC + try: + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: CachedImageSet + name: test-set + - error: + timeout: 30s + file: 03-assert-deleted.yaml diff --git a/test/e2e/discovery-failure/01-broken-prometheus.yaml b/test/e2e/discovery-failure/01-broken-prometheus.yaml new file mode 100644 index 0000000..a44f533 --- /dev/null +++ b/test/e2e/discovery-failure/01-broken-prometheus.yaml @@ -0,0 +1,12 @@ +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: test-broken-prom +spec: + sources: + - type: prometheus + prometheus: + endpoint: "http://nonexistent-prometheus:9090" + query: "up{}" + syncInterval: 30m + maxImages: 10 diff --git a/test/e2e/discovery-failure/02-broken-registry.yaml b/test/e2e/discovery-failure/02-broken-registry.yaml new file mode 100644 index 0000000..2a97e3f --- /dev/null +++ b/test/e2e/discovery-failure/02-broken-registry.yaml @@ -0,0 +1,13 @@ +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: test-broken-registry +spec: + sources: + - type: registry + registry: + url: 
"http://nonexistent-registry:5000" + repositories: + - "test/nope" + syncInterval: 30m + maxImages: 10 diff --git a/test/e2e/discovery-failure/03-notfound-registry.yaml b/test/e2e/discovery-failure/03-notfound-registry.yaml new file mode 100644 index 0000000..3bd1f35 --- /dev/null +++ b/test/e2e/discovery-failure/03-notfound-registry.yaml @@ -0,0 +1,13 @@ +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: test-notfound-repo +spec: + sources: + - type: registry + registry: + url: "http://registry.e2e-infra.svc.cluster.local:5000" + repositories: + - "this/does-not-exist" + syncInterval: 30m + maxImages: 10 diff --git a/test/e2e/discovery-failure/04-assert-dns-prometheus.yaml b/test/e2e/discovery-failure/04-assert-dns-prometheus.yaml new file mode 100644 index 0000000..09bd371 --- /dev/null +++ b/test/e2e/discovery-failure/04-assert-dns-prometheus.yaml @@ -0,0 +1,9 @@ +# Assert broken prometheus shows DNSError reason +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: test-broken-prom +status: + (conditions[?type == 'Ready']): + - status: "False" + reason: DNSError diff --git a/test/e2e/discovery-failure/05-assert-dns-registry.yaml b/test/e2e/discovery-failure/05-assert-dns-registry.yaml new file mode 100644 index 0000000..893a3e5 --- /dev/null +++ b/test/e2e/discovery-failure/05-assert-dns-registry.yaml @@ -0,0 +1,9 @@ +# Assert broken registry shows DNSError reason +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: test-broken-registry +status: + (conditions[?type == 'Ready']): + - status: "False" + reason: DNSError diff --git a/test/e2e/discovery-failure/06-assert-notfound.yaml b/test/e2e/discovery-failure/06-assert-notfound.yaml new file mode 100644 index 0000000..0d8ee0a --- /dev/null +++ b/test/e2e/discovery-failure/06-assert-notfound.yaml @@ -0,0 +1,8 @@ +# Assert notfound repo shows error (Ready=False with a reason) +apiVersion: drop.corewire.io/v1alpha1 +kind: 
DiscoveryPolicy +metadata: + name: test-notfound-repo +status: + (conditions[?type == 'Ready']): + - status: "False" diff --git a/test/e2e/discovery-failure/chainsaw-test.yaml b/test/e2e/discovery-failure/chainsaw-test.yaml new file mode 100644 index 0000000..5afe93c --- /dev/null +++ b/test/e2e/discovery-failure/chainsaw-test.yaml @@ -0,0 +1,54 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: discovery-failure +spec: + description: | + Verify that DiscoveryPolicy with broken sources reports appropriate error + reasons: DNSError for unreachable endpoints, NotFound for missing repos. + steps: + - name: Create broken Prometheus DiscoveryPolicy (DNS failure) + try: + - apply: + file: 01-broken-prometheus.yaml + - name: Create broken Registry DiscoveryPolicy (DNS failure) + try: + - apply: + file: 02-broken-registry.yaml + - name: Create DiscoveryPolicy with nonexistent repo (NotFound) + try: + - apply: + file: 03-notfound-registry.yaml + - name: Assert broken Prometheus shows DNSError + try: + - assert: + timeout: 90s + file: 04-assert-dns-prometheus.yaml + - name: Assert broken registry shows DNSError + try: + - assert: + timeout: 90s + file: 05-assert-dns-registry.yaml + - name: Assert notfound repo shows error + try: + - assert: + timeout: 90s + file: 06-assert-notfound.yaml + - name: Cleanup + try: + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: DiscoveryPolicy + name: test-broken-prom + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: DiscoveryPolicy + name: test-broken-registry + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: DiscoveryPolicy + name: test-notfound-repo diff --git a/test/e2e/discovery-registry/01-discoverypolicy.yaml b/test/e2e/discovery-registry/01-discoverypolicy.yaml new file mode 100644 index 0000000..bedc5a6 --- /dev/null +++ 
b/test/e2e/discovery-registry/01-discoverypolicy.yaml @@ -0,0 +1,14 @@ +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-registry +spec: + sources: + - type: registry + registry: + url: "http://registry.e2e-infra.svc.cluster.local:5000" + repositories: + - "test/myapp" + topX: 3 + syncInterval: 30s + maxImages: 10 diff --git a/test/e2e/discovery-registry/02-assert-discovery-status.yaml b/test/e2e/discovery-registry/02-assert-discovery-status.yaml new file mode 100644 index 0000000..a387594 --- /dev/null +++ b/test/e2e/discovery-registry/02-assert-discovery-status.yaml @@ -0,0 +1,11 @@ +# Assert that DiscoveryPolicy status contains images from registry and Ready condition. +# The registry source lists tags for test/myapp and builds refs as host/repo:tag. +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-registry +status: + (conditions[?type == 'Ready']): + - status: "True" + reason: Synced + imageCount: 3 diff --git a/test/e2e/discovery-registry/chainsaw-test.yaml b/test/e2e/discovery-registry/chainsaw-test.yaml new file mode 100644 index 0000000..32f165a --- /dev/null +++ b/test/e2e/discovery-registry/chainsaw-test.yaml @@ -0,0 +1,26 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: discovery-registry +spec: + description: | + Verify that a DiscoveryPolicy with a registry source discovers tags + from the in-cluster registry seeded with test images. 
+ steps: + - name: Create DiscoveryPolicy with registry source + try: + - apply: + file: 01-discoverypolicy.yaml + - name: Wait for discovered images in status + try: + - assert: + timeout: 90s + file: 02-assert-discovery-status.yaml + - name: Cleanup + try: + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: DiscoveryPolicy + name: e2e-registry diff --git a/test/e2e/discovery/01-discoverypolicy.yaml b/test/e2e/discovery/01-discoverypolicy.yaml new file mode 100644 index 0000000..f01591c --- /dev/null +++ b/test/e2e/discovery/01-discoverypolicy.yaml @@ -0,0 +1,14 @@ +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-prometheus +spec: + sources: + - type: prometheus + prometheus: + endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090" + query: 'count(container_memory_working_set_bytes{container!="", namespace="build-stuff"}) by (image)' + lookback: 24h + step: 5m + syncInterval: 30s + maxImages: 10 diff --git a/test/e2e/discovery/02-assert-discovery-status.yaml b/test/e2e/discovery/02-assert-discovery-status.yaml new file mode 100644 index 0000000..1cb8f4d --- /dev/null +++ b/test/e2e/discovery/02-assert-discovery-status.yaml @@ -0,0 +1,11 @@ +# Assert that DiscoveryPolicy status contains discovered images and Ready condition. +# The query 'count(...{namespace="build-stuff"}) by (image)' returns alpine + busybox. 
+apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-prometheus +status: + (conditions[?type == 'Ready']): + - status: "True" + reason: Synced + imageCount: 2 diff --git a/test/e2e/discovery/03-cachedimageset-discovery.yaml b/test/e2e/discovery/03-cachedimageset-discovery.yaml new file mode 100644 index 0000000..f0b81aa --- /dev/null +++ b/test/e2e/discovery/03-cachedimageset-discovery.yaml @@ -0,0 +1,7 @@ +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImageSet +metadata: + name: discovered-set +spec: + discoveryPolicyRef: + name: e2e-prometheus diff --git a/test/e2e/discovery/04-assert-children.yaml b/test/e2e/discovery/04-assert-children.yaml new file mode 100644 index 0000000..ccc972a --- /dev/null +++ b/test/e2e/discovery/04-assert-children.yaml @@ -0,0 +1,6 @@ +# Assert that at least one child CachedImage was created from discovery +apiVersion: drop.corewire.io/v1alpha1 +kind: CachedImage +metadata: + labels: + drop.corewire.io/imageset: discovered-set diff --git a/test/e2e/discovery/chainsaw-test.yaml b/test/e2e/discovery/chainsaw-test.yaml new file mode 100644 index 0000000..fa8e168 --- /dev/null +++ b/test/e2e/discovery/chainsaw-test.yaml @@ -0,0 +1,40 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: discovery-prometheus +spec: + description: | + Verify that a DiscoveryPolicy with a Prometheus source discovers images + from seeded metrics, and a CachedImageSet referencing it creates child CachedImages. 
+ steps: + - name: Create DiscoveryPolicy with Prometheus source + try: + - apply: + file: 01-discoverypolicy.yaml + - name: Wait for discovered images in status + try: + - assert: + timeout: 90s + file: 02-assert-discovery-status.yaml + - name: Create CachedImageSet referencing the DiscoveryPolicy + try: + - apply: + file: 03-cachedimageset-discovery.yaml + - name: Verify child CachedImages are created from discovered images + try: + - assert: + timeout: 60s + file: 04-assert-children.yaml + - name: Cleanup + try: + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: CachedImageSet + name: discovered-set + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: DiscoveryPolicy + name: e2e-prometheus diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go new file mode 100644 index 0000000..fd12bef --- /dev/null +++ b/test/e2e/e2e_suite_test.go @@ -0,0 +1,89 @@ +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "fmt" + "os" + "os/exec" + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/Breee/drop/test/utils" +) + +var ( + // Optional Environment Variables: + // - CERT_MANAGER_INSTALL_SKIP=true: Skips CertManager installation during test setup. + // These variables are useful if CertManager is already installed, avoiding + // re-installation and conflicts. 
+ skipCertManagerInstall = os.Getenv("CERT_MANAGER_INSTALL_SKIP") == "true" + // isCertManagerAlreadyInstalled will be set true when CertManager CRDs be found on the cluster + isCertManagerAlreadyInstalled = false + + // projectImage is the name of the image which will be build and loaded + // with the code source changes to be tested. + projectImage = "example.com/drop:v0.0.1" +) + +// TestE2E runs the end-to-end (e2e) test suite for the project. These tests execute in an isolated, +// temporary environment to validate project changes with the purposed to be used in CI jobs. +// The default setup requires Kind, builds/loads the Manager Docker image locally, and installs +// CertManager. +func TestE2E(t *testing.T) { + RegisterFailHandler(Fail) + _, _ = fmt.Fprintf(GinkgoWriter, "Starting drop integration test suite\n") + RunSpecs(t, "e2e suite") +} + +var _ = BeforeSuite(func() { + By("building the manager(Operator) image") + cmd := exec.Command("make", "docker-build", fmt.Sprintf("IMG=%s", projectImage)) + _, err := utils.Run(cmd) + ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to build the manager(Operator) image") + + // TODO(user): If you want to change the e2e test vendor from Kind, ensure the image is + // built and available before running the tests. Also, remove the following block. + By("loading the manager(Operator) image on Kind") + err = utils.LoadImageToKindClusterWithName(projectImage) + ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to load the manager(Operator) image into Kind") + + // The tests-e2e are intended to run on a temporary cluster that is created and destroyed for testing. + // To prevent errors when tests run in environments with CertManager already installed, + // we check for its presence before execution. 
+ // Setup CertManager before the suite if not skipped and if not already installed + if !skipCertManagerInstall { + By("checking if cert manager is installed already") + isCertManagerAlreadyInstalled = utils.IsCertManagerCRDsInstalled() + if !isCertManagerAlreadyInstalled { + _, _ = fmt.Fprintf(GinkgoWriter, "Installing CertManager...\n") + Expect(utils.InstallCertManager()).To(Succeed(), "Failed to install CertManager") + } else { + _, _ = fmt.Fprintf(GinkgoWriter, "WARNING: CertManager is already installed. Skipping installation...\n") + } + } +}) + +var _ = AfterSuite(func() { + // Teardown CertManager after the suite if not skipped and if it was not already installed + if !skipCertManagerInstall && !isCertManagerAlreadyInstalled { + _, _ = fmt.Fprintf(GinkgoWriter, "Uninstalling CertManager...\n") + utils.UninstallCertManager() + } +}) diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go new file mode 100644 index 0000000..d39597c --- /dev/null +++ b/test/e2e/e2e_test.go @@ -0,0 +1,329 @@ +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "encoding/json" + "fmt" + "os" + "os/exec" + "path/filepath" + "time" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + "github.com/Breee/drop/test/utils" +) + +// namespace where the project is deployed in +const namespace = "drop-system" + +// serviceAccountName created for the project +const serviceAccountName = "drop-controller-manager" + +// metricsServiceName is the name of the metrics service of the project +const metricsServiceName = "drop-controller-manager-metrics-service" + +// metricsRoleBindingName is the name of the RBAC that will be created to allow get the metrics data +const metricsRoleBindingName = "drop-metrics-binding" + +var _ = Describe("Manager", Ordered, func() { + var controllerPodName string + + // Before running the tests, set up the environment by creating the namespace, + // enforce the restricted security policy to the namespace, installing CRDs, + // and deploying the controller. + BeforeAll(func() { + By("creating manager namespace") + cmd := exec.Command("kubectl", "create", "ns", namespace) + _, err := utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to create namespace") + + By("labeling the namespace to enforce the restricted security policy") + cmd = exec.Command("kubectl", "label", "--overwrite", "ns", namespace, + "pod-security.kubernetes.io/enforce=restricted") + _, err = utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to label namespace with restricted policy") + + By("installing CRDs") + cmd = exec.Command("make", "install") + _, err = utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to install CRDs") + + By("deploying the controller-manager") + cmd = exec.Command("make", "deploy", fmt.Sprintf("IMG=%s", projectImage)) + _, err = utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to deploy the controller-manager") + }) + + // After all tests have been executed, clean up by undeploying the controller, uninstalling CRDs, + // and deleting the namespace. 
+ AfterAll(func() { + By("cleaning up the curl pod for metrics") + cmd := exec.Command("kubectl", "delete", "pod", "curl-metrics", "-n", namespace) + _, _ = utils.Run(cmd) + + By("undeploying the controller-manager") + cmd = exec.Command("make", "undeploy") + _, _ = utils.Run(cmd) + + By("uninstalling CRDs") + cmd = exec.Command("make", "uninstall") + _, _ = utils.Run(cmd) + + By("removing manager namespace") + cmd = exec.Command("kubectl", "delete", "ns", namespace) + _, _ = utils.Run(cmd) + }) + + // After each test, check for failures and collect logs, events, + // and pod descriptions for debugging. + AfterEach(func() { + specReport := CurrentSpecReport() + if specReport.Failed() { + By("Fetching controller manager pod logs") + cmd := exec.Command("kubectl", "logs", controllerPodName, "-n", namespace) + controllerLogs, err := utils.Run(cmd) + if err == nil { + _, _ = fmt.Fprintf(GinkgoWriter, "Controller logs:\n %s", controllerLogs) + } else { + _, _ = fmt.Fprintf(GinkgoWriter, "Failed to get Controller logs: %s", err) + } + + By("Fetching Kubernetes events") + cmd = exec.Command("kubectl", "get", "events", "-n", namespace, "--sort-by=.lastTimestamp") + eventsOutput, err := utils.Run(cmd) + if err == nil { + _, _ = fmt.Fprintf(GinkgoWriter, "Kubernetes events:\n%s", eventsOutput) + } else { + _, _ = fmt.Fprintf(GinkgoWriter, "Failed to get Kubernetes events: %s", err) + } + + By("Fetching curl-metrics logs") + cmd = exec.Command("kubectl", "logs", "curl-metrics", "-n", namespace) + metricsOutput, err := utils.Run(cmd) + if err == nil { + _, _ = fmt.Fprintf(GinkgoWriter, "Metrics logs:\n %s", metricsOutput) + } else { + _, _ = fmt.Fprintf(GinkgoWriter, "Failed to get curl-metrics logs: %s", err) + } + + By("Fetching controller manager pod description") + cmd = exec.Command("kubectl", "describe", "pod", controllerPodName, "-n", namespace) + podDescription, err := utils.Run(cmd) + if err == nil { + fmt.Println("Pod description:\n", podDescription) + } else { + 
fmt.Println("Failed to describe controller pod") + } + } + }) + + SetDefaultEventuallyTimeout(2 * time.Minute) + SetDefaultEventuallyPollingInterval(time.Second) + + Context("Manager", func() { + It("should run successfully", func() { + By("validating that the controller-manager pod is running as expected") + verifyControllerUp := func(g Gomega) { + // Get the name of the controller-manager pod + cmd := exec.Command("kubectl", "get", + "pods", "-l", "control-plane=controller-manager", + "-o", "go-template={{ range .items }}"+ + "{{ if not .metadata.deletionTimestamp }}"+ + "{{ .metadata.name }}"+ + "{{ \"\\n\" }}{{ end }}{{ end }}", + "-n", namespace, + ) + + podOutput, err := utils.Run(cmd) + g.Expect(err).NotTo(HaveOccurred(), "Failed to retrieve controller-manager pod information") + podNames := utils.GetNonEmptyLines(podOutput) + g.Expect(podNames).To(HaveLen(1), "expected 1 controller pod running") + controllerPodName = podNames[0] + g.Expect(controllerPodName).To(ContainSubstring("controller-manager")) + + // Validate the pod's status + cmd = exec.Command("kubectl", "get", + "pods", controllerPodName, "-o", "jsonpath={.status.phase}", + "-n", namespace, + ) + output, err := utils.Run(cmd) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(output).To(Equal("Running"), "Incorrect controller-manager pod status") + } + Eventually(verifyControllerUp).Should(Succeed()) + }) + + It("should ensure the metrics endpoint is serving metrics", func() { + By("creating a ClusterRoleBinding for the service account to allow access to metrics") + cmd := exec.Command("kubectl", "create", "clusterrolebinding", metricsRoleBindingName, + "--clusterrole=drop-metrics-reader", + fmt.Sprintf("--serviceaccount=%s:%s", namespace, serviceAccountName), + ) + _, err := utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to create ClusterRoleBinding") + + By("validating that the metrics service is available") + cmd = exec.Command("kubectl", "get", "service", metricsServiceName, "-n", 
namespace) + _, err = utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Metrics service should exist") + + By("getting the service account token") + token, err := serviceAccountToken() + Expect(err).NotTo(HaveOccurred()) + Expect(token).NotTo(BeEmpty()) + + By("waiting for the metrics endpoint to be ready") + verifyMetricsEndpointReady := func(g Gomega) { + cmd := exec.Command("kubectl", "get", "endpoints", metricsServiceName, "-n", namespace) + output, err := utils.Run(cmd) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(output).To(ContainSubstring("8443"), "Metrics endpoint is not ready") + } + Eventually(verifyMetricsEndpointReady).Should(Succeed()) + + By("verifying that the controller manager is serving the metrics server") + verifyMetricsServerStarted := func(g Gomega) { + cmd := exec.Command("kubectl", "logs", controllerPodName, "-n", namespace) + output, err := utils.Run(cmd) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(output).To(ContainSubstring("controller-runtime.metrics\tServing metrics server"), + "Metrics server not yet started") + } + Eventually(verifyMetricsServerStarted).Should(Succeed()) + + By("creating the curl-metrics pod to access the metrics endpoint") + cmd = exec.Command("kubectl", "run", "curl-metrics", "--restart=Never", + "--namespace", namespace, + "--image=curlimages/curl:latest", + "--overrides", + fmt.Sprintf(`{ + "spec": { + "containers": [{ + "name": "curl", + "image": "curlimages/curl:latest", + "command": ["/bin/sh", "-c"], + "args": ["curl -v -k -H 'Authorization: Bearer %s' https://%s.%s.svc.cluster.local:8443/metrics"], + "securityContext": { + "allowPrivilegeEscalation": false, + "capabilities": { + "drop": ["ALL"] + }, + "runAsNonRoot": true, + "runAsUser": 1000, + "seccompProfile": { + "type": "RuntimeDefault" + } + } + }], + "serviceAccount": "%s" + } + }`, token, metricsServiceName, namespace, serviceAccountName)) + _, err = utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to create curl-metrics pod") + 
+ By("waiting for the curl-metrics pod to complete.") + verifyCurlUp := func(g Gomega) { + cmd := exec.Command("kubectl", "get", "pods", "curl-metrics", + "-o", "jsonpath={.status.phase}", + "-n", namespace) + output, err := utils.Run(cmd) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(output).To(Equal("Succeeded"), "curl pod in wrong status") + } + Eventually(verifyCurlUp, 5*time.Minute).Should(Succeed()) + + By("getting the metrics by checking curl-metrics logs") + metricsOutput := getMetricsOutput() + Expect(metricsOutput).To(ContainSubstring( + "controller_runtime_reconcile_total", + )) + }) + + // +kubebuilder:scaffold:e2e-webhooks-checks + + // TODO: Customize the e2e test suite with scenarios specific to your project. + // Consider applying sample/CR(s) and check their status and/or verifying + // the reconciliation by using the metrics, i.e.: + // metricsOutput := getMetricsOutput() + // Expect(metricsOutput).To(ContainSubstring( + // fmt.Sprintf(`controller_runtime_reconcile_total{controller="%s",result="success"} 1`, + // strings.ToLower(), + // )) + }) +}) + +// serviceAccountToken returns a token for the specified service account in the given namespace. +// It uses the Kubernetes TokenRequest API to generate a token by directly sending a request +// and parsing the resulting token from the API response. 
+func serviceAccountToken() (string, error) { + const tokenRequestRawString = `{ + "apiVersion": "authentication.k8s.io/v1", + "kind": "TokenRequest" + }` + + // Temporary file to store the token request + secretName := fmt.Sprintf("%s-token-request", serviceAccountName) + tokenRequestFile := filepath.Join("/tmp", secretName) + err := os.WriteFile(tokenRequestFile, []byte(tokenRequestRawString), os.FileMode(0o644)) + if err != nil { + return "", err + } + + var out string + verifyTokenCreation := func(g Gomega) { + // Execute kubectl command to create the token + cmd := exec.Command("kubectl", "create", "--raw", fmt.Sprintf( + "/api/v1/namespaces/%s/serviceaccounts/%s/token", + namespace, + serviceAccountName, + ), "-f", tokenRequestFile) + + output, err := cmd.CombinedOutput() + g.Expect(err).NotTo(HaveOccurred()) + + // Parse the JSON output to extract the token + var token tokenRequest + err = json.Unmarshal(output, &token) + g.Expect(err).NotTo(HaveOccurred()) + + out = token.Status.Token + } + Eventually(verifyTokenCreation).Should(Succeed()) + + return out, err +} + +// getMetricsOutput retrieves and returns the logs from the curl pod used to access the metrics endpoint. +func getMetricsOutput() string { + By("getting the curl-metrics logs") + cmd := exec.Command("kubectl", "logs", "curl-metrics", "-n", namespace) + metricsOutput, err := utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to retrieve logs from curl pod") + Expect(metricsOutput).To(ContainSubstring("< HTTP/1.1 200 OK")) + return metricsOutput +} + +// tokenRequest is a simplified representation of the Kubernetes TokenRequest API response, +// containing only the token field that we need to extract. +type tokenRequest struct { + Status struct { + Token string `json:"token"` + } `json:"status"` +} diff --git a/test/utils/utils.go b/test/utils/utils.go new file mode 100644 index 0000000..0488aa7 --- /dev/null +++ b/test/utils/utils.go @@ -0,0 +1,251 @@ +/* +Copyright 2026. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package utils + +import ( + "bufio" + "bytes" + "fmt" + "os" + "os/exec" + "strings" + + . "github.com/onsi/ginkgo/v2" //nolint:golint,revive +) + +const ( + prometheusOperatorVersion = "v0.77.1" + prometheusOperatorURL = "https://github.com/prometheus-operator/prometheus-operator/" + + "releases/download/%s/bundle.yaml" + + certmanagerVersion = "v1.16.3" + certmanagerURLTmpl = "https://github.com/cert-manager/cert-manager/releases/download/%s/cert-manager.yaml" +) + +func warnError(err error) { + _, _ = fmt.Fprintf(GinkgoWriter, "warning: %v\n", err) +} + +// Run executes the provided command within this context +func Run(cmd *exec.Cmd) (string, error) { + dir, _ := GetProjectDir() + cmd.Dir = dir + + if err := os.Chdir(cmd.Dir); err != nil { + _, _ = fmt.Fprintf(GinkgoWriter, "chdir dir: %s\n", err) + } + + cmd.Env = append(os.Environ(), "GO111MODULE=on") + command := strings.Join(cmd.Args, " ") + _, _ = fmt.Fprintf(GinkgoWriter, "running: %s\n", command) + output, err := cmd.CombinedOutput() + if err != nil { + return string(output), fmt.Errorf("%s failed with error: (%v) %s", command, err, string(output)) + } + + return string(output), nil +} + +// InstallPrometheusOperator installs the prometheus Operator to be used to export the enabled metrics. 
+func InstallPrometheusOperator() error { + url := fmt.Sprintf(prometheusOperatorURL, prometheusOperatorVersion) + cmd := exec.Command("kubectl", "create", "-f", url) + _, err := Run(cmd) + return err +} + +// UninstallPrometheusOperator uninstalls the prometheus +func UninstallPrometheusOperator() { + url := fmt.Sprintf(prometheusOperatorURL, prometheusOperatorVersion) + cmd := exec.Command("kubectl", "delete", "-f", url) + if _, err := Run(cmd); err != nil { + warnError(err) + } +} + +// IsPrometheusCRDsInstalled checks if any Prometheus CRDs are installed +// by verifying the existence of key CRDs related to Prometheus. +func IsPrometheusCRDsInstalled() bool { + // List of common Prometheus CRDs + prometheusCRDs := []string{ + "prometheuses.monitoring.coreos.com", + "prometheusrules.monitoring.coreos.com", + "prometheusagents.monitoring.coreos.com", + } + + cmd := exec.Command("kubectl", "get", "crds", "-o", "custom-columns=NAME:.metadata.name") + output, err := Run(cmd) + if err != nil { + return false + } + crdList := GetNonEmptyLines(output) + for _, crd := range prometheusCRDs { + for _, line := range crdList { + if strings.Contains(line, crd) { + return true + } + } + } + + return false +} + +// UninstallCertManager uninstalls the cert manager +func UninstallCertManager() { + url := fmt.Sprintf(certmanagerURLTmpl, certmanagerVersion) + cmd := exec.Command("kubectl", "delete", "-f", url) + if _, err := Run(cmd); err != nil { + warnError(err) + } +} + +// InstallCertManager installs the cert manager bundle. +func InstallCertManager() error { + url := fmt.Sprintf(certmanagerURLTmpl, certmanagerVersion) + cmd := exec.Command("kubectl", "apply", "-f", url) + if _, err := Run(cmd); err != nil { + return err + } + // Wait for cert-manager-webhook to be ready, which can take time if cert-manager + // was re-installed after uninstalling on a cluster. 
+ cmd = exec.Command("kubectl", "wait", "deployment.apps/cert-manager-webhook", + "--for", "condition=Available", + "--namespace", "cert-manager", + "--timeout", "5m", + ) + + _, err := Run(cmd) + return err +} + +// IsCertManagerCRDsInstalled checks if any Cert Manager CRDs are installed +// by verifying the existence of key CRDs related to Cert Manager. +func IsCertManagerCRDsInstalled() bool { + // List of common Cert Manager CRDs + certManagerCRDs := []string{ + "certificates.cert-manager.io", + "issuers.cert-manager.io", + "clusterissuers.cert-manager.io", + "certificaterequests.cert-manager.io", + "orders.acme.cert-manager.io", + "challenges.acme.cert-manager.io", + } + + // Execute the kubectl command to get all CRDs + cmd := exec.Command("kubectl", "get", "crds") + output, err := Run(cmd) + if err != nil { + return false + } + + // Check if any of the Cert Manager CRDs are present + crdList := GetNonEmptyLines(output) + for _, crd := range certManagerCRDs { + for _, line := range crdList { + if strings.Contains(line, crd) { + return true + } + } + } + + return false +} + +// LoadImageToKindClusterWithName loads a local docker image to the kind cluster +func LoadImageToKindClusterWithName(name string) error { + cluster := "kind" + if v, ok := os.LookupEnv("KIND_CLUSTER"); ok { + cluster = v + } + kindOptions := []string{"load", "docker-image", name, "--name", cluster} + cmd := exec.Command("kind", kindOptions...) + _, err := Run(cmd) + return err +} + +// GetNonEmptyLines converts given command output string into individual objects +// according to line breakers, and ignores the empty elements in it. 
func GetNonEmptyLines(output string) []string {
	// Split on "\n" only and keep every element that is not the empty string.
	// The result stays nil when nothing qualifies.
	var res []string
	for _, element := range strings.Split(output, "\n") {
		if element != "" {
			res = append(res, element)
		}
	}

	return res
}

// GetProjectDir returns the project directory, derived from the current
// working directory by removing every "/test/e2e" occurrence from it. When the
// working directory cannot be determined, the value and error reported by
// os.Getwd are returned unchanged.
func GetProjectDir() (string, error) {
	wd, err := os.Getwd()
	if err != nil {
		return wd, err
	}
	return strings.ReplaceAll(wd, "/test/e2e", ""), nil
}

// UncommentCode searches for target in the file and removes the comment prefix
// from each line of the target content. The target content may span multiple
// lines.
//
// Only the first occurrence of target is rewritten. Lines of target that do not
// start with prefix are written back unchanged, and a trailing newline at the
// end of target is not reproduced. An empty target leaves the file untouched.
// An error is returned when the file cannot be read or written, when target
// does not occur in it, or when target cannot be scanned.
func UncommentCode(filename, target, prefix string) error {
	// false positive
	// nolint:gosec
	content, err := os.ReadFile(filename)
	if err != nil {
		return err
	}

	idx := strings.Index(string(content), target)
	if idx < 0 {
		return fmt.Errorf("unable to find the code %s to be uncomment", target)
	}

	// Writes to a bytes.Buffer never return an error, so results are not checked.
	out := new(bytes.Buffer)
	out.Write(content[:idx])

	scanner := bufio.NewScanner(strings.NewReader(target))
	// The default 64 KiB token limit would make Scan stop early on a very long
	// line, and the remainder of target would then be silently dropped from
	// the rewritten file. Allow a single line to be as long as target itself.
	scanner.Buffer(nil, len(target)+1)

	wroteLine := false
	for scanner.Scan() {
		// Separate lines with "\n", but do not emit one after the last line.
		if wroteLine {
			out.WriteByte('\n')
		}
		out.WriteString(strings.TrimPrefix(scanner.Text(), prefix))
		wroteLine = true
	}
	if err := scanner.Err(); err != nil {
		return fmt.Errorf("scanning target code: %w", err)
	}
	if !wroteLine {
		// Nothing to uncomment (empty target): keep the file as it is.
		return nil
	}

	out.Write(content[idx+len(target):])

	// false positive
	// nolint:gosec
	return os.WriteFile(filename, out.Bytes(), 0644)
}